loubnabnl (HF staff) committed
Commit ad768f3 · verified · 1 parent: 3dd3be0

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: HuggingFaceTB/SmolLM2-1.7B-8k
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ datasets:
+ - generator
+ model-index:
+ - name: smollm2-1.7B-8k-mix7-ep2-v2
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/loubnabnl/huggingface/runs/6rp7tpcv)
+ # smollm2-1.7B-8k-mix7-ep2-v2
+
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM2-1.7B-8k](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-8k) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.0630
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0003
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 128
+ - total_eval_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 2
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 0.7442 | 0.9992 | 893 | 1.0789 |
+ | 0.6705 | 1.9983 | 1786 | 1.0630 |
+
+
+ ### Framework versions
+
+ - Transformers 4.42.3
+ - Pytorch 2.1.2
+ - Datasets 2.20.0
+ - Tokenizers 0.19.1
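
Note on the hyperparameters above: total_train_batch_size is just the per-device batch size times the number of GPUs times the gradient-accumulation steps. A minimal sketch of that arithmetic, plus loading the resulting checkpoint with transformers — the repo id here is an assumption based on the model name in the card:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Effective batch: 4 (per-device) * 8 (GPUs) * 4 (grad accumulation) = 128,
# matching total_train_batch_size in the card.
assert 4 * 8 * 4 == 128

repo_id = "HuggingFaceTB/smollm2-1.7B-8k-mix7-ep2-v2"  # hypothetical repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)
```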
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 1.9983216783216782,
+   "total_flos": 1127451463778304.0,
+   "train_loss": 0.777597175032935,
+   "train_runtime": 16051.5455,
+   "train_samples": 1029132,
+   "train_samples_per_second": 14.253,
+   "train_steps_per_second": 0.111
+ }
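
As a rough sanity check (my arithmetic, not part of the file): the reported throughput is consistent with global steps times the effective batch size over the runtime:

```python
steps, batch, runtime = 1786, 128, 16051.5455
print(steps * batch / runtime)  # ~14.24, close to train_samples_per_second = 14.253
print(steps / runtime)          # ~0.111, matching train_steps_per_second
```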
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "_name_or_path": "HuggingFaceTB/SmolLM2-1.7B-8k",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 8192,
+   "max_position_embeddings": 8192,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 32,
+   "pad_token_id": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 130000,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.42.3",
+   "use_cache": false,
+   "vocab_size": 49152
+ }
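
The config describes a Llama-style decoder (24 layers, hidden size 2048, 32 attention heads with no GQA since num_key_value_heads equals num_attention_heads, tied embeddings) with rope_theta set to 130000 for the 8192-token context. A sketch of inspecting it with transformers; the path is a placeholder:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("path/to/smollm2-1.7B-8k-mix7-ep2-v2")  # hypothetical path
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # llama 24 2048
print(config.max_position_embeddings, config.rope_theta)                # 8192 130000
```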
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "pad_token_id": 2,
+   "transformers_version": "4.42.3"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98360aebc58238dc07b4bcb78b709b7eb24d639e50a662f6bbdb57f610f6779c
+ size 3422777952
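
This is a Git LFS pointer, not the weights themselves: the actual ~3.4 GB safetensors blob is addressed by its sha256. A sketch of verifying a downloaded copy against the pointer (hashing only; the local filename is assumed):

```python
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    """Stream the file so the 3.4 GB blob never sits in memory at once."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

# Should match the oid above if the download is intact.
print(sha256_of("model.safetensors"))
```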
runs/Oct30_23-27-03_ip-26-0-164-18/events.out.tfevents.1730332615.ip-26-0-164-18.40006.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0873dd28e429f1d2206edf9d1303fb97e99dda399fea3d137c0e2cff6cda7703
+ size 81405
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": "<|im_start|>",
+   "eos_token": "<|im_end|>",
+   "pad_token": "<|im_end|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
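
These are the ChatML-style markers, and they line up with config.json above: <|im_start|> is token id 1 (bos) and <|im_end|> is id 2 (eos/pad), as the added_tokens_decoder in the next file shows. A quick check, with the repo path as a placeholder:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/smollm2-1.7B-8k-mix7-ep2-v2")  # hypothetical path
print(tokenizer.convert_tokens_to_ids(["<|endoftext|>", "<|im_start|>", "<|im_end|>"]))
# expected: [0, 1, 2]
```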
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 2048,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
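
The chat_template above is plain ChatML; note that model_max_length is still 2048 here even though config.json allows 8192 positions. A sketch of what apply_chat_template renders with this template (repo path assumed):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/smollm2-1.7B-8k-mix7-ep2-v2")  # hypothetical path
messages = [{"role": "user", "content": "Hello!"}]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```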
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 1.9983216783216782,
+   "total_flos": 1127451463778304.0,
+   "train_loss": 0.777597175032935,
+   "train_runtime": 16051.5455,
+   "train_samples": 1029132,
+   "train_samples_per_second": 14.253,
+   "train_steps_per_second": 0.111
+ }
trainer_state.json ADDED
@@ -0,0 +1,2564 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 1.9983216783216782,
+   "eval_steps": 500,
+   "global_step": 1786,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0011188811188811189,
+       "grad_norm": 153.8507844996399,
+       "learning_rate": 1.6759776536312848e-06,
+       "loss": 4.8778,
+       "step": 1
+     },
+     {
+       "epoch": 0.005594405594405594,
+       "grad_norm": 153.39272858190785,
+       "learning_rate": 8.379888268156423e-06,
+       "loss": 4.7304,
+       "step": 5
+     },
+     {
+       "epoch": 0.011188811188811189,
+       "grad_norm": 96.41612092958107,
+       "learning_rate": 1.6759776536312845e-05,
+       "loss": 4.1396,
+       "step": 10
+     },
+     {
+       "epoch": 0.016783216783216783,
+       "grad_norm": 23.87392825650757,
+       "learning_rate": 2.513966480446927e-05,
+       "loss": 2.5431,
+       "step": 15
+     },
+     {
+       "epoch": 0.022377622377622378,
+       "grad_norm": 6.903016270261399,
+       "learning_rate": 3.351955307262569e-05,
+       "loss": 1.742,
+       "step": 20
+     },
+     {
+       "epoch": 0.027972027972027972,
+       "grad_norm": 2.441067831027445,
+       "learning_rate": 4.189944134078212e-05,
+       "loss": 1.3057,
+       "step": 25
+     },
+     {
+       "epoch": 0.033566433566433566,
+       "grad_norm": 1.0984763176398198,
+       "learning_rate": 5.027932960893854e-05,
+       "loss": 1.1221,
+       "step": 30
+     },
+     {
+       "epoch": 0.039160839160839164,
+       "grad_norm": 0.5734705027472714,
+       "learning_rate": 5.865921787709496e-05,
+       "loss": 1.0604,
+       "step": 35
+     },
+     {
+       "epoch": 0.044755244755244755,
+       "grad_norm": 0.47491071619847164,
+       "learning_rate": 6.703910614525138e-05,
+       "loss": 0.9951,
+       "step": 40
+     },
+     {
+       "epoch": 0.05034965034965035,
+       "grad_norm": 0.48157929747551576,
+       "learning_rate": 7.541899441340782e-05,
+       "loss": 0.9713,
+       "step": 45
+     },
+     {
+       "epoch": 0.055944055944055944,
+       "grad_norm": 0.45861937937875213,
+       "learning_rate": 8.379888268156423e-05,
+       "loss": 0.9551,
+       "step": 50
+     },
+     {
+       "epoch": 0.06153846153846154,
+       "grad_norm": 0.5047783392223855,
+       "learning_rate": 9.217877094972066e-05,
+       "loss": 0.948,
+       "step": 55
+     },
+     {
+       "epoch": 0.06713286713286713,
+       "grad_norm": 0.4461540398939836,
+       "learning_rate": 0.00010055865921787709,
+       "loss": 0.9392,
+       "step": 60
+     },
+     {
+       "epoch": 0.07272727272727272,
+       "grad_norm": 0.3316401246824197,
+       "learning_rate": 0.00010893854748603351,
+       "loss": 0.9146,
+       "step": 65
+     },
+     {
+       "epoch": 0.07832167832167833,
+       "grad_norm": 0.21722066760341413,
+       "learning_rate": 0.00011731843575418992,
+       "loss": 0.9012,
+       "step": 70
+     },
+     {
+       "epoch": 0.08391608391608392,
+       "grad_norm": 0.1499443883049875,
+       "learning_rate": 0.00012569832402234635,
+       "loss": 0.8927,
+       "step": 75
+     },
+     {
+       "epoch": 0.08951048951048951,
+       "grad_norm": 0.22243490348769648,
+       "learning_rate": 0.00013407821229050276,
+       "loss": 0.8937,
+       "step": 80
+     },
+     {
+       "epoch": 0.0951048951048951,
+       "grad_norm": 0.37625345216705214,
+       "learning_rate": 0.0001424581005586592,
+       "loss": 0.8718,
+       "step": 85
+     },
+     {
+       "epoch": 0.1006993006993007,
+       "grad_norm": 0.24963786248117179,
+       "learning_rate": 0.00015083798882681564,
+       "loss": 0.8921,
+       "step": 90
+     },
+     {
+       "epoch": 0.1062937062937063,
+       "grad_norm": 0.26880921530140445,
+       "learning_rate": 0.00015921787709497208,
+       "loss": 0.8799,
+       "step": 95
+     },
+     {
+       "epoch": 0.11188811188811189,
+       "grad_norm": 0.20889087265244094,
+       "learning_rate": 0.00016759776536312847,
+       "loss": 0.8678,
+       "step": 100
+     },
+     {
+       "epoch": 0.11748251748251748,
+       "grad_norm": 0.20865989038699168,
+       "learning_rate": 0.0001759776536312849,
+       "loss": 0.8715,
+       "step": 105
+     },
+     {
+       "epoch": 0.12307692307692308,
+       "grad_norm": 0.17048448558431092,
+       "learning_rate": 0.00018435754189944132,
+       "loss": 0.866,
+       "step": 110
+     },
+     {
+       "epoch": 0.12867132867132866,
+       "grad_norm": 0.2563726823096772,
+       "learning_rate": 0.00019273743016759776,
+       "loss": 0.8786,
+       "step": 115
+     },
+     {
+       "epoch": 0.13426573426573427,
+       "grad_norm": 0.22854402328286066,
+       "learning_rate": 0.00020111731843575417,
+       "loss": 0.8641,
+       "step": 120
+     },
+     {
+       "epoch": 0.13986013986013987,
+       "grad_norm": 0.17876773160294948,
+       "learning_rate": 0.00020949720670391058,
+       "loss": 0.8625,
+       "step": 125
+     },
+     {
+       "epoch": 0.14545454545454545,
+       "grad_norm": 0.3615856863579962,
+       "learning_rate": 0.00021787709497206702,
+       "loss": 0.8504,
+       "step": 130
+     },
+     {
+       "epoch": 0.15104895104895105,
+       "grad_norm": 0.22751374781762473,
+       "learning_rate": 0.00022625698324022346,
+       "loss": 0.8683,
+       "step": 135
+     },
+     {
+       "epoch": 0.15664335664335666,
+       "grad_norm": 0.21790912834318224,
+       "learning_rate": 0.00023463687150837985,
+       "loss": 0.8519,
+       "step": 140
+     },
+     {
+       "epoch": 0.16223776223776223,
+       "grad_norm": 0.18707627983249808,
+       "learning_rate": 0.0002430167597765363,
+       "loss": 0.8426,
+       "step": 145
+     },
+     {
+       "epoch": 0.16783216783216784,
+       "grad_norm": 0.22404831326656477,
+       "learning_rate": 0.0002513966480446927,
+       "loss": 0.8509,
+       "step": 150
+     },
+     {
+       "epoch": 0.17342657342657342,
+       "grad_norm": 0.20919591839479193,
+       "learning_rate": 0.00025977653631284914,
+       "loss": 0.8594,
+       "step": 155
+     },
+     {
+       "epoch": 0.17902097902097902,
+       "grad_norm": 0.15579954649232924,
+       "learning_rate": 0.0002681564245810055,
+       "loss": 0.8319,
+       "step": 160
+     },
+     {
+       "epoch": 0.18461538461538463,
+       "grad_norm": 0.4055853577053493,
+       "learning_rate": 0.00027653631284916196,
+       "loss": 0.8577,
+       "step": 165
+     },
+     {
+       "epoch": 0.1902097902097902,
+       "grad_norm": 0.2945253672148607,
+       "learning_rate": 0.0002849162011173184,
+       "loss": 0.8579,
+       "step": 170
+     },
+     {
+       "epoch": 0.1958041958041958,
+       "grad_norm": 0.21772142109085163,
+       "learning_rate": 0.00029329608938547484,
+       "loss": 0.8382,
+       "step": 175
+     },
+     {
+       "epoch": 0.2013986013986014,
+       "grad_norm": 0.2281215849497856,
+       "learning_rate": 0.00029999971336506766,
+       "loss": 0.8331,
+       "step": 180
+     },
+     {
+       "epoch": 0.206993006993007,
+       "grad_norm": 0.17899592128620762,
+       "learning_rate": 0.0002999896812574594,
+       "loss": 0.8434,
+       "step": 185
+     },
+     {
+       "epoch": 0.2125874125874126,
+       "grad_norm": 0.14731246899285452,
+       "learning_rate": 0.0002999653184986775,
+       "loss": 0.8458,
+       "step": 190
+     },
+     {
+       "epoch": 0.21818181818181817,
+       "grad_norm": 0.14945248170502354,
+       "learning_rate": 0.00029992662741644334,
+       "loss": 0.8457,
+       "step": 195
+     },
+     {
+       "epoch": 0.22377622377622378,
+       "grad_norm": 0.14124929357614965,
+       "learning_rate": 0.0002998736117074673,
+       "loss": 0.8219,
+       "step": 200
+     },
+     {
+       "epoch": 0.22937062937062938,
+       "grad_norm": 0.18621904528265473,
+       "learning_rate": 0.0002998062764370954,
+       "loss": 0.8299,
+       "step": 205
+     },
+     {
+       "epoch": 0.23496503496503496,
+       "grad_norm": 0.22513445720588005,
+       "learning_rate": 0.00029972462803882523,
+       "loss": 0.8502,
+       "step": 210
+     },
+     {
+       "epoch": 0.24055944055944056,
+       "grad_norm": 0.1636384983699745,
+       "learning_rate": 0.0002996286743136916,
+       "loss": 0.8506,
+       "step": 215
+     },
+     {
+       "epoch": 0.24615384615384617,
+       "grad_norm": 0.19587892370520038,
+       "learning_rate": 0.000299518424429521,
+       "loss": 0.8382,
+       "step": 220
+     },
+     {
+       "epoch": 0.2517482517482518,
+       "grad_norm": 0.17371559645370255,
+       "learning_rate": 0.0002993938889200556,
+       "loss": 0.8322,
+       "step": 225
+     },
+     {
+       "epoch": 0.2573426573426573,
+       "grad_norm": 0.13779589201692538,
+       "learning_rate": 0.0002992550796839468,
+       "loss": 0.8243,
+       "step": 230
+     },
+     {
+       "epoch": 0.2629370629370629,
+       "grad_norm": 0.22266694135489237,
+       "learning_rate": 0.00029910200998361857,
+       "loss": 0.8332,
+       "step": 235
+     },
+     {
+       "epoch": 0.26853146853146853,
+       "grad_norm": 0.21324201617886082,
+       "learning_rate": 0.0002989346944440003,
+       "loss": 0.8377,
+       "step": 240
+     },
+     {
+       "epoch": 0.27412587412587414,
+       "grad_norm": 0.1378342855087125,
+       "learning_rate": 0.0002987531490511291,
+       "loss": 0.8346,
+       "step": 245
+     },
+     {
+       "epoch": 0.27972027972027974,
+       "grad_norm": 0.13885067623091277,
+       "learning_rate": 0.000298557391150623,
+       "loss": 0.8366,
+       "step": 250
+     },
+     {
+       "epoch": 0.2853146853146853,
+       "grad_norm": 0.13254675983004904,
+       "learning_rate": 0.00029834743944602316,
+       "loss": 0.8389,
+       "step": 255
+     },
+     {
+       "epoch": 0.2909090909090909,
+       "grad_norm": 0.15399189190157112,
+       "learning_rate": 0.0002981233139970071,
+       "loss": 0.8247,
+       "step": 260
+     },
+     {
+       "epoch": 0.2965034965034965,
+       "grad_norm": 0.16475565381996887,
+       "learning_rate": 0.0002978850362174722,
+       "loss": 0.8042,
+       "step": 265
+     },
+     {
+       "epoch": 0.3020979020979021,
+       "grad_norm": 0.1449584149552635,
+       "learning_rate": 0.0002976326288734894,
+       "loss": 0.8113,
+       "step": 270
+     },
+     {
+       "epoch": 0.3076923076923077,
+       "grad_norm": 0.1479148461740847,
+       "learning_rate": 0.0002973661160811284,
+       "loss": 0.8327,
+       "step": 275
+     },
+     {
+       "epoch": 0.3132867132867133,
+       "grad_norm": 0.11641111946130961,
+       "learning_rate": 0.00029708552330415337,
+       "loss": 0.8266,
+       "step": 280
+     },
+     {
+       "epoch": 0.31888111888111886,
+       "grad_norm": 0.1289644823631784,
+       "learning_rate": 0.0002967908773515898,
+       "loss": 0.8031,
+       "step": 285
+     },
+     {
+       "epoch": 0.32447552447552447,
+       "grad_norm": 0.19696780235530817,
+       "learning_rate": 0.0002964822063751635,
+       "loss": 0.8195,
+       "step": 290
+     },
+     {
+       "epoch": 0.3300699300699301,
+       "grad_norm": 0.13415245793361397,
+       "learning_rate": 0.00029615953986661056,
+       "loss": 0.8232,
+       "step": 295
+     },
+     {
+       "epoch": 0.3356643356643357,
+       "grad_norm": 0.1289064453230423,
+       "learning_rate": 0.0002958229086548595,
+       "loss": 0.811,
+       "step": 300
+     },
+     {
+       "epoch": 0.3412587412587413,
+       "grad_norm": 0.2030532789276685,
+       "learning_rate": 0.00029547234490308604,
+       "loss": 0.8196,
+       "step": 305
+     },
+     {
+       "epoch": 0.34685314685314683,
+       "grad_norm": 0.1690018504223368,
+       "learning_rate": 0.00029510788210563996,
+       "loss": 0.8176,
+       "step": 310
+     },
+     {
+       "epoch": 0.35244755244755244,
+       "grad_norm": 0.18348727298089326,
+       "learning_rate": 0.0002947295550848448,
+       "loss": 0.8106,
+       "step": 315
+     },
+     {
+       "epoch": 0.35804195804195804,
+       "grad_norm": 0.12176145136770436,
+       "learning_rate": 0.000294337399987671,
+       "loss": 0.8106,
+       "step": 320
+     },
+     {
+       "epoch": 0.36363636363636365,
+       "grad_norm": 0.14033633981287297,
+       "learning_rate": 0.0002939314542822821,
+       "loss": 0.8061,
+       "step": 325
+     },
+     {
+       "epoch": 0.36923076923076925,
+       "grad_norm": 0.1360601366236884,
+       "learning_rate": 0.0002935117567544547,
+       "loss": 0.8026,
+       "step": 330
+     },
+     {
+       "epoch": 0.3748251748251748,
+       "grad_norm": 0.15461610864168454,
+       "learning_rate": 0.0002930783475038734,
+       "loss": 0.8232,
+       "step": 335
+     },
+     {
+       "epoch": 0.3804195804195804,
+       "grad_norm": 0.10363695784053935,
+       "learning_rate": 0.0002926312679402985,
+       "loss": 0.8049,
+       "step": 340
+     },
+     {
+       "epoch": 0.386013986013986,
+       "grad_norm": 0.12894656658416323,
+       "learning_rate": 0.00029217056077961043,
+       "loss": 0.7993,
+       "step": 345
+     },
+     {
+       "epoch": 0.3916083916083916,
+       "grad_norm": 0.12013728582042035,
+       "learning_rate": 0.000291696270039728,
+       "loss": 0.7863,
+       "step": 350
+     },
+     {
+       "epoch": 0.3972027972027972,
+       "grad_norm": 0.10813698150493352,
+       "learning_rate": 0.0002912084410364029,
+       "loss": 0.7997,
+       "step": 355
+     },
+     {
+       "epoch": 0.4027972027972028,
+       "grad_norm": 0.17771298556035153,
+       "learning_rate": 0.00029070712037889,
+       "loss": 0.8018,
+       "step": 360
+     },
+     {
+       "epoch": 0.4083916083916084,
+       "grad_norm": 0.10837497819616222,
+       "learning_rate": 0.00029019235596549394,
+       "loss": 0.8078,
+       "step": 365
+     },
+     {
+       "epoch": 0.413986013986014,
+       "grad_norm": 0.12560903325827824,
+       "learning_rate": 0.0002896641969789932,
+       "loss": 0.8182,
+       "step": 370
+     },
+     {
+       "epoch": 0.4195804195804196,
+       "grad_norm": 0.17753966157649756,
+       "learning_rate": 0.0002891226938819405,
+       "loss": 0.8059,
+       "step": 375
+     },
+     {
+       "epoch": 0.4251748251748252,
+       "grad_norm": 0.12975220832490364,
+       "learning_rate": 0.0002885678984118415,
+       "loss": 0.7811,
+       "step": 380
+     },
+     {
+       "epoch": 0.4307692307692308,
+       "grad_norm": 0.18552799892105348,
+       "learning_rate": 0.0002879998635762118,
+       "loss": 0.799,
+       "step": 385
+     },
+     {
+       "epoch": 0.43636363636363634,
+       "grad_norm": 0.17744047449094297,
+       "learning_rate": 0.000287418643647512,
+       "loss": 0.7974,
+       "step": 390
+     },
+     {
+       "epoch": 0.44195804195804195,
+       "grad_norm": 0.13797503272598644,
+       "learning_rate": 0.00028682429415796267,
+       "loss": 0.7931,
+       "step": 395
+     },
+     {
+       "epoch": 0.44755244755244755,
+       "grad_norm": 0.12860800142514783,
+       "learning_rate": 0.0002862168718942383,
+       "loss": 0.7861,
+       "step": 400
+     },
+     {
+       "epoch": 0.45314685314685316,
+       "grad_norm": 0.12668805301774957,
+       "learning_rate": 0.00028559643489204186,
+       "loss": 0.8107,
+       "step": 405
+     },
+     {
+       "epoch": 0.45874125874125876,
+       "grad_norm": 0.11338400624283074,
+       "learning_rate": 0.0002849630424305595,
+       "loss": 0.8088,
+       "step": 410
+     },
+     {
+       "epoch": 0.4643356643356643,
+       "grad_norm": 0.1345570809457518,
+       "learning_rate": 0.00028431675502679717,
+       "loss": 0.8038,
+       "step": 415
+     },
+     {
+       "epoch": 0.4699300699300699,
+       "grad_norm": 0.11102734283999488,
+       "learning_rate": 0.00028365763442979823,
+       "loss": 0.8163,
+       "step": 420
+     },
+     {
+       "epoch": 0.4755244755244755,
+       "grad_norm": 0.12810877098380058,
+       "learning_rate": 0.000282985743614744,
+       "loss": 0.8017,
+       "step": 425
+     },
+     {
+       "epoch": 0.4811188811188811,
+       "grad_norm": 0.11873107897366113,
+       "learning_rate": 0.0002823011467769364,
+       "loss": 0.7957,
+       "step": 430
+     },
+     {
+       "epoch": 0.48671328671328673,
+       "grad_norm": 0.13272846461187762,
+       "learning_rate": 0.000281603909325665,
+       "loss": 0.8054,
+       "step": 435
+     },
+     {
+       "epoch": 0.49230769230769234,
+       "grad_norm": 0.10875049751356179,
+       "learning_rate": 0.00028089409787795716,
+       "loss": 0.7976,
+       "step": 440
+     },
+     {
+       "epoch": 0.4979020979020979,
+       "grad_norm": 0.11634952268956057,
+       "learning_rate": 0.0002801717802522132,
+       "loss": 0.792,
+       "step": 445
+     },
+     {
+       "epoch": 0.5034965034965035,
+       "grad_norm": 0.10607280522202725,
+       "learning_rate": 0.00027943702546172697,
+       "loss": 0.8078,
+       "step": 450
+     },
+     {
+       "epoch": 0.509090909090909,
+       "grad_norm": 0.15901805386261234,
+       "learning_rate": 0.00027868990370809164,
+       "loss": 0.8023,
+       "step": 455
+     },
+     {
+       "epoch": 0.5146853146853146,
+       "grad_norm": 0.12222508621029372,
+       "learning_rate": 0.00027793048637449273,
+       "loss": 0.7956,
+       "step": 460
+     },
+     {
+       "epoch": 0.5202797202797202,
+       "grad_norm": 0.10769602993732762,
+       "learning_rate": 0.0002771588460188876,
+       "loss": 0.7897,
+       "step": 465
+     },
+     {
+       "epoch": 0.5258741258741259,
+       "grad_norm": 0.1119724845850892,
+       "learning_rate": 0.00027637505636707315,
+       "loss": 0.7901,
+       "step": 470
+     },
+     {
+       "epoch": 0.5314685314685315,
+       "grad_norm": 0.11279764446201575,
+       "learning_rate": 0.0002755791923056415,
+       "loss": 0.79,
+       "step": 475
+     },
+     {
+       "epoch": 0.5370629370629371,
+       "grad_norm": 0.16791268697388115,
+       "learning_rate": 0.0002747713298748253,
+       "loss": 0.7909,
+       "step": 480
+     },
+     {
+       "epoch": 0.5426573426573427,
+       "grad_norm": 0.14084691053374204,
+       "learning_rate": 0.00027395154626123225,
+       "loss": 0.8013,
+       "step": 485
+     },
+     {
+       "epoch": 0.5482517482517483,
+       "grad_norm": 0.12452861008817544,
+       "learning_rate": 0.00027311991979047046,
+       "loss": 0.7888,
+       "step": 490
+     },
+     {
+       "epoch": 0.5538461538461539,
+       "grad_norm": 0.12432408060887955,
+       "learning_rate": 0.00027227652991966507,
+       "loss": 0.7736,
+       "step": 495
+     },
+     {
+       "epoch": 0.5594405594405595,
+       "grad_norm": 0.1167273022435753,
+       "learning_rate": 0.00027142145722986637,
+       "loss": 0.7892,
+       "step": 500
+     },
+     {
+       "epoch": 0.5650349650349651,
+       "grad_norm": 0.1570515101772784,
+       "learning_rate": 0.0002705547834183506,
+       "loss": 0.7735,
+       "step": 505
+     },
+     {
+       "epoch": 0.5706293706293706,
+       "grad_norm": 0.1757157276572827,
+       "learning_rate": 0.00026967659129081465,
+       "loss": 0.7947,
+       "step": 510
+     },
+     {
+       "epoch": 0.5762237762237762,
+       "grad_norm": 0.10842613803532425,
+       "learning_rate": 0.0002687869647534643,
+       "loss": 0.7844,
+       "step": 515
+     },
+     {
+       "epoch": 0.5818181818181818,
+       "grad_norm": 0.14559223171878447,
+       "learning_rate": 0.0002678859888049972,
+       "loss": 0.7881,
+       "step": 520
+     },
+     {
+       "epoch": 0.5874125874125874,
+       "grad_norm": 0.13398356861671987,
+       "learning_rate": 0.0002669737495284819,
+       "loss": 0.7731,
+       "step": 525
+     },
+     {
+       "epoch": 0.593006993006993,
+       "grad_norm": 0.16426998549603838,
+       "learning_rate": 0.00026605033408313354,
+       "loss": 0.7819,
+       "step": 530
+     },
+     {
+       "epoch": 0.5986013986013986,
+       "grad_norm": 0.12765279377973468,
+       "learning_rate": 0.0002651158306959855,
+       "loss": 0.7725,
+       "step": 535
+     },
+     {
+       "epoch": 0.6041958041958042,
+       "grad_norm": 0.11028117529632633,
+       "learning_rate": 0.00026417032865346023,
+       "loss": 0.7926,
+       "step": 540
+     },
+     {
+       "epoch": 0.6097902097902098,
+       "grad_norm": 0.12782289803306612,
+       "learning_rate": 0.00026321391829283884,
+       "loss": 0.7634,
+       "step": 545
+     },
+     {
+       "epoch": 0.6153846153846154,
+       "grad_norm": 0.1369549911517866,
+       "learning_rate": 0.0002622466909936289,
+       "loss": 0.7628,
+       "step": 550
+     },
+     {
+       "epoch": 0.620979020979021,
+       "grad_norm": 0.11521467825966626,
+       "learning_rate": 0.0002612687391688347,
+       "loss": 0.7763,
+       "step": 555
+     },
+     {
+       "epoch": 0.6265734265734266,
+       "grad_norm": 0.12350512688772212,
+       "learning_rate": 0.00026028015625612706,
+       "loss": 0.7884,
+       "step": 560
+     },
+     {
+       "epoch": 0.6321678321678321,
+       "grad_norm": 0.10825882258061882,
+       "learning_rate": 0.000259281036708916,
+       "loss": 0.7945,
+       "step": 565
+     },
+     {
+       "epoch": 0.6377622377622377,
+       "grad_norm": 0.11150400278758185,
+       "learning_rate": 0.00025827147598732656,
+       "loss": 0.7862,
+       "step": 570
+     },
+     {
+       "epoch": 0.6433566433566433,
+       "grad_norm": 0.10337975508652891,
+       "learning_rate": 0.00025725157054907777,
+       "loss": 0.7838,
+       "step": 575
+     },
+     {
+       "epoch": 0.6489510489510489,
+       "grad_norm": 0.09770680702107988,
+       "learning_rate": 0.0002562214178402669,
+       "loss": 0.7969,
+       "step": 580
+     },
+     {
+       "epoch": 0.6545454545454545,
+       "grad_norm": 0.1163323874465344,
+       "learning_rate": 0.00025518111628605885,
+       "loss": 0.7819,
+       "step": 585
+     },
+     {
+       "epoch": 0.6601398601398601,
+       "grad_norm": 0.11896295851189105,
+       "learning_rate": 0.00025413076528128255,
+       "loss": 0.7709,
+       "step": 590
+     },
+     {
+       "epoch": 0.6657342657342658,
+       "grad_norm": 0.11330549108690711,
+       "learning_rate": 0.0002530704651809339,
+       "loss": 0.7744,
+       "step": 595
+     },
+     {
+       "epoch": 0.6713286713286714,
+       "grad_norm": 0.13654029100152085,
+       "learning_rate": 0.0002520003172905878,
+       "loss": 0.7952,
+       "step": 600
+     },
+     {
+       "epoch": 0.676923076923077,
+       "grad_norm": 0.10351068732360345,
+       "learning_rate": 0.0002509204238567186,
+       "loss": 0.7755,
+       "step": 605
+     },
+     {
+       "epoch": 0.6825174825174826,
+       "grad_norm": 0.14077884569885882,
+       "learning_rate": 0.00024983088805693163,
+       "loss": 0.7831,
+       "step": 610
+     },
+     {
+       "epoch": 0.6881118881118881,
+       "grad_norm": 0.11948510265909233,
+       "learning_rate": 0.00024873181399010446,
+       "loss": 0.7861,
+       "step": 615
+     },
+     {
+       "epoch": 0.6937062937062937,
+       "grad_norm": 0.12755481254736167,
+       "learning_rate": 0.00024762330666644136,
+       "loss": 0.7782,
+       "step": 620
+     },
+     {
+       "epoch": 0.6993006993006993,
+       "grad_norm": 0.10102773025347063,
+       "learning_rate": 0.0002465054719974401,
+       "loss": 0.7731,
+       "step": 625
+     },
+     {
+       "epoch": 0.7048951048951049,
+       "grad_norm": 0.11665875780674151,
+       "learning_rate": 0.0002453784167857725,
+       "loss": 0.7839,
+       "step": 630
+     },
+     {
+       "epoch": 0.7104895104895105,
+       "grad_norm": 0.1074509342573267,
+       "learning_rate": 0.00024424224871508014,
+       "loss": 0.7769,
+       "step": 635
+     },
+     {
+       "epoch": 0.7160839160839161,
+       "grad_norm": 0.12422918258396683,
+       "learning_rate": 0.0002430970763396861,
+       "loss": 0.7754,
+       "step": 640
+     },
+     {
+       "epoch": 0.7216783216783217,
+       "grad_norm": 0.11786940518440715,
+       "learning_rate": 0.00024194300907422276,
+       "loss": 0.7974,
+       "step": 645
+     },
+     {
+       "epoch": 0.7272727272727273,
+       "grad_norm": 0.10398253596299759,
+       "learning_rate": 0.00024078015718317818,
+       "loss": 0.7729,
+       "step": 650
+     },
+     {
+       "epoch": 0.7328671328671329,
+       "grad_norm": 0.09903776559603776,
+       "learning_rate": 0.00023960863177036079,
+       "loss": 0.774,
+       "step": 655
+     },
+     {
+       "epoch": 0.7384615384615385,
+       "grad_norm": 0.09179018818178636,
+       "learning_rate": 0.00023842854476828411,
+       "loss": 0.7629,
+       "step": 660
+     },
+     {
+       "epoch": 0.7440559440559441,
+       "grad_norm": 0.10143244330106851,
+       "learning_rate": 0.0002372400089274724,
+       "loss": 0.781,
+       "step": 665
+     },
+     {
+       "epoch": 0.7496503496503496,
+       "grad_norm": 0.1121174501554115,
+       "learning_rate": 0.00023604313780568772,
+       "loss": 0.7811,
+       "step": 670
+     },
+     {
+       "epoch": 0.7552447552447552,
+       "grad_norm": 0.10047620860251714,
+       "learning_rate": 0.00023483804575708027,
+       "loss": 0.7752,
+       "step": 675
+     },
+     {
+       "epoch": 0.7608391608391608,
+       "grad_norm": 0.11068987100435401,
+       "learning_rate": 0.0002336248479212626,
+       "loss": 0.7657,
+       "step": 680
+     },
+     {
+       "epoch": 0.7664335664335664,
+       "grad_norm": 0.12535370469312143,
+       "learning_rate": 0.0002324036602123086,
+       "loss": 0.7731,
+       "step": 685
+     },
+     {
+       "epoch": 0.772027972027972,
+       "grad_norm": 0.11059830294321518,
+       "learning_rate": 0.00023117459930767847,
+       "loss": 0.7831,
+       "step": 690
+     },
+     {
+       "epoch": 0.7776223776223776,
+       "grad_norm": 0.11073279493368117,
+       "learning_rate": 0.00022993778263707105,
+       "loss": 0.7705,
+       "step": 695
+     },
+     {
+       "epoch": 0.7832167832167832,
+       "grad_norm": 0.12764331660853,
+       "learning_rate": 0.000228693328371204,
+       "loss": 0.7816,
+       "step": 700
+     },
+     {
+       "epoch": 0.7888111888111888,
+       "grad_norm": 0.10951855988735575,
+       "learning_rate": 0.0002274413554105232,
+       "loss": 0.7577,
+       "step": 705
+     },
+     {
+       "epoch": 0.7944055944055944,
+       "grad_norm": 0.12320185793067193,
+       "learning_rate": 0.00022618198337384264,
+       "loss": 0.7744,
+       "step": 710
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 0.10716655164464266,
+       "learning_rate": 0.00022491533258691546,
+       "loss": 0.7752,
+       "step": 715
+     },
+     {
+       "epoch": 0.8055944055944056,
+       "grad_norm": 0.11962877035421185,
+       "learning_rate": 0.00022364152407093737,
+       "loss": 0.7812,
+       "step": 720
+     },
+     {
+       "epoch": 0.8111888111888111,
+       "grad_norm": 0.10651252517324981,
+       "learning_rate": 0.00022236067953098414,
+       "loss": 0.78,
+       "step": 725
+     },
+     {
+       "epoch": 0.8167832167832167,
+       "grad_norm": 0.1086890422520719,
+       "learning_rate": 0.00022107292134438298,
+       "loss": 0.7801,
+       "step": 730
+     },
+     {
+       "epoch": 0.8223776223776224,
+       "grad_norm": 0.09743032182612474,
+       "learning_rate": 0.00021977837254902034,
+       "loss": 0.7762,
+       "step": 735
+     },
+     {
+       "epoch": 0.827972027972028,
+       "grad_norm": 0.09964450561189496,
+       "learning_rate": 0.0002184771568315862,
+       "loss": 0.7809,
+       "step": 740
+     },
+     {
+       "epoch": 0.8335664335664336,
+       "grad_norm": 0.09852105354101268,
+       "learning_rate": 0.0002171693985157567,
+       "loss": 0.7803,
+       "step": 745
+     },
+     {
+       "epoch": 0.8391608391608392,
+       "grad_norm": 0.10751247181443387,
+       "learning_rate": 0.00021585522255031554,
+       "loss": 0.754,
+       "step": 750
+     },
+     {
+       "epoch": 0.8447552447552448,
+       "grad_norm": 0.10083658202350868,
+       "learning_rate": 0.00021453475449721593,
+       "loss": 0.7689,
+       "step": 755
+     },
+     {
+       "epoch": 0.8503496503496504,
+       "grad_norm": 0.1132302634053956,
+       "learning_rate": 0.00021320812051958392,
+       "loss": 0.7667,
+       "step": 760
+     },
+     {
+       "epoch": 0.855944055944056,
+       "grad_norm": 0.09942757452904256,
+       "learning_rate": 0.00021187544736966403,
+       "loss": 0.7798,
+       "step": 765
+     },
+     {
+       "epoch": 0.8615384615384616,
+       "grad_norm": 0.11119180151464317,
+       "learning_rate": 0.00021053686237670912,
+       "loss": 0.7768,
+       "step": 770
+     },
+     {
+       "epoch": 0.8671328671328671,
+       "grad_norm": 0.11012292213675227,
+       "learning_rate": 0.0002091924934348146,
+       "loss": 0.7641,
+       "step": 775
+     },
+     {
+       "epoch": 0.8727272727272727,
+       "grad_norm": 0.10508899250762296,
+       "learning_rate": 0.0002078424689906988,
+       "loss": 0.772,
+       "step": 780
+     },
+     {
+       "epoch": 0.8783216783216783,
+       "grad_norm": 0.09916724236606267,
+       "learning_rate": 0.00020648691803143088,
+       "loss": 0.7798,
+       "step": 785
+     },
+     {
+       "epoch": 0.8839160839160839,
+       "grad_norm": 0.10139136991498161,
+       "learning_rate": 0.00020512597007210672,
+       "loss": 0.7595,
+       "step": 790
+     },
+     {
+       "epoch": 0.8895104895104895,
+       "grad_norm": 0.09922677378640356,
+       "learning_rate": 0.00020375975514347447,
+       "loss": 0.7582,
+       "step": 795
+     },
+     {
+       "epoch": 0.8951048951048951,
+       "grad_norm": 0.10734407771584646,
+       "learning_rate": 0.0002023884037795109,
+       "loss": 0.7747,
+       "step": 800
+     },
+     {
+       "epoch": 0.9006993006993007,
+       "grad_norm": 0.09003146154158007,
+       "learning_rate": 0.00020101204700494963,
+       "loss": 0.772,
+       "step": 805
+     },
+     {
+       "epoch": 0.9062937062937063,
+       "grad_norm": 0.10224227095976963,
+       "learning_rate": 0.00019963081632276244,
+       "loss": 0.7632,
+       "step": 810
+     },
+     {
+       "epoch": 0.9118881118881119,
+       "grad_norm": 0.10634964894349899,
+       "learning_rate": 0.00019824484370159511,
+       "loss": 0.7621,
+       "step": 815
+     },
+     {
+       "epoch": 0.9174825174825175,
+       "grad_norm": 0.08669203171955091,
+       "learning_rate": 0.00019685426156315817,
+       "loss": 0.7678,
+       "step": 820
+     },
+     {
+       "epoch": 0.9230769230769231,
+       "grad_norm": 0.09236536065178236,
+       "learning_rate": 0.00019545920276957512,
+       "loss": 0.7615,
+       "step": 825
+     },
+     {
+       "epoch": 0.9286713286713286,
+       "grad_norm": 0.09051106183048227,
+       "learning_rate": 0.00019405980061068813,
+       "loss": 0.7538,
+       "step": 830
+     },
+     {
+       "epoch": 0.9342657342657342,
+       "grad_norm": 0.09075314123364801,
+       "learning_rate": 0.00019265618879132294,
+       "loss": 0.7695,
+       "step": 835
+     },
+     {
+       "epoch": 0.9398601398601398,
+       "grad_norm": 0.09779655862085175,
+       "learning_rate": 0.000191248501418514,
+       "loss": 0.7486,
+       "step": 840
+     },
+     {
+       "epoch": 0.9454545454545454,
+       "grad_norm": 0.09284439899834651,
+       "learning_rate": 0.00018983687298869165,
+       "loss": 0.7757,
+       "step": 845
+     },
+     {
+       "epoch": 0.951048951048951,
+       "grad_norm": 0.10906263614923466,
+       "learning_rate": 0.00018842143837483137,
+       "loss": 0.7654,
+       "step": 850
+     },
+     {
+       "epoch": 0.9566433566433566,
+       "grad_norm": 0.08711563033296012,
+       "learning_rate": 0.00018700233281356774,
+       "loss": 0.7661,
+       "step": 855
+     },
+     {
+       "epoch": 0.9622377622377623,
+       "grad_norm": 0.10105386217514606,
+       "learning_rate": 0.00018557969189227327,
+       "loss": 0.7566,
+       "step": 860
+     },
+     {
+       "epoch": 0.9678321678321679,
+       "grad_norm": 0.09374075010702233,
+       "learning_rate": 0.00018415365153610363,
+       "loss": 0.7505,
+       "step": 865
+     },
+     {
+       "epoch": 0.9734265734265735,
+       "grad_norm": 0.09217739631223294,
+       "learning_rate": 0.00018272434799501108,
+       "loss": 0.7513,
+       "step": 870
+     },
+     {
+       "epoch": 0.9790209790209791,
+       "grad_norm": 0.10860300198578388,
+       "learning_rate": 0.00018129191783072644,
+       "loss": 0.7586,
+       "step": 875
+     },
+     {
+       "epoch": 0.9846153846153847,
+       "grad_norm": 0.10382198085691989,
+       "learning_rate": 0.00017985649790371123,
+       "loss": 0.7712,
+       "step": 880
+     },
+     {
+       "epoch": 0.9902097902097902,
+       "grad_norm": 0.09718012392096963,
+       "learning_rate": 0.00017841822536008174,
+       "loss": 0.7548,
+       "step": 885
+     },
+     {
+       "epoch": 0.9958041958041958,
+       "grad_norm": 0.09129380340752612,
+       "learning_rate": 0.00017697723761850529,
+       "loss": 0.7442,
+       "step": 890
+     },
+     {
+       "epoch": 0.9991608391608392,
+       "eval_loss": 1.078917145729065,
+       "eval_runtime": 368.4649,
+       "eval_samples_per_second": 55.878,
+       "eval_steps_per_second": 1.748,
+       "step": 893
+     },
+     {
+       "epoch": 1.0013986013986014,
+       "grad_norm": 0.2193429577704184,
+       "learning_rate": 0.0001755336723570709,
+       "loss": 0.7304,
+       "step": 895
+     },
+     {
+       "epoch": 1.006993006993007,
+       "grad_norm": 0.12029696024418647,
+       "learning_rate": 0.00017408766750013455,
+       "loss": 0.6883,
+       "step": 900
+     },
+     {
+       "epoch": 1.0125874125874126,
+       "grad_norm": 0.11157245101821324,
+       "learning_rate": 0.0001726393612051416,
+       "loss": 0.6937,
+       "step": 905
+     },
+     {
+       "epoch": 1.018181818181818,
+       "grad_norm": 0.11562847307206518,
+       "learning_rate": 0.0001711888918494268,
+       "loss": 0.7072,
+       "step": 910
+     },
+     {
+       "epoch": 1.0237762237762238,
+       "grad_norm": 0.09699091401508494,
+       "learning_rate": 0.00016973639801699258,
+       "loss": 0.7002,
+       "step": 915
+     },
+     {
+       "epoch": 1.0293706293706293,
+       "grad_norm": 0.10329864513789806,
+       "learning_rate": 0.0001682820184852687,
+       "loss": 0.7049,
+       "step": 920
+     },
+     {
+       "epoch": 1.034965034965035,
+       "grad_norm": 0.10505439144371424,
+       "learning_rate": 0.0001668258922118525,
+       "loss": 0.7062,
+       "step": 925
+     },
+     {
+       "epoch": 1.0405594405594405,
+       "grad_norm": 0.09345853743066072,
+       "learning_rate": 0.0001653681583212326,
+       "loss": 0.705,
+       "step": 930
+     },
+     {
+       "epoch": 1.0461538461538462,
+       "grad_norm": 0.14624259074546206,
+       "learning_rate": 0.00016390895609149608,
+       "loss": 0.6862,
+       "step": 935
+     },
+     {
+       "epoch": 1.0517482517482517,
+       "grad_norm": 0.09484870970062163,
+       "learning_rate": 0.00016244842494102135,
+       "loss": 0.6794,
+       "step": 940
+     },
+     {
+       "epoch": 1.0573426573426574,
+       "grad_norm": 0.1093126433741654,
+       "learning_rate": 0.00016098670441515759,
+       "loss": 0.6965,
+       "step": 945
+     },
+     {
+       "epoch": 1.062937062937063,
+       "grad_norm": 0.08709955416922696,
+       "learning_rate": 0.000159523934172892,
+       "loss": 0.6875,
+       "step": 950
+     },
+     {
+       "epoch": 1.0685314685314686,
+       "grad_norm": 0.08587078936550793,
+       "learning_rate": 0.00015806025397350617,
+       "loss": 0.6816,
+       "step": 955
+     },
+     {
+       "epoch": 1.0741258741258741,
+       "grad_norm": 0.09226255228367097,
+       "learning_rate": 0.00015659580366322265,
+       "loss": 0.6909,
+       "step": 960
+     },
+     {
+       "epoch": 1.0797202797202796,
+       "grad_norm": 0.0936240844549708,
+       "learning_rate": 0.00015513072316184393,
+       "loss": 0.6904,
+       "step": 965
+     },
+     {
+       "epoch": 1.0853146853146853,
+       "grad_norm": 0.085659439411888,
+       "learning_rate": 0.0001536651524493834,
+       "loss": 0.6874,
+       "step": 970
+     },
+     {
+       "epoch": 1.0909090909090908,
+       "grad_norm": 0.09924612907304702,
+       "learning_rate": 0.00015219923155269157,
+       "loss": 0.6953,
+       "step": 975
+     },
+     {
+       "epoch": 1.0965034965034965,
+       "grad_norm": 0.10028513455623837,
+       "learning_rate": 0.00015073310053207665,
+       "loss": 0.6967,
+       "step": 980
+     },
+     {
+       "epoch": 1.102097902097902,
+       "grad_norm": 0.09345136457275896,
+       "learning_rate": 0.00014926689946792332,
+       "loss": 0.6905,
+       "step": 985
+     },
+     {
+       "epoch": 1.1076923076923078,
+       "grad_norm": 0.08893419418399162,
+       "learning_rate": 0.00014780076844730849,
+       "loss": 0.6985,
+       "step": 990
+     },
+     {
+       "epoch": 1.1132867132867132,
+       "grad_norm": 0.08853702003030435,
+       "learning_rate": 0.00014633484755061658,
+       "loss": 0.7014,
+       "step": 995
+     },
+     {
+       "epoch": 1.118881118881119,
+       "grad_norm": 0.08365464476596743,
+       "learning_rate": 0.0001448692768381561,
+       "loss": 0.697,
+       "step": 1000
+     },
+     {
+       "epoch": 1.1244755244755245,
+       "grad_norm": 0.08780213786162074,
+       "learning_rate": 0.00014340419633677732,
+       "loss": 0.7025,
+       "step": 1005
+     },
+     {
+       "epoch": 1.1300699300699302,
+       "grad_norm": 0.0884295613746686,
+       "learning_rate": 0.00014193974602649386,
+       "loss": 0.6993,
+       "step": 1010
+     },
+     {
+       "epoch": 1.1356643356643357,
+       "grad_norm": 0.09059540370700289,
+       "learning_rate": 0.00014047606582710798,
+       "loss": 0.6948,
+       "step": 1015
+     },
+     {
+       "epoch": 1.1412587412587412,
+       "grad_norm": 0.09899100050319447,
+       "learning_rate": 0.00013901329558484236,
+       "loss": 0.6992,
+       "step": 1020
+     },
+     {
+       "epoch": 1.1468531468531469,
+       "grad_norm": 0.09283620847093584,
+       "learning_rate": 0.00013755157505897868,
+       "loss": 0.7184,
+       "step": 1025
+     },
+     {
+       "epoch": 1.1524475524475524,
+       "grad_norm": 0.08887024389956383,
+       "learning_rate": 0.00013609104390850392,
+       "loss": 0.697,
+       "step": 1030
+     },
+     {
+       "epoch": 1.158041958041958,
+       "grad_norm": 0.08693816785751791,
+       "learning_rate": 0.0001346318416787674,
+       "loss": 0.6939,
+       "step": 1035
+     },
+     {
+       "epoch": 1.1636363636363636,
+       "grad_norm": 0.09116425941497776,
+       "learning_rate": 0.00013317410778814745,
+       "loss": 0.6989,
+       "step": 1040
+     },
+     {
+       "epoch": 1.1692307692307693,
+       "grad_norm": 0.08570466903745366,
+       "learning_rate": 0.00013171798151473133,
+       "loss": 0.6956,
+       "step": 1045
+     },
+     {
+       "epoch": 1.1748251748251748,
+       "grad_norm": 0.0974948891545614,
+       "learning_rate": 0.0001302636019830074,
+       "loss": 0.6965,
+       "step": 1050
+     },
+     {
+       "epoch": 1.1804195804195805,
+       "grad_norm": 0.08995597994573838,
+       "learning_rate": 0.0001288111081505732,
+       "loss": 0.7041,
+       "step": 1055
+     },
+     {
+       "epoch": 1.186013986013986,
+       "grad_norm": 0.07861264078439004,
+       "learning_rate": 0.00012736063879485837,
+       "loss": 0.7032,
+       "step": 1060
+     },
+     {
+       "epoch": 1.1916083916083915,
+       "grad_norm": 0.0841514205254347,
+       "learning_rate": 0.0001259123324998655,
+       "loss": 0.6905,
+       "step": 1065
+     },
+     {
+       "epoch": 1.1972027972027972,
+       "grad_norm": 0.08692540683602883,
+       "learning_rate": 0.0001244663276429291,
+       "loss": 0.7074,
+       "step": 1070
+     },
+     {
+       "epoch": 1.2027972027972027,
+       "grad_norm": 0.08776833500890557,
+       "learning_rate": 0.00012302276238149463,
+       "loss": 0.7041,
+       "step": 1075
+     },
+     {
+       "epoch": 1.2083916083916084,
+       "grad_norm": 0.08727180482054969,
+       "learning_rate": 0.00012158177463991828,
+       "loss": 0.696,
+       "step": 1080
+     },
+     {
+       "epoch": 1.213986013986014,
+       "grad_norm": 0.08046559487267867,
+       "learning_rate": 0.00012014350209628875,
+       "loss": 0.6826,
+       "step": 1085
+     },
+     {
+       "epoch": 1.2195804195804196,
+       "grad_norm": 0.08781752670322365,
+       "learning_rate": 0.00011870808216927356,
+       "loss": 0.6999,
+       "step": 1090
+     },
+     {
+       "epoch": 1.2251748251748251,
+       "grad_norm": 0.08813740501797863,
+       "learning_rate": 0.00011727565200498888,
+       "loss": 0.7037,
+       "step": 1095
+     },
+     {
+       "epoch": 1.2307692307692308,
+       "grad_norm": 0.08772222997454243,
+       "learning_rate": 0.00011584634846389638,
+       "loss": 0.6986,
+       "step": 1100
+     },
+     {
+       "epoch": 1.2363636363636363,
+       "grad_norm": 0.10332929019104932,
+       "learning_rate": 0.00011442030810772673,
+       "loss": 0.6725,
+       "step": 1105
+     },
+     {
+       "epoch": 1.241958041958042,
+       "grad_norm": 0.08709301562019137,
+       "learning_rate": 0.00011299766718643226,
+       "loss": 0.7063,
+       "step": 1110
+     },
+     {
+       "epoch": 1.2475524475524475,
+       "grad_norm": 0.08449696490718896,
+       "learning_rate": 0.00011157856162516863,
+       "loss": 0.692,
+       "step": 1115
+     },
+     {
+       "epoch": 1.2531468531468533,
+       "grad_norm": 0.08124616270666368,
+       "learning_rate": 0.00011016312701130841,
+       "loss": 0.6915,
+       "step": 1120
+     },
+     {
+       "epoch": 1.2587412587412588,
+       "grad_norm": 0.09376360999458151,
+       "learning_rate": 0.000108751498581486,
+       "loss": 0.6939,
+       "step": 1125
+     },
+     {
+       "epoch": 1.2643356643356642,
+       "grad_norm": 0.08677694965635109,
+       "learning_rate": 0.00010734381120867707,
+       "loss": 0.7029,
+       "step": 1130
+     },
+     {
+       "epoch": 1.26993006993007,
+       "grad_norm": 0.08737879433308637,
+       "learning_rate": 0.00010594019938931187,
+       "loss": 0.6849,
+       "step": 1135
+     },
+     {
+       "epoch": 1.2755244755244755,
+       "grad_norm": 0.08311271942623545,
+       "learning_rate": 0.00010454079723042485,
+       "loss": 0.6799,
+       "step": 1140
+     },
+     {
+       "epoch": 1.2811188811188812,
+       "grad_norm": 0.07508666997892824,
+       "learning_rate": 0.00010314573843684183,
+       "loss": 0.6979,
+       "step": 1145
+     },
+     {
+       "epoch": 1.2867132867132867,
+       "grad_norm": 0.07967922610661686,
+       "learning_rate": 0.00010175515629840487,
+       "loss": 0.6793,
+       "step": 1150
+     },
+     {
+       "epoch": 1.2923076923076924,
+       "grad_norm": 0.08373250104545919,
+       "learning_rate": 0.00010036918367723754,
+       "loss": 0.6942,
+       "step": 1155
+     },
+     {
+       "epoch": 1.2979020979020979,
+       "grad_norm": 0.08415435083106365,
+       "learning_rate": 9.898795299505037e-05,
+       "loss": 0.6843,
+       "step": 1160
+     },
+     {
+       "epoch": 1.3034965034965036,
+       "grad_norm": 0.0972925237131085,
+       "learning_rate": 9.761159622048914e-05,
+       "loss": 0.6786,
+       "step": 1165
+     },
+     {
+       "epoch": 1.309090909090909,
+       "grad_norm": 0.08332589008281578,
+       "learning_rate": 9.624024485652552e-05,
+       "loss": 0.6895,
+       "step": 1170
+     },
+     {
+       "epoch": 1.3146853146853146,
+       "grad_norm": 0.08324352673680815,
+       "learning_rate": 9.48740299278933e-05,
+       "loss": 0.6902,
+       "step": 1175
+     },
+     {
+       "epoch": 1.3202797202797203,
+       "grad_norm": 0.08212510567034546,
+       "learning_rate": 9.351308196856911e-05,
+       "loss": 0.6861,
+       "step": 1180
+     },
+     {
+       "epoch": 1.325874125874126,
+       "grad_norm": 0.08320001036155726,
+       "learning_rate": 9.215753100930118e-05,
+       "loss": 0.6943,
+       "step": 1185
+     },
+     {
+       "epoch": 1.3314685314685315,
+       "grad_norm": 0.09026366102833903,
+       "learning_rate": 9.08075065651854e-05,
+       "loss": 0.7031,
+       "step": 1190
+     },
+     {
+       "epoch": 1.337062937062937,
+       "grad_norm": 0.07809171125830346,
+       "learning_rate": 8.946313762329081e-05,
+       "loss": 0.6974,
+       "step": 1195
+     },
+     {
+       "epoch": 1.3426573426573427,
+       "grad_norm": 0.07708050011669573,
+       "learning_rate": 8.812455263033595e-05,
+       "loss": 0.7072,
+       "step": 1200
+     },
+     {
+       "epoch": 1.3482517482517482,
+       "grad_norm": 0.08672106026682487,
+       "learning_rate": 8.679187948041605e-05,
+       "loss": 0.6946,
+       "step": 1205
+     },
+     {
+       "epoch": 1.353846153846154,
+       "grad_norm": 0.08223294476361295,
+       "learning_rate": 8.546524550278405e-05,
+       "loss": 0.6917,
+       "step": 1210
+     },
+     {
+       "epoch": 1.3594405594405594,
+       "grad_norm": 0.07572041983819904,
+       "learning_rate": 8.414477744968441e-05,
+       "loss": 0.7068,
+       "step": 1215
+     },
+     {
+       "epoch": 1.365034965034965,
+       "grad_norm": 0.07935310328973877,
+       "learning_rate": 8.283060148424328e-05,
+       "loss": 0.6825,
+       "step": 1220
+     },
+     {
+       "epoch": 1.3706293706293706,
+       "grad_norm": 0.08084222220036809,
+       "learning_rate": 8.152284316841382e-05,
+       "loss": 0.6895,
+       "step": 1225
+     },
+     {
+       "epoch": 1.3762237762237763,
+       "grad_norm": 0.08362754142551204,
+       "learning_rate": 8.02216274509797e-05,
+       "loss": 0.6855,
+       "step": 1230
+     },
+     {
+       "epoch": 1.3818181818181818,
+       "grad_norm": 0.08263146843817087,
+       "learning_rate": 7.892707865561702e-05,
+       "loss": 0.685,
+       "step": 1235
+     },
+     {
+       "epoch": 1.3874125874125873,
+       "grad_norm": 0.07614085795637057,
+       "learning_rate": 7.763932046901587e-05,
+       "loss": 0.698,
+       "step": 1240
+     },
+     {
+       "epoch": 1.393006993006993,
+       "grad_norm": 0.08297289392137173,
+       "learning_rate": 7.635847592906259e-05,
+       "loss": 0.6892,
+       "step": 1245
+     },
+     {
+       "epoch": 1.3986013986013985,
+       "grad_norm": 0.09257478488971461,
+       "learning_rate": 7.50846674130845e-05
1773
+ "loss": 0.6819,
1774
+ "step": 1250
1775
+ },
1776
+ {
1777
+ "epoch": 1.4041958041958043,
1778
+ "grad_norm": 0.0820635181024322,
1779
+ "learning_rate": 7.381801662615731e-05,
1780
+ "loss": 0.6836,
1781
+ "step": 1255
1782
+ },
1783
+ {
1784
+ "epoch": 1.4097902097902097,
1785
+ "grad_norm": 0.08479286795375468,
1786
+ "learning_rate": 7.255864458947677e-05,
1787
+ "loss": 0.6838,
1788
+ "step": 1260
1789
+ },
1790
+ {
1791
+ "epoch": 1.4153846153846155,
1792
+ "grad_norm": 0.08169299364006637,
1793
+ "learning_rate": 7.130667162879602e-05,
1794
+ "loss": 0.6912,
1795
+ "step": 1265
1796
+ },
1797
+ {
1798
+ "epoch": 1.420979020979021,
1799
+ "grad_norm": 0.0847296311273153,
1800
+ "learning_rate": 7.006221736292892e-05,
1801
+ "loss": 0.6824,
1802
+ "step": 1270
1803
+ },
1804
+ {
1805
+ "epoch": 1.4265734265734267,
1806
+ "grad_norm": 0.07999623959693618,
1807
+ "learning_rate": 6.882540069232155e-05,
1808
+ "loss": 0.6806,
1809
+ "step": 1275
1810
+ },
1811
+ {
1812
+ "epoch": 1.4321678321678322,
1813
+ "grad_norm": 0.07964123970317083,
1814
+ "learning_rate": 6.759633978769139e-05,
1815
+ "loss": 0.7052,
1816
+ "step": 1280
1817
+ },
1818
+ {
1819
+ "epoch": 1.4377622377622377,
1820
+ "grad_norm": 0.0966871184419301,
1821
+ "learning_rate": 6.63751520787374e-05,
1822
+ "loss": 0.7002,
1823
+ "step": 1285
1824
+ },
1825
+ {
1826
+ "epoch": 1.4433566433566434,
1827
+ "grad_norm": 0.09626897693462537,
1828
+ "learning_rate": 6.516195424291972e-05,
1829
+ "loss": 0.6912,
1830
+ "step": 1290
1831
+ },
1832
+ {
1833
+ "epoch": 1.4489510489510489,
1834
+ "grad_norm": 0.08145120964067751,
1835
+ "learning_rate": 6.395686219431232e-05,
1836
+ "loss": 0.6877,
1837
+ "step": 1295
1838
+ },
1839
+ {
1840
+ "epoch": 1.4545454545454546,
1841
+ "grad_norm": 0.08789566288442652,
1842
+ "learning_rate": 6.275999107252758e-05,
1843
+ "loss": 0.6847,
1844
+ "step": 1300
1845
+ },
1846
+ {
1847
+ "epoch": 1.46013986013986,
1848
+ "grad_norm": 0.0806679485875012,
1849
+ "learning_rate": 6.157145523171587e-05,
1850
+ "loss": 0.6869,
1851
+ "step": 1305
1852
+ },
1853
+ {
1854
+ "epoch": 1.4657342657342658,
1855
+ "grad_norm": 0.07942983212018809,
1856
+ "learning_rate": 6.039136822963924e-05,
1857
+ "loss": 0.6767,
1858
+ "step": 1310
1859
+ },
1860
+ {
1861
+ "epoch": 1.4713286713286713,
1862
+ "grad_norm": 0.08489122954397245,
1863
+ "learning_rate": 5.9219842816821796e-05,
1864
+ "loss": 0.6814,
1865
+ "step": 1315
1866
+ },
1867
+ {
1868
+ "epoch": 1.476923076923077,
1869
+ "grad_norm": 0.08653144402466852,
1870
+ "learning_rate": 5.805699092577722e-05,
1871
+ "loss": 0.6968,
1872
+ "step": 1320
1873
+ },
1874
+ {
1875
+ "epoch": 1.4825174825174825,
1876
+ "grad_norm": 0.08218462407837061,
1877
+ "learning_rate": 5.6902923660313855e-05,
1878
+ "loss": 0.6781,
1879
+ "step": 1325
1880
+ },
1881
+ {
1882
+ "epoch": 1.488111888111888,
1883
+ "grad_norm": 0.07605468398853794,
1884
+ "learning_rate": 5.5757751284919836e-05,
1885
+ "loss": 0.6837,
1886
+ "step": 1330
1887
+ },
1888
+ {
1889
+ "epoch": 1.4937062937062937,
1890
+ "grad_norm": 0.07347540817224318,
1891
+ "learning_rate": 5.462158321422751e-05,
1892
+ "loss": 0.678,
1893
+ "step": 1335
1894
+ },
1895
+ {
1896
+ "epoch": 1.4993006993006994,
1897
+ "grad_norm": 0.08136761352155132,
1898
+ "learning_rate": 5.34945280025599e-05,
1899
+ "loss": 0.6843,
1900
+ "step": 1340
1901
+ },
1902
+ {
1903
+ "epoch": 1.504895104895105,
1904
+ "grad_norm": 0.079793665685405,
1905
+ "learning_rate": 5.237669333355863e-05,
1906
+ "loss": 0.6919,
1907
+ "step": 1345
1908
+ },
1909
+ {
1910
+ "epoch": 1.5104895104895104,
1911
+ "grad_norm": 0.07721773411337415,
1912
+ "learning_rate": 5.126818600989557e-05,
1913
+ "loss": 0.6826,
1914
+ "step": 1350
1915
+ },
1916
+ {
1917
+ "epoch": 1.5160839160839161,
1918
+ "grad_norm": 0.07622519698946323,
1919
+ "learning_rate": 5.0169111943068374e-05,
1920
+ "loss": 0.6901,
1921
+ "step": 1355
1922
+ },
1923
+ {
1924
+ "epoch": 1.5216783216783218,
1925
+ "grad_norm": 0.07862095262493349,
1926
+ "learning_rate": 4.9079576143281326e-05,
1927
+ "loss": 0.691,
1928
+ "step": 1360
1929
+ },
1930
+ {
1931
+ "epoch": 1.5272727272727273,
1932
+ "grad_norm": 0.07949120411421966,
1933
+ "learning_rate": 4.7999682709412216e-05,
1934
+ "loss": 0.6806,
1935
+ "step": 1365
1936
+ },
1937
+ {
1938
+ "epoch": 1.5328671328671328,
1939
+ "grad_norm": 0.08530944602336236,
1940
+ "learning_rate": 4.692953481906605e-05,
1941
+ "loss": 0.6847,
1942
+ "step": 1370
1943
+ },
1944
+ {
1945
+ "epoch": 1.5384615384615383,
1946
+ "grad_norm": 0.08074798934800313,
1947
+ "learning_rate": 4.586923471871743e-05,
1948
+ "loss": 0.681,
1949
+ "step": 1375
1950
+ },
1951
+ {
1952
+ "epoch": 1.544055944055944,
1953
+ "grad_norm": 0.0777449518216711,
1954
+ "learning_rate": 4.481888371394115e-05,
1955
+ "loss": 0.6874,
1956
+ "step": 1380
1957
+ },
1958
+ {
1959
+ "epoch": 1.5496503496503498,
1960
+ "grad_norm": 0.07535968734459726,
1961
+ "learning_rate": 4.377858215973318e-05,
1962
+ "loss": 0.6751,
1963
+ "step": 1385
1964
+ },
1965
+ {
1966
+ "epoch": 1.5552447552447553,
1967
+ "grad_norm": 0.07188075306681382,
1968
+ "learning_rate": 4.2748429450922263e-05,
1969
+ "loss": 0.6745,
1970
+ "step": 1390
1971
+ },
1972
+ {
1973
+ "epoch": 1.5608391608391607,
1974
+ "grad_norm": 0.0781312708212868,
1975
+ "learning_rate": 4.172852401267347e-05,
1976
+ "loss": 0.688,
1977
+ "step": 1395
1978
+ },
1979
+ {
1980
+ "epoch": 1.5664335664335665,
1981
+ "grad_norm": 0.07521827314744757,
1982
+ "learning_rate": 4.0718963291084e-05,
1983
+ "loss": 0.6757,
1984
+ "step": 1400
1985
+ },
1986
+ {
1987
+ "epoch": 1.5720279720279722,
1988
+ "grad_norm": 0.07373545234599518,
1989
+ "learning_rate": 3.9719843743872964e-05,
1990
+ "loss": 0.6778,
1991
+ "step": 1405
1992
+ },
1993
+ {
1994
+ "epoch": 1.5776223776223777,
1995
+ "grad_norm": 0.07264270828709123,
1996
+ "learning_rate": 3.873126083116525e-05,
1997
+ "loss": 0.6864,
1998
+ "step": 1410
1999
+ },
2000
+ {
2001
+ "epoch": 1.5832167832167832,
2002
+ "grad_norm": 0.0763271584430483,
2003
+ "learning_rate": 3.775330900637108e-05,
2004
+ "loss": 0.683,
2005
+ "step": 1415
2006
+ },
2007
+ {
2008
+ "epoch": 1.5888111888111887,
2009
+ "grad_norm": 0.07981572030031307,
2010
+ "learning_rate": 3.678608170716117e-05,
2011
+ "loss": 0.6795,
2012
+ "step": 1420
2013
+ },
2014
+ {
2015
+ "epoch": 1.5944055944055944,
2016
+ "grad_norm": 0.0746896810608302,
2017
+ "learning_rate": 3.582967134653972e-05,
2018
+ "loss": 0.675,
2019
+ "step": 1425
2020
+ },
2021
+ {
2022
+ "epoch": 1.6,
2023
+ "grad_norm": 0.07449703423407898,
2024
+ "learning_rate": 3.488416930401457e-05,
2025
+ "loss": 0.6805,
2026
+ "step": 1430
2027
+ },
2028
+ {
2029
+ "epoch": 1.6055944055944056,
2030
+ "grad_norm": 0.07104343697673421,
2031
+ "learning_rate": 3.3949665916866466e-05,
2032
+ "loss": 0.6752,
2033
+ "step": 1435
2034
+ },
2035
+ {
2036
+ "epoch": 1.611188811188811,
2037
+ "grad_norm": 0.07498548014087274,
2038
+ "learning_rate": 3.302625047151807e-05,
2039
+ "loss": 0.6949,
2040
+ "step": 1440
2041
+ },
2042
+ {
2043
+ "epoch": 1.6167832167832168,
2044
+ "grad_norm": 0.07539631481012679,
2045
+ "learning_rate": 3.211401119500283e-05,
2046
+ "loss": 0.6892,
2047
+ "step": 1445
2048
+ },
2049
+ {
2050
+ "epoch": 1.6223776223776225,
2051
+ "grad_norm": 0.0765200306903301,
2052
+ "learning_rate": 3.12130352465357e-05,
2053
+ "loss": 0.6891,
2054
+ "step": 1450
2055
+ },
2056
+ {
2057
+ "epoch": 1.627972027972028,
2058
+ "grad_norm": 0.0730441214772004,
2059
+ "learning_rate": 3.032340870918527e-05,
2060
+ "loss": 0.6981,
2061
+ "step": 1455
2062
+ },
2063
+ {
2064
+ "epoch": 1.6335664335664335,
2065
+ "grad_norm": 0.08171074628055888,
2066
+ "learning_rate": 2.9445216581649384e-05,
2067
+ "loss": 0.6936,
2068
+ "step": 1460
2069
+ },
2070
+ {
2071
+ "epoch": 1.6391608391608392,
2072
+ "grad_norm": 0.083112396316528,
2073
+ "learning_rate": 2.8578542770133654e-05,
2074
+ "loss": 0.6737,
2075
+ "step": 1465
2076
+ },
2077
+ {
2078
+ "epoch": 1.6447552447552447,
2079
+ "grad_norm": 0.08252295941296424,
2080
+ "learning_rate": 2.772347008033492e-05,
2081
+ "loss": 0.701,
2082
+ "step": 1470
2083
+ },
2084
+ {
2085
+ "epoch": 1.6503496503496504,
2086
+ "grad_norm": 0.07835293344801121,
2087
+ "learning_rate": 2.688008020952952e-05,
2088
+ "loss": 0.6921,
2089
+ "step": 1475
2090
+ },
2091
+ {
2092
+ "epoch": 1.655944055944056,
2093
+ "grad_norm": 0.0778212904803543,
2094
+ "learning_rate": 2.6048453738767755e-05,
2095
+ "loss": 0.6764,
2096
+ "step": 1480
2097
+ },
2098
+ {
2099
+ "epoch": 1.6615384615384614,
2100
+ "grad_norm": 0.07182079524548696,
2101
+ "learning_rate": 2.5228670125174704e-05,
2102
+ "loss": 0.6841,
2103
+ "step": 1485
2104
+ },
2105
+ {
2106
+ "epoch": 1.6671328671328671,
2107
+ "grad_norm": 0.072274928421158,
2108
+ "learning_rate": 2.4420807694358468e-05,
2109
+ "loss": 0.6823,
2110
+ "step": 1490
2111
+ },
2112
+ {
2113
+ "epoch": 1.6727272727272728,
2114
+ "grad_norm": 0.07124500713293849,
2115
+ "learning_rate": 2.3624943632926853e-05,
2116
+ "loss": 0.6816,
2117
+ "step": 1495
2118
+ },
2119
+ {
2120
+ "epoch": 1.6783216783216783,
2121
+ "grad_norm": 0.06940723884896695,
2122
+ "learning_rate": 2.2841153981112397e-05,
2123
+ "loss": 0.6805,
2124
+ "step": 1500
2125
+ },
2126
+ {
2127
+ "epoch": 1.6839160839160838,
2128
+ "grad_norm": 0.07325019748524,
2129
+ "learning_rate": 2.20695136255073e-05,
2130
+ "loss": 0.6614,
2131
+ "step": 1505
2132
+ },
2133
+ {
2134
+ "epoch": 1.6895104895104895,
2135
+ "grad_norm": 0.0727025067676513,
2136
+ "learning_rate": 2.1310096291908347e-05,
2137
+ "loss": 0.6851,
2138
+ "step": 1510
2139
+ },
2140
+ {
2141
+ "epoch": 1.6951048951048953,
2142
+ "grad_norm": 0.07406758207149883,
2143
+ "learning_rate": 2.0562974538273024e-05,
2144
+ "loss": 0.6978,
2145
+ "step": 1515
2146
+ },
2147
+ {
2148
+ "epoch": 1.7006993006993008,
2149
+ "grad_norm": 0.0684645890113912,
2150
+ "learning_rate": 1.9828219747786733e-05,
2151
+ "loss": 0.6814,
2152
+ "step": 1520
2153
+ },
2154
+ {
2155
+ "epoch": 1.7062937062937062,
2156
+ "grad_norm": 0.07103130256598,
2157
+ "learning_rate": 1.910590212204281e-05,
2158
+ "loss": 0.6955,
2159
+ "step": 1525
2160
+ },
2161
+ {
2162
+ "epoch": 1.7118881118881117,
2163
+ "grad_norm": 0.08660646823989741,
2164
+ "learning_rate": 1.839609067433495e-05,
2165
+ "loss": 0.6768,
2166
+ "step": 1530
2167
+ },
2168
+ {
2169
+ "epoch": 1.7174825174825175,
2170
+ "grad_norm": 0.07000885068796428,
2171
+ "learning_rate": 1.7698853223063554e-05,
2172
+ "loss": 0.6814,
2173
+ "step": 1535
2174
+ },
2175
+ {
2176
+ "epoch": 1.7230769230769232,
2177
+ "grad_norm": 0.07227717547773495,
2178
+ "learning_rate": 1.701425638525601e-05,
2179
+ "loss": 0.6863,
2180
+ "step": 1540
2181
+ },
2182
+ {
2183
+ "epoch": 1.7286713286713287,
2184
+ "grad_norm": 0.08472297754432541,
2185
+ "learning_rate": 1.634236557020174e-05,
2186
+ "loss": 0.6739,
2187
+ "step": 1545
2188
+ },
2189
+ {
2190
+ "epoch": 1.7342657342657342,
2191
+ "grad_norm": 0.07635635084022904,
2192
+ "learning_rate": 1.5683244973202848e-05,
2193
+ "loss": 0.6849,
2194
+ "step": 1550
2195
+ },
2196
+ {
2197
+ "epoch": 1.7398601398601399,
2198
+ "grad_norm": 0.0733376716964744,
2199
+ "learning_rate": 1.5036957569440488e-05,
2200
+ "loss": 0.6736,
2201
+ "step": 1555
2202
+ },
2203
+ {
2204
+ "epoch": 1.7454545454545456,
2205
+ "grad_norm": 0.06984335423469197,
2206
+ "learning_rate": 1.4403565107958142e-05,
2207
+ "loss": 0.6801,
2208
+ "step": 1560
2209
+ },
2210
+ {
2211
+ "epoch": 1.751048951048951,
2212
+ "grad_norm": 0.07299591046039727,
2213
+ "learning_rate": 1.3783128105761649e-05,
2214
+ "loss": 0.6703,
2215
+ "step": 1565
2216
+ },
2217
+ {
2218
+ "epoch": 1.7566433566433566,
2219
+ "grad_norm": 0.06856518432971286,
2220
+ "learning_rate": 1.3175705842037332e-05,
2221
+ "loss": 0.6811,
2222
+ "step": 1570
2223
+ },
2224
+ {
2225
+ "epoch": 1.762237762237762,
2226
+ "grad_norm": 0.06892528407637724,
2227
+ "learning_rate": 1.2581356352488003e-05,
2228
+ "loss": 0.6895,
2229
+ "step": 1575
2230
+ },
2231
+ {
2232
+ "epoch": 1.7678321678321678,
2233
+ "grad_norm": 0.0731333956161155,
2234
+ "learning_rate": 1.2000136423788226e-05,
2235
+ "loss": 0.6947,
2236
+ "step": 1580
2237
+ },
2238
+ {
2239
+ "epoch": 1.7734265734265735,
2240
+ "grad_norm": 0.07510563722528918,
2241
+ "learning_rate": 1.1432101588158487e-05,
2242
+ "loss": 0.6782,
2243
+ "step": 1585
2244
+ },
2245
+ {
2246
+ "epoch": 1.779020979020979,
2247
+ "grad_norm": 0.07335149840877105,
2248
+ "learning_rate": 1.0877306118059498e-05,
2249
+ "loss": 0.6832,
2250
+ "step": 1590
2251
+ },
2252
+ {
2253
+ "epoch": 1.7846153846153845,
2254
+ "grad_norm": 0.07016146300305476,
2255
+ "learning_rate": 1.0335803021006783e-05,
2256
+ "loss": 0.6735,
2257
+ "step": 1595
2258
+ },
2259
+ {
2260
+ "epoch": 1.7902097902097902,
2261
+ "grad_norm": 0.07110506905460164,
2262
+ "learning_rate": 9.807644034506024e-06,
2263
+ "loss": 0.6933,
2264
+ "step": 1600
2265
+ },
2266
+ {
2267
+ "epoch": 1.795804195804196,
2268
+ "grad_norm": 0.07366692101954259,
2269
+ "learning_rate": 9.292879621110022e-06,
2270
+ "loss": 0.6775,
2271
+ "step": 1605
2272
+ },
2273
+ {
2274
+ "epoch": 1.8013986013986014,
2275
+ "grad_norm": 0.07086756664676215,
2276
+ "learning_rate": 8.791558963597045e-06,
2277
+ "loss": 0.6847,
2278
+ "step": 1610
2279
+ },
2280
+ {
2281
+ "epoch": 1.806993006993007,
2282
+ "grad_norm": 0.0704179943193231,
2283
+ "learning_rate": 8.30372996027195e-06,
2284
+ "loss": 0.6802,
2285
+ "step": 1615
2286
+ },
2287
+ {
2288
+ "epoch": 1.8125874125874126,
2289
+ "grad_norm": 0.07178710401033903,
2290
+ "learning_rate": 7.829439220389521e-06,
2291
+ "loss": 0.6892,
2292
+ "step": 1620
2293
+ },
2294
+ {
2295
+ "epoch": 1.8181818181818183,
2296
+ "grad_norm": 0.07500788961567526,
2297
+ "learning_rate": 7.368732059701499e-06,
2298
+ "loss": 0.6822,
2299
+ "step": 1625
2300
+ },
2301
+ {
2302
+ "epoch": 1.8237762237762238,
2303
+ "grad_norm": 0.07030773912011665,
2304
+ "learning_rate": 6.921652496126623e-06,
2305
+ "loss": 0.6749,
2306
+ "step": 1630
2307
+ },
2308
+ {
2309
+ "epoch": 1.8293706293706293,
2310
+ "grad_norm": 0.07046870723595076,
2311
+ "learning_rate": 6.4882432455452606e-06,
2312
+ "loss": 0.6748,
2313
+ "step": 1635
2314
+ },
2315
+ {
2316
+ "epoch": 1.8349650349650348,
2317
+ "grad_norm": 0.07117585543308619,
2318
+ "learning_rate": 6.068545717717916e-06,
2319
+ "loss": 0.6828,
2320
+ "step": 1640
2321
+ },
2322
+ {
2323
+ "epoch": 1.8405594405594405,
2324
+ "grad_norm": 0.06915527262243856,
2325
+ "learning_rate": 5.662600012328944e-06,
2326
+ "loss": 0.6883,
2327
+ "step": 1645
2328
+ },
2329
+ {
2330
+ "epoch": 1.8461538461538463,
2331
+ "grad_norm": 0.06677186359807996,
2332
+ "learning_rate": 5.27044491515512e-06,
2333
+ "loss": 0.6701,
2334
+ "step": 1650
2335
+ },
2336
+ {
2337
+ "epoch": 1.8517482517482518,
2338
+ "grad_norm": 0.06975135811394965,
2339
+ "learning_rate": 4.892117894359981e-06,
2340
+ "loss": 0.6896,
2341
+ "step": 1655
2342
+ },
2343
+ {
2344
+ "epoch": 1.8573426573426572,
2345
+ "grad_norm": 0.06646442932877118,
2346
+ "learning_rate": 4.527655096913913e-06,
2347
+ "loss": 0.6736,
2348
+ "step": 1660
2349
+ },
2350
+ {
2351
+ "epoch": 1.862937062937063,
2352
+ "grad_norm": 0.0696338374277028,
2353
+ "learning_rate": 4.177091345140488e-06,
2354
+ "loss": 0.6824,
2355
+ "step": 1665
2356
+ },
2357
+ {
2358
+ "epoch": 1.8685314685314687,
2359
+ "grad_norm": 0.06913237278308428,
2360
+ "learning_rate": 3.840460133389434e-06,
2361
+ "loss": 0.6708,
2362
+ "step": 1670
2363
+ },
2364
+ {
2365
+ "epoch": 1.8741258741258742,
2366
+ "grad_norm": 0.07472788742872244,
2367
+ "learning_rate": 3.5177936248364236e-06,
2368
+ "loss": 0.6843,
2369
+ "step": 1675
2370
+ },
2371
+ {
2372
+ "epoch": 1.8797202797202797,
2373
+ "grad_norm": 0.07149423452101296,
2374
+ "learning_rate": 3.2091226484101506e-06,
2375
+ "loss": 0.6716,
2376
+ "step": 1680
2377
+ },
2378
+ {
2379
+ "epoch": 1.8853146853146852,
2380
+ "grad_norm": 0.0676192443302248,
2381
+ "learning_rate": 2.9144766958466014e-06,
2382
+ "loss": 0.6816,
2383
+ "step": 1685
2384
+ },
2385
+ {
2386
+ "epoch": 1.8909090909090909,
2387
+ "grad_norm": 0.07125330690039695,
2388
+ "learning_rate": 2.6338839188715433e-06,
2389
+ "loss": 0.686,
2390
+ "step": 1690
2391
+ },
2392
+ {
2393
+ "epoch": 1.8965034965034966,
2394
+ "grad_norm": 0.07150788853257964,
2395
+ "learning_rate": 2.3673711265105754e-06,
2396
+ "loss": 0.6845,
2397
+ "step": 1695
2398
+ },
2399
+ {
2400
+ "epoch": 1.902097902097902,
2401
+ "grad_norm": 0.06921602353027008,
2402
+ "learning_rate": 2.1149637825277953e-06,
2403
+ "loss": 0.6851,
2404
+ "step": 1700
2405
+ },
2406
+ {
2407
+ "epoch": 1.9076923076923076,
2408
+ "grad_norm": 0.07103009932661779,
2409
+ "learning_rate": 1.876686002992861e-06,
2410
+ "loss": 0.6879,
2411
+ "step": 1705
2412
+ },
2413
+ {
2414
+ "epoch": 1.9132867132867133,
2415
+ "grad_norm": 0.07078839389394587,
2416
+ "learning_rate": 1.6525605539768173e-06,
2417
+ "loss": 0.6842,
2418
+ "step": 1710
2419
+ },
2420
+ {
2421
+ "epoch": 1.918881118881119,
2422
+ "grad_norm": 0.0677463790575426,
2423
+ "learning_rate": 1.4426088493769695e-06,
2424
+ "loss": 0.6822,
2425
+ "step": 1715
2426
+ },
2427
+ {
2428
+ "epoch": 1.9244755244755245,
2429
+ "grad_norm": 0.0689426207416468,
2430
+ "learning_rate": 1.2468509488708534e-06,
2431
+ "loss": 0.671,
2432
+ "step": 1720
2433
+ },
2434
+ {
2435
+ "epoch": 1.93006993006993,
2436
+ "grad_norm": 0.06820108494343277,
2437
+ "learning_rate": 1.0653055559997014e-06,
2438
+ "loss": 0.6775,
2439
+ "step": 1725
2440
+ },
2441
+ {
2442
+ "epoch": 1.9356643356643357,
2443
+ "grad_norm": 0.06821597918112166,
2444
+ "learning_rate": 8.979900163813891e-07,
2445
+ "loss": 0.6701,
2446
+ "step": 1730
2447
+ },
2448
+ {
2449
+ "epoch": 1.9412587412587412,
2450
+ "grad_norm": 0.06837657131866866,
2451
+ "learning_rate": 7.449203160532102e-07,
2452
+ "loss": 0.6725,
2453
+ "step": 1735
2454
+ },
2455
+ {
2456
+ "epoch": 1.946853146853147,
2457
+ "grad_norm": 0.07000431064997485,
2458
+ "learning_rate": 6.061110799443991e-07,
2459
+ "loss": 0.6709,
2460
+ "step": 1740
2461
+ },
2462
+ {
2463
+ "epoch": 1.9524475524475524,
2464
+ "grad_norm": 0.07108137280182684,
2465
+ "learning_rate": 4.815755704789481e-07,
2466
+ "loss": 0.6846,
2467
+ "step": 1745
2468
+ },
2469
+ {
2470
+ "epoch": 1.958041958041958,
2471
+ "grad_norm": 0.07117415645327746,
2472
+ "learning_rate": 3.7132568630833804e-07,
2473
+ "loss": 0.6865,
2474
+ "step": 1750
2475
+ },
2476
+ {
2477
+ "epoch": 1.9636363636363636,
2478
+ "grad_norm": 0.06606487210968705,
2479
+ "learning_rate": 2.753719611747474e-07,
2480
+ "loss": 0.6736,
2481
+ "step": 1755
2482
+ },
2483
+ {
2484
+ "epoch": 1.9692307692307693,
2485
+ "grad_norm": 0.06965546022369541,
2486
+ "learning_rate": 1.9372356290460744e-07,
2487
+ "loss": 0.7023,
2488
+ "step": 1760
2489
+ },
2490
+ {
2491
+ "epoch": 1.9748251748251748,
2492
+ "grad_norm": 0.06925022116566176,
2493
+ "learning_rate": 1.2638829253265316e-07,
2494
+ "loss": 0.6665,
2495
+ "step": 1765
2496
+ },
2497
+ {
2498
+ "epoch": 1.9804195804195803,
2499
+ "grad_norm": 0.06743940727744605,
2500
+ "learning_rate": 7.337258355660236e-08,
2501
+ "loss": 0.6704,
2502
+ "step": 1770
2503
+ },
2504
+ {
2505
+ "epoch": 1.986013986013986,
2506
+ "grad_norm": 0.06697392386506328,
2507
+ "learning_rate": 3.4681501322464386e-08,
2508
+ "loss": 0.6703,
2509
+ "step": 1775
2510
+ },
2511
+ {
2512
+ "epoch": 1.9916083916083918,
2513
+ "grad_norm": 0.06864360169538268,
2514
+ "learning_rate": 1.0318742540560421e-08,
2515
+ "loss": 0.6753,
2516
+ "step": 1780
2517
+ },
2518
+ {
2519
+ "epoch": 1.9972027972027973,
2520
+ "grad_norm": 0.06821105929452137,
2521
+ "learning_rate": 2.8663493232272684e-10,
2522
+ "loss": 0.6705,
2523
+ "step": 1785
2524
+ },
2525
+ {
2526
+ "epoch": 1.9983216783216782,
2527
+ "eval_loss": 1.0629972219467163,
2528
+ "eval_runtime": 366.656,
2529
+ "eval_samples_per_second": 56.153,
2530
+ "eval_steps_per_second": 1.756,
2531
+ "step": 1786
2532
+ },
2533
+ {
2534
+ "epoch": 1.9983216783216782,
2535
+ "step": 1786,
2536
+ "total_flos": 1127451463778304.0,
2537
+ "train_loss": 0.777597175032935,
2538
+ "train_runtime": 16051.5455,
2539
+ "train_samples_per_second": 14.253,
2540
+ "train_steps_per_second": 0.111
2541
+ }
2542
+ ],
2543
+ "logging_steps": 5,
2544
+ "max_steps": 1786,
2545
+ "num_input_tokens_seen": 0,
2546
+ "num_train_epochs": 2,
2547
+ "save_steps": 500,
2548
+ "stateful_callbacks": {
2549
+ "TrainerControl": {
2550
+ "args": {
2551
+ "should_epoch_stop": false,
2552
+ "should_evaluate": false,
2553
+ "should_log": false,
2554
+ "should_save": false,
2555
+ "should_training_stop": false
2556
+ },
2557
+ "attributes": {}
2558
+ }
2559
+ },
2560
+ "total_flos": 1127451463778304.0,
2561
+ "train_batch_size": 4,
2562
+ "trial_name": null,
2563
+ "trial_params": null
2564
+ }
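The entries above are the tail of the Trainer's state log: per-step records of grad_norm, learning_rate, and loss every 5 steps ("logging_steps": 5), followed by the epoch-2 evaluation summary and the final train summary. Below is a minimal sketch for plotting the loss and learning-rate curves from this log, assuming the JSON has been downloaded locally under the Trainer's default name trainer_state.json (the file name and local path are assumptions, not part of this commit):

```python
import json
import matplotlib.pyplot as plt

# Assumption: the JSON shown above is available locally as trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step training logs; the eval and train summary entries
# at the end use eval_loss / train_loss and carry no learning_rate field.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
lr = [e["learning_rate"] for e in logs]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, loss)
ax1.set(xlabel="step", ylabel="training loss")
ax2.plot(steps, lr)
ax2.set(xlabel="step", ylabel="learning rate")
fig.tight_layout()
fig.savefig("training_curves.png")
```

The filter drops the two closing entries (eval_loss 1.0630 at step 1786 and the aggregate train_loss 0.7776), since they summarize the run as a whole rather than a logging step.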
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:290d7ecc3e2f19388b5c15b1a5cf555114883a038f5dc255f5173455bbbea8a2
3
+ size 6392
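training_args.bin is tracked through Git LFS (the pointer above records only its oid and 6392-byte size); the actual file is the TrainingArguments object the Trainer serializes with torch.save alongside its other outputs. A hedged sketch for inspecting it after pulling the real file from LFS follows; unpickling runs arbitrary code, so only do this for repositories you trust, and the exact attribute set depends on the transformers/trl versions used for the run:

```python
import torch

# Assumption: training_args.bin has been fetched from Git LFS into the working directory.
# torch.load unpickles a full Python object, so only load files from trusted sources.
args = torch.load("training_args.bin", weights_only=False)

print(type(args).__name__)                  # TrainingArguments or a TRL subclass
print(args.learning_rate)                   # per-run hyperparameters
print(args.per_device_train_batch_size,
      args.gradient_accumulation_steps)
```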
vocab.json ADDED
The diff for this file is too large to render. See raw diff