Upload 11 files
- .gitattributes +1 -0
- config.json +51 -0
- model.safetensors +3 -0
- optimizer.pt +3 -0
- rng_state.pth +3 -0
- scheduler.pt +3 -0
- special_tokens_map.json +15 -0
- spiece.model +3 -0
- tokenizer.json +3 -0
- tokenizer_config.json +59 -0
- trainer_state.json +2250 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "architectures": [
+    "AlbertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0,
+  "bos_token_id": 2,
+  "classifier_dropout_prob": 0.1,
+  "down_scale_factor": 1,
+  "embedding_size": 128,
+  "eos_token_id": 3,
+  "gap_size": 0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6"
+  },
+  "initializer_range": 0.02,
+  "inner_group_num": 1,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3,
+    "LABEL_4": 4,
+    "LABEL_5": 5,
+    "LABEL_6": 6
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "albert",
+  "net_structure_type": 0,
+  "num_attention_heads": 12,
+  "num_hidden_groups": 1,
+  "num_hidden_layers": 12,
+  "num_memory_blocks": 0,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 2,
+  "vocab_size": 200000
+}
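The config describes a 12-layer ALBERT encoder fine-tuned as a 7-way sequence classifier. A minimal usage sketch, assuming transformers >= 4.51.3 and that the files from this commit sit in the current directory (the path and input text are placeholders, not documented usage from this repo):

# Minimal sketch; directory layout is an assumption.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_dir = "."  # directory holding config.json, model.safetensors, spiece.model, ...
tokenizer = AutoTokenizer.from_pretrained(repo_dir)
model = AutoModelForSequenceClassification.from_pretrained(repo_dir)

inputs = tokenizer("example input", return_tensors="pt")
logits = model(**inputs).logits      # shape (1, 7): one score per LABEL_0..LABEL_6
pred = int(logits.argmax(dim=-1))
print(model.config.id2label[pred])   # generic labels; real class names are not given here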
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f93f7b4555e056c35a7d4748796173d46540140b6e7a85ec393676a44b7618
+size 133799332
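This is a Git LFS pointer, not the weights themselves: the actual ~134 MB model.safetensors is stored out of band and identified by its SHA-256. A minimal sketch for checking a downloaded file against the pointer above (file name assumed local):

import hashlib, os

def sha256_of(path, chunk_size=1 << 20):
    # Stream in chunks so large weight files need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid and size copied from the pointer lines above
assert os.path.getsize("model.safetensors") == 133799332
assert sha256_of("model.safetensors") == (
    "e7f93f7b4555e056c35a7d4748796173d46540140b6e7a85ec393676a44b7618"
)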
optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d11a1ee19953ce093f706e2e782887ccb11deaf230e1593090c767f12f73497c
+size 267615198
rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0eaec201f0f27ed5d1bdbd712de60e83e30528c37214b3426905fac408516111
+size 14645
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc918f895ce80a4a3b539adc765848b50a825b2f2e16d95d972bcb3a66764a12
+size 1465
special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "[SEP]",
+  "unk_token": "<unk>"
+}
spiece.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a1173c2b6e144a02c001e289a05b5dbefddf247c50d4dcf42633158b2968fcb
+size 5646064
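spiece.model (another LFS pointer above) is the SentencePiece model behind the tokenizer; its piece count should line up with "vocab_size": 200000 in config.json. A minimal inspection sketch with the sentencepiece package, assuming a local copy of the file:

import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spiece.model")
print(sp.get_piece_size())                      # expected 200000, per config.json
print(sp.encode("example text", out_type=str))  # subword pieces, e.g. ['▁example', ...]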
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1dde479dd4e493c0bea2cdefa97a961e84ba598a35311c44c4a8b1f56482220
+size 15285941
tokenizer_config.json
ADDED
@@ -0,0 +1,59 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "keep_accents": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "remove_space": true,
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "AlbertTokenizer",
+  "unk_token": "<unk>"
+}
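Per "tokenizer_class" above, these files are consumed by AlbertTokenizer. A minimal sketch (local directory assumed) showing the special-token wiring from special_tokens_map.json and added_tokens_decoder:

from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained(".")  # needs spiece.model plus these JSON files
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.unk_token, tok.mask_token)
# -> [CLS] [SEP] <pad> <unk> [MASK]
ids = tok("hello")["input_ids"]
print(ids[0], ids[-1])  # 2 and 3: the [CLS]/[SEP] ids from added_tokens_decoder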
trainer_state.json
ADDED
@@ -0,0 +1,2250 @@
+{
+  "best_global_step": 5760,
+  "best_metric": 0.05086889490485191,
+  "best_model_checkpoint": "./results/checkpoint-5760",
+  "epoch": 25.0,
+  "eval_steps": 500,
+  "global_step": 14400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08680555555555555,
+      "grad_norm": 3.413226842880249,
+      "learning_rate": 1.9931944444444447e-05,
+      "loss": 1.8426,
+      "step": 50
+    },
+    {
+      "epoch": 0.1736111111111111,
+      "grad_norm": 19.604671478271484,
+      "learning_rate": 1.98625e-05,
+      "loss": 1.2984,
+      "step": 100
+    },
+    {
+      "epoch": 0.2604166666666667,
+      "grad_norm": 73.20538330078125,
+      "learning_rate": 1.979305555555556e-05,
+      "loss": 0.9999,
+      "step": 150
+    },
+    {
+      "epoch": 0.3472222222222222,
+      "grad_norm": 4.863669395446777,
+      "learning_rate": 1.972361111111111e-05,
+      "loss": 0.8302,
+      "step": 200
+    },
+    {
+      "epoch": 0.4340277777777778,
+      "grad_norm": 17.846479415893555,
+      "learning_rate": 1.965416666666667e-05,
+      "loss": 0.6309,
+      "step": 250
+    },
+    {
+      "epoch": 0.5208333333333334,
+      "grad_norm": 11.574165344238281,
+      "learning_rate": 1.9584722222222224e-05,
+      "loss": 0.4987,
+      "step": 300
+    },
+    {
+      "epoch": 0.6076388888888888,
+      "grad_norm": 31.07317352294922,
+      "learning_rate": 1.9515277777777778e-05,
+      "loss": 0.3274,
+      "step": 350
+    },
+    {
+      "epoch": 0.6944444444444444,
+      "grad_norm": 3.164802074432373,
+      "learning_rate": 1.9445833333333336e-05,
+      "loss": 0.3305,
+      "step": 400
+    },
+    {
+      "epoch": 0.78125,
+      "grad_norm": 2.5798959732055664,
+      "learning_rate": 1.937638888888889e-05,
+      "loss": 0.2624,
+      "step": 450
+    },
+    {
+      "epoch": 0.8680555555555556,
+      "grad_norm": 12.735908508300781,
+      "learning_rate": 1.9306944444444445e-05,
+      "loss": 0.3035,
+      "step": 500
+    },
+    {
+      "epoch": 0.9548611111111112,
+      "grad_norm": 17.255126953125,
+      "learning_rate": 1.9237500000000003e-05,
+      "loss": 0.2323,
+      "step": 550
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.17769542336463928,
+      "eval_runtime": 2.138,
+      "eval_samples_per_second": 478.963,
+      "eval_steps_per_second": 29.935,
+      "step": 576
+    },
+    {
+      "epoch": 1.0416666666666667,
+      "grad_norm": 32.9837646484375,
+      "learning_rate": 1.9168055555555558e-05,
+      "loss": 0.1779,
+      "step": 600
+    },
+    {
+      "epoch": 1.1284722222222223,
+      "grad_norm": 3.967150926589966,
+      "learning_rate": 1.9098611111111113e-05,
+      "loss": 0.145,
+      "step": 650
+    },
+    {
+      "epoch": 1.2152777777777777,
+      "grad_norm": 12.614818572998047,
+      "learning_rate": 1.9029166666666667e-05,
+      "loss": 0.1287,
+      "step": 700
+    },
+    {
+      "epoch": 1.3020833333333333,
+      "grad_norm": 26.14882469177246,
+      "learning_rate": 1.8959722222222222e-05,
+      "loss": 0.1397,
+      "step": 750
+    },
+    {
+      "epoch": 1.3888888888888888,
+      "grad_norm": 39.56580352783203,
+      "learning_rate": 1.889027777777778e-05,
+      "loss": 0.1194,
+      "step": 800
+    },
+    {
+      "epoch": 1.4756944444444444,
+      "grad_norm": 26.76836395263672,
+      "learning_rate": 1.8820833333333335e-05,
+      "loss": 0.1274,
+      "step": 850
+    },
+    {
+      "epoch": 1.5625,
+      "grad_norm": 26.581575393676758,
+      "learning_rate": 1.875138888888889e-05,
+      "loss": 0.1633,
+      "step": 900
+    },
+    {
+      "epoch": 1.6493055555555556,
+      "grad_norm": 0.08225402981042862,
+      "learning_rate": 1.8681944444444447e-05,
+      "loss": 0.0979,
+      "step": 950
+    },
+    {
+      "epoch": 1.7361111111111112,
+      "grad_norm": 0.38262873888015747,
+      "learning_rate": 1.8612500000000002e-05,
+      "loss": 0.1227,
+      "step": 1000
+    },
+    {
+      "epoch": 1.8229166666666665,
+      "grad_norm": 1.3281371593475342,
+      "learning_rate": 1.8543055555555556e-05,
+      "loss": 0.0741,
+      "step": 1050
+    },
+    {
+      "epoch": 1.9097222222222223,
+      "grad_norm": 48.16311264038086,
+      "learning_rate": 1.847361111111111e-05,
+      "loss": 0.1376,
+      "step": 1100
+    },
+    {
+      "epoch": 1.9965277777777777,
+      "grad_norm": 0.10062026977539062,
+      "learning_rate": 1.840416666666667e-05,
+      "loss": 0.1454,
+      "step": 1150
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.08461333066225052,
+      "eval_runtime": 2.1421,
+      "eval_samples_per_second": 478.037,
+      "eval_steps_per_second": 29.877,
+      "step": 1152
+    },
+    {
+      "epoch": 2.0833333333333335,
+      "grad_norm": 0.03965457156300545,
+      "learning_rate": 1.8334722222222224e-05,
+      "loss": 0.0425,
+      "step": 1200
+    },
+    {
+      "epoch": 2.170138888888889,
+      "grad_norm": 0.06563384085893631,
+      "learning_rate": 1.8265277777777778e-05,
+      "loss": 0.0516,
+      "step": 1250
+    },
+    {
+      "epoch": 2.2569444444444446,
+      "grad_norm": 1.1895874738693237,
+      "learning_rate": 1.8195833333333336e-05,
+      "loss": 0.0601,
+      "step": 1300
+    },
+    {
+      "epoch": 2.34375,
+      "grad_norm": 0.5909057259559631,
+      "learning_rate": 1.812638888888889e-05,
+      "loss": 0.0484,
+      "step": 1350
+    },
+    {
+      "epoch": 2.4305555555555554,
+      "grad_norm": 0.06617053598165512,
+      "learning_rate": 1.8056944444444446e-05,
+      "loss": 0.093,
+      "step": 1400
+    },
+    {
+      "epoch": 2.517361111111111,
+      "grad_norm": 0.03617614507675171,
+      "learning_rate": 1.7987500000000004e-05,
+      "loss": 0.0812,
+      "step": 1450
+    },
+    {
+      "epoch": 2.6041666666666665,
+      "grad_norm": 1.4127732515335083,
+      "learning_rate": 1.7918055555555558e-05,
+      "loss": 0.0834,
+      "step": 1500
+    },
+    {
+      "epoch": 2.6909722222222223,
+      "grad_norm": 0.03689781203866005,
+      "learning_rate": 1.7848611111111113e-05,
+      "loss": 0.0581,
+      "step": 1550
+    },
+    {
+      "epoch": 2.7777777777777777,
+      "grad_norm": 82.96894073486328,
+      "learning_rate": 1.7779166666666667e-05,
+      "loss": 0.0607,
+      "step": 1600
+    },
+    {
+      "epoch": 2.8645833333333335,
+      "grad_norm": 20.012392044067383,
+      "learning_rate": 1.7709722222222222e-05,
+      "loss": 0.0886,
+      "step": 1650
+    },
+    {
+      "epoch": 2.951388888888889,
+      "grad_norm": 0.08668874949216843,
+      "learning_rate": 1.764027777777778e-05,
+      "loss": 0.1223,
+      "step": 1700
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.08079180121421814,
+      "eval_runtime": 2.156,
+      "eval_samples_per_second": 474.96,
+      "eval_steps_per_second": 29.685,
+      "step": 1728
+    },
+    {
+      "epoch": 3.0381944444444446,
+      "grad_norm": 0.5279293656349182,
+      "learning_rate": 1.7570833333333335e-05,
+      "loss": 0.0713,
+      "step": 1750
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": 31.233030319213867,
+      "learning_rate": 1.750138888888889e-05,
+      "loss": 0.0746,
+      "step": 1800
+    },
+    {
+      "epoch": 3.2118055555555554,
+      "grad_norm": 0.0305875763297081,
+      "learning_rate": 1.7431944444444447e-05,
+      "loss": 0.0742,
+      "step": 1850
+    },
+    {
+      "epoch": 3.298611111111111,
+      "grad_norm": 43.64688491821289,
+      "learning_rate": 1.7362500000000002e-05,
+      "loss": 0.0365,
+      "step": 1900
+    },
+    {
+      "epoch": 3.3854166666666665,
+      "grad_norm": 0.034333955496549606,
+      "learning_rate": 1.7293055555555557e-05,
+      "loss": 0.058,
+      "step": 1950
+    },
+    {
+      "epoch": 3.4722222222222223,
+      "grad_norm": 86.51885986328125,
+      "learning_rate": 1.722361111111111e-05,
+      "loss": 0.0559,
+      "step": 2000
+    },
+    {
+      "epoch": 3.5590277777777777,
+      "grad_norm": 71.18854522705078,
+      "learning_rate": 1.7154166666666666e-05,
+      "loss": 0.0511,
+      "step": 2050
+    },
+    {
+      "epoch": 3.6458333333333335,
+      "grad_norm": 37.69118881225586,
+      "learning_rate": 1.7084722222222224e-05,
+      "loss": 0.0393,
+      "step": 2100
+    },
+    {
+      "epoch": 3.732638888888889,
+      "grad_norm": 0.011939575895667076,
+      "learning_rate": 1.701527777777778e-05,
+      "loss": 0.0394,
+      "step": 2150
+    },
+    {
+      "epoch": 3.8194444444444446,
+      "grad_norm": 0.23084449768066406,
+      "learning_rate": 1.6945833333333333e-05,
+      "loss": 0.02,
+      "step": 2200
+    },
+    {
+      "epoch": 3.90625,
+      "grad_norm": 0.22158057987689972,
+      "learning_rate": 1.687638888888889e-05,
+      "loss": 0.0248,
+      "step": 2250
+    },
+    {
+      "epoch": 3.9930555555555554,
+      "grad_norm": 0.06880240887403488,
+      "learning_rate": 1.6806944444444446e-05,
+      "loss": 0.054,
+      "step": 2300
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.07962702214717865,
+      "eval_runtime": 2.1469,
+      "eval_samples_per_second": 476.958,
+      "eval_steps_per_second": 29.81,
+      "step": 2304
+    },
+    {
+      "epoch": 4.079861111111111,
+      "grad_norm": 0.0237971693277359,
+      "learning_rate": 1.6737500000000004e-05,
+      "loss": 0.0151,
+      "step": 2350
+    },
+    {
+      "epoch": 4.166666666666667,
+      "grad_norm": 0.012101912871003151,
+      "learning_rate": 1.6668055555555558e-05,
+      "loss": 0.0195,
+      "step": 2400
+    },
+    {
+      "epoch": 4.253472222222222,
+      "grad_norm": 0.010883960872888565,
+      "learning_rate": 1.6598611111111113e-05,
+      "loss": 0.0415,
+      "step": 2450
+    },
+    {
+      "epoch": 4.340277777777778,
+      "grad_norm": 0.021902142092585564,
+      "learning_rate": 1.6529166666666668e-05,
+      "loss": 0.0426,
+      "step": 2500
+    },
+    {
+      "epoch": 4.427083333333333,
+      "grad_norm": 0.04989313334226608,
+      "learning_rate": 1.6459722222222222e-05,
+      "loss": 0.0136,
+      "step": 2550
+    },
+    {
+      "epoch": 4.513888888888889,
+      "grad_norm": 0.22097109258174896,
+      "learning_rate": 1.639027777777778e-05,
+      "loss": 0.0465,
+      "step": 2600
+    },
+    {
+      "epoch": 4.600694444444445,
+      "grad_norm": 0.010039995424449444,
+      "learning_rate": 1.6320833333333335e-05,
+      "loss": 0.028,
+      "step": 2650
+    },
+    {
+      "epoch": 4.6875,
+      "grad_norm": 0.013202900998294353,
+      "learning_rate": 1.625138888888889e-05,
+      "loss": 0.0461,
+      "step": 2700
+    },
+    {
+      "epoch": 4.774305555555555,
+      "grad_norm": 0.015310668386518955,
+      "learning_rate": 1.6181944444444447e-05,
+      "loss": 0.0278,
+      "step": 2750
+    },
+    {
+      "epoch": 4.861111111111111,
+      "grad_norm": 0.008756318129599094,
+      "learning_rate": 1.6112500000000002e-05,
+      "loss": 0.0218,
+      "step": 2800
+    },
+    {
+      "epoch": 4.947916666666667,
+      "grad_norm": 0.015537718310952187,
+      "learning_rate": 1.6043055555555557e-05,
+      "loss": 0.0525,
+      "step": 2850
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.07712521404027939,
+      "eval_runtime": 2.1488,
+      "eval_samples_per_second": 476.534,
+      "eval_steps_per_second": 29.783,
+      "step": 2880
+    },
+    {
+      "epoch": 5.034722222222222,
+      "grad_norm": 0.011155390180647373,
+      "learning_rate": 1.597361111111111e-05,
+      "loss": 0.0136,
+      "step": 2900
+    },
+    {
+      "epoch": 5.121527777777778,
+      "grad_norm": 0.013077272102236748,
+      "learning_rate": 1.5904166666666666e-05,
+      "loss": 0.038,
+      "step": 2950
+    },
+    {
+      "epoch": 5.208333333333333,
+      "grad_norm": 0.009396527893841267,
+      "learning_rate": 1.5834722222222224e-05,
+      "loss": 0.0007,
+      "step": 3000
+    },
+    {
+      "epoch": 5.295138888888889,
+      "grad_norm": 0.03254946321249008,
+      "learning_rate": 1.576527777777778e-05,
+      "loss": 0.0218,
+      "step": 3050
+    },
+    {
+      "epoch": 5.381944444444445,
+      "grad_norm": 0.005586525425314903,
+      "learning_rate": 1.5695833333333333e-05,
+      "loss": 0.0203,
+      "step": 3100
+    },
+    {
+      "epoch": 5.46875,
+      "grad_norm": 0.009627276100218296,
+      "learning_rate": 1.562638888888889e-05,
+      "loss": 0.0668,
+      "step": 3150
+    },
+    {
+      "epoch": 5.555555555555555,
+      "grad_norm": 0.0053014811128377914,
+      "learning_rate": 1.5556944444444446e-05,
+      "loss": 0.0119,
+      "step": 3200
+    },
+    {
+      "epoch": 5.642361111111111,
+      "grad_norm": 0.038525983691215515,
+      "learning_rate": 1.54875e-05,
+      "loss": 0.0255,
+      "step": 3250
+    },
+    {
+      "epoch": 5.729166666666667,
+      "grad_norm": 0.04675915837287903,
+      "learning_rate": 1.541805555555556e-05,
+      "loss": 0.0644,
+      "step": 3300
+    },
+    {
+      "epoch": 5.815972222222222,
+      "grad_norm": 0.04972713068127632,
+      "learning_rate": 1.5348611111111113e-05,
+      "loss": 0.0225,
+      "step": 3350
+    },
+    {
+      "epoch": 5.902777777777778,
+      "grad_norm": 0.004159330390393734,
+      "learning_rate": 1.5279166666666668e-05,
+      "loss": 0.0151,
+      "step": 3400
+    },
+    {
+      "epoch": 5.989583333333333,
+      "grad_norm": 0.01926554925739765,
+      "learning_rate": 1.5209722222222222e-05,
+      "loss": 0.0205,
+      "step": 3450
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.07027073949575424,
+      "eval_runtime": 2.1711,
+      "eval_samples_per_second": 471.645,
+      "eval_steps_per_second": 29.478,
+      "step": 3456
+    },
+    {
+      "epoch": 6.076388888888889,
+      "grad_norm": 0.09293384104967117,
+      "learning_rate": 1.5140277777777779e-05,
+      "loss": 0.0077,
+      "step": 3500
+    },
+    {
+      "epoch": 6.163194444444445,
+      "grad_norm": 0.010024973191320896,
+      "learning_rate": 1.5070833333333335e-05,
+      "loss": 0.0423,
+      "step": 3550
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.029724491760134697,
+      "learning_rate": 1.500138888888889e-05,
+      "loss": 0.0199,
+      "step": 3600
+    },
+    {
+      "epoch": 6.336805555555555,
+      "grad_norm": 0.015937596559524536,
+      "learning_rate": 1.4931944444444446e-05,
+      "loss": 0.017,
+      "step": 3650
+    },
+    {
+      "epoch": 6.423611111111111,
+      "grad_norm": 0.009556828066706657,
+      "learning_rate": 1.4862500000000002e-05,
+      "loss": 0.0004,
+      "step": 3700
+    },
+    {
+      "epoch": 6.510416666666667,
+      "grad_norm": 0.032106589525938034,
+      "learning_rate": 1.4793055555555558e-05,
+      "loss": 0.0147,
+      "step": 3750
+    },
+    {
+      "epoch": 6.597222222222222,
+      "grad_norm": 0.005125823896378279,
+      "learning_rate": 1.4723611111111111e-05,
+      "loss": 0.0056,
+      "step": 3800
+    },
+    {
+      "epoch": 6.684027777777778,
+      "grad_norm": 0.018202291801571846,
+      "learning_rate": 1.4654166666666668e-05,
+      "loss": 0.0264,
+      "step": 3850
+    },
+    {
+      "epoch": 6.770833333333333,
+      "grad_norm": 31.37969970703125,
+      "learning_rate": 1.4584722222222222e-05,
+      "loss": 0.0119,
+      "step": 3900
+    },
+    {
+      "epoch": 6.857638888888889,
+      "grad_norm": 0.037925682961940765,
+      "learning_rate": 1.4515277777777779e-05,
+      "loss": 0.0203,
+      "step": 3950
+    },
+    {
+      "epoch": 6.944444444444445,
+      "grad_norm": 0.1194610670208931,
+      "learning_rate": 1.4445833333333335e-05,
+      "loss": 0.0158,
+      "step": 4000
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.0702182799577713,
+      "eval_runtime": 2.1701,
+      "eval_samples_per_second": 471.867,
+      "eval_steps_per_second": 29.492,
+      "step": 4032
+    },
+    {
+      "epoch": 7.03125,
+      "grad_norm": 0.0035032748710364103,
+      "learning_rate": 1.437638888888889e-05,
+      "loss": 0.0065,
+      "step": 4050
+    },
+    {
+      "epoch": 7.118055555555555,
+      "grad_norm": 0.0025438766460865736,
+      "learning_rate": 1.4306944444444446e-05,
+      "loss": 0.0003,
+      "step": 4100
+    },
+    {
+      "epoch": 7.204861111111111,
+      "grad_norm": 4.609809875488281,
+      "learning_rate": 1.4237500000000002e-05,
+      "loss": 0.02,
+      "step": 4150
+    },
+    {
+      "epoch": 7.291666666666667,
+      "grad_norm": 0.0041930885054171085,
+      "learning_rate": 1.4168055555555558e-05,
+      "loss": 0.0496,
+      "step": 4200
+    },
+    {
+      "epoch": 7.378472222222222,
+      "grad_norm": 0.3876318037509918,
+      "learning_rate": 1.4098611111111111e-05,
+      "loss": 0.0078,
+      "step": 4250
+    },
+    {
+      "epoch": 7.465277777777778,
+      "grad_norm": 0.003726869821548462,
+      "learning_rate": 1.4029166666666668e-05,
+      "loss": 0.0122,
+      "step": 4300
+    },
+    {
+      "epoch": 7.552083333333333,
+      "grad_norm": 0.00823493953794241,
+      "learning_rate": 1.3959722222222222e-05,
+      "loss": 0.0136,
+      "step": 4350
+    },
+    {
+      "epoch": 7.638888888888889,
+      "grad_norm": 0.004769823048263788,
+      "learning_rate": 1.3890277777777779e-05,
+      "loss": 0.0132,
+      "step": 4400
+    },
+    {
+      "epoch": 7.725694444444445,
+      "grad_norm": 45.848167419433594,
+      "learning_rate": 1.3820833333333335e-05,
+      "loss": 0.0116,
+      "step": 4450
+    },
+    {
+      "epoch": 7.8125,
+      "grad_norm": 0.0101216621696949,
+      "learning_rate": 1.375138888888889e-05,
+      "loss": 0.0275,
+      "step": 4500
+    },
+    {
+      "epoch": 7.899305555555555,
+      "grad_norm": 0.009534699842333794,
+      "learning_rate": 1.3681944444444446e-05,
+      "loss": 0.0073,
+      "step": 4550
+    },
+    {
+      "epoch": 7.986111111111111,
+      "grad_norm": 0.003179518273100257,
+      "learning_rate": 1.3612500000000002e-05,
+      "loss": 0.0128,
+      "step": 4600
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.08423544466495514,
+      "eval_runtime": 2.1727,
+      "eval_samples_per_second": 471.311,
+      "eval_steps_per_second": 29.457,
+      "step": 4608
+    },
+    {
+      "epoch": 8.072916666666666,
+      "grad_norm": 0.0026477461215108633,
+      "learning_rate": 1.3543055555555557e-05,
+      "loss": 0.0261,
+      "step": 4650
+    },
+    {
+      "epoch": 8.159722222222221,
+      "grad_norm": 0.019464371725916862,
+      "learning_rate": 1.3473611111111111e-05,
+      "loss": 0.0015,
+      "step": 4700
+    },
+    {
+      "epoch": 8.246527777777779,
+      "grad_norm": 0.0027786209248006344,
+      "learning_rate": 1.3404166666666668e-05,
+      "loss": 0.0125,
+      "step": 4750
+    },
+    {
+      "epoch": 8.333333333333334,
+      "grad_norm": 0.0063923560082912445,
+      "learning_rate": 1.3334722222222222e-05,
+      "loss": 0.0107,
+      "step": 4800
+    },
+    {
+      "epoch": 8.42013888888889,
+      "grad_norm": 0.0071762921288609505,
+      "learning_rate": 1.3265277777777779e-05,
+      "loss": 0.01,
+      "step": 4850
+    },
+    {
+      "epoch": 8.506944444444445,
+      "grad_norm": 0.003973743878304958,
+      "learning_rate": 1.3195833333333335e-05,
+      "loss": 0.0262,
+      "step": 4900
+    },
+    {
+      "epoch": 8.59375,
+      "grad_norm": 0.015603139996528625,
+      "learning_rate": 1.312638888888889e-05,
+      "loss": 0.0213,
+      "step": 4950
+    },
+    {
+      "epoch": 8.680555555555555,
+      "grad_norm": 0.002647754270583391,
+      "learning_rate": 1.3056944444444446e-05,
+      "loss": 0.01,
+      "step": 5000
+    },
+    {
+      "epoch": 8.76736111111111,
+      "grad_norm": 14.067428588867188,
+      "learning_rate": 1.2987500000000002e-05,
+      "loss": 0.0324,
+      "step": 5050
+    },
+    {
+      "epoch": 8.854166666666666,
+      "grad_norm": 0.004289311822503805,
+      "learning_rate": 1.2918055555555557e-05,
+      "loss": 0.0099,
+      "step": 5100
+    },
+    {
+      "epoch": 8.940972222222221,
+      "grad_norm": 0.01604336127638817,
+      "learning_rate": 1.2848611111111112e-05,
+      "loss": 0.0226,
+      "step": 5150
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.08734692633152008,
+      "eval_runtime": 2.1655,
+      "eval_samples_per_second": 472.877,
+      "eval_steps_per_second": 29.555,
+      "step": 5184
+    },
+    {
+      "epoch": 9.027777777777779,
+      "grad_norm": 21.617219924926758,
+      "learning_rate": 1.2779166666666666e-05,
+      "loss": 0.0284,
+      "step": 5200
+    },
+    {
+      "epoch": 9.114583333333334,
+      "grad_norm": 0.0036318660713732243,
+      "learning_rate": 1.2709722222222222e-05,
+      "loss": 0.0242,
+      "step": 5250
+    },
+    {
+      "epoch": 9.20138888888889,
+      "grad_norm": 1.550016164779663,
+      "learning_rate": 1.2640277777777779e-05,
+      "loss": 0.0195,
+      "step": 5300
+    },
+    {
+      "epoch": 9.288194444444445,
+      "grad_norm": 0.0063721900805830956,
+      "learning_rate": 1.2570833333333335e-05,
+      "loss": 0.0326,
+      "step": 5350
+    },
+    {
+      "epoch": 9.375,
+      "grad_norm": 6.12042236328125,
+      "learning_rate": 1.250138888888889e-05,
+      "loss": 0.0153,
+      "step": 5400
+    },
+    {
+      "epoch": 9.461805555555555,
+      "grad_norm": 0.0024620750918984413,
+      "learning_rate": 1.2431944444444446e-05,
+      "loss": 0.0126,
+      "step": 5450
+    },
+    {
+      "epoch": 9.54861111111111,
+      "grad_norm": 0.008434666320681572,
+      "learning_rate": 1.2362500000000002e-05,
+      "loss": 0.0168,
+      "step": 5500
+    },
+    {
+      "epoch": 9.635416666666666,
+      "grad_norm": 24.517446517944336,
+      "learning_rate": 1.2293055555555557e-05,
+      "loss": 0.0236,
+      "step": 5550
+    },
+    {
+      "epoch": 9.722222222222221,
+      "grad_norm": 0.0033647818490862846,
+      "learning_rate": 1.2223611111111112e-05,
+      "loss": 0.0128,
+      "step": 5600
+    },
+    {
+      "epoch": 9.809027777777779,
+      "grad_norm": 0.008964108303189278,
+      "learning_rate": 1.2154166666666666e-05,
+      "loss": 0.0064,
+      "step": 5650
+    },
+    {
+      "epoch": 9.895833333333334,
+      "grad_norm": 0.00735318660736084,
+      "learning_rate": 1.2084722222222223e-05,
+      "loss": 0.0069,
+      "step": 5700
+    },
+    {
+      "epoch": 9.98263888888889,
+      "grad_norm": 68.18215942382812,
+      "learning_rate": 1.2015277777777779e-05,
+      "loss": 0.0278,
+      "step": 5750
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.05086889490485191,
+      "eval_runtime": 2.1757,
+      "eval_samples_per_second": 470.664,
+      "eval_steps_per_second": 29.416,
+      "step": 5760
+    },
+    {
+      "epoch": 10.069444444444445,
+      "grad_norm": 0.008082223124802113,
+      "learning_rate": 1.1945833333333333e-05,
+      "loss": 0.0001,
+      "step": 5800
+    },
+    {
+      "epoch": 10.15625,
+      "grad_norm": 0.0015207990072667599,
+      "learning_rate": 1.187638888888889e-05,
+      "loss": 0.0023,
+      "step": 5850
+    },
+    {
+      "epoch": 10.243055555555555,
+      "grad_norm": 0.003384481882676482,
+      "learning_rate": 1.1806944444444446e-05,
+      "loss": 0.0039,
+      "step": 5900
+    },
+    {
+      "epoch": 10.32986111111111,
+      "grad_norm": 0.0017064920393750072,
+      "learning_rate": 1.1737500000000002e-05,
+      "loss": 0.0092,
+      "step": 5950
+    },
+    {
+      "epoch": 10.416666666666666,
+      "grad_norm": 0.0034557634498924017,
+      "learning_rate": 1.1668055555555557e-05,
+      "loss": 0.0291,
+      "step": 6000
+    },
+    {
+      "epoch": 10.503472222222221,
+      "grad_norm": 0.005785680841654539,
+      "learning_rate": 1.1598611111111112e-05,
+      "loss": 0.0002,
+      "step": 6050
+    },
+    {
+      "epoch": 10.590277777777779,
+      "grad_norm": 0.005150977522134781,
+      "learning_rate": 1.1529166666666666e-05,
+      "loss": 0.0209,
+      "step": 6100
+    },
+    {
+      "epoch": 10.677083333333334,
+      "grad_norm": 0.0023805610835552216,
+      "learning_rate": 1.1459722222222223e-05,
+      "loss": 0.0133,
+      "step": 6150
+    },
+    {
+      "epoch": 10.76388888888889,
+      "grad_norm": 0.0017482911935076118,
+      "learning_rate": 1.1390277777777779e-05,
+      "loss": 0.0034,
+      "step": 6200
+    },
+    {
+      "epoch": 10.850694444444445,
+      "grad_norm": 0.16960591077804565,
+      "learning_rate": 1.1320833333333334e-05,
+      "loss": 0.0023,
+      "step": 6250
+    },
+    {
+      "epoch": 10.9375,
+      "grad_norm": 0.005295192822813988,
+      "learning_rate": 1.125138888888889e-05,
+      "loss": 0.0018,
+      "step": 6300
+    },
+    {
+      "epoch": 11.0,
+      "eval_loss": 0.09595341980457306,
+      "eval_runtime": 2.1636,
+      "eval_samples_per_second": 473.292,
+      "eval_steps_per_second": 29.581,
+      "step": 6336
+    },
+    {
+      "epoch": 11.024305555555555,
+      "grad_norm": 0.0025636740028858185,
+      "learning_rate": 1.1181944444444446e-05,
+      "loss": 0.0003,
+      "step": 6350
+    },
+    {
+      "epoch": 11.11111111111111,
+      "grad_norm": 0.001272220746614039,
+      "learning_rate": 1.1112500000000002e-05,
+      "loss": 0.0071,
+      "step": 6400
+    },
+    {
+      "epoch": 11.197916666666666,
+      "grad_norm": 0.00381342857144773,
+      "learning_rate": 1.1043055555555557e-05,
+      "loss": 0.0146,
+      "step": 6450
+    },
+    {
+      "epoch": 11.284722222222221,
+      "grad_norm": 0.001548461732454598,
+      "learning_rate": 1.0973611111111112e-05,
+      "loss": 0.012,
+      "step": 6500
+    },
+    {
+      "epoch": 11.371527777777779,
+      "grad_norm": 0.0014353245496749878,
+      "learning_rate": 1.0904166666666666e-05,
+      "loss": 0.0001,
+      "step": 6550
+    },
+    {
+      "epoch": 11.458333333333334,
+      "grad_norm": 0.0014444834087044,
+      "learning_rate": 1.0834722222222223e-05,
+      "loss": 0.02,
+      "step": 6600
+    },
+    {
+      "epoch": 11.54513888888889,
+      "grad_norm": 0.002948822919279337,
+      "learning_rate": 1.0765277777777779e-05,
+      "loss": 0.0005,
+      "step": 6650
+    },
+    {
+      "epoch": 11.631944444444445,
+      "grad_norm": 0.001776829012669623,
+      "learning_rate": 1.0695833333333334e-05,
+      "loss": 0.0138,
+      "step": 6700
+    },
+    {
+      "epoch": 11.71875,
+      "grad_norm": 0.0016048089601099491,
+      "learning_rate": 1.062638888888889e-05,
+      "loss": 0.0229,
+      "step": 6750
+    },
+    {
+      "epoch": 11.805555555555555,
+      "grad_norm": 0.0019577471539378166,
+      "learning_rate": 1.0556944444444446e-05,
+      "loss": 0.0318,
+      "step": 6800
+    },
+    {
+      "epoch": 11.89236111111111,
+      "grad_norm": 0.02152959071099758,
+      "learning_rate": 1.04875e-05,
+      "loss": 0.0147,
+      "step": 6850
+    },
+    {
+      "epoch": 11.979166666666666,
+      "grad_norm": 0.003166797338053584,
+      "learning_rate": 1.0418055555555557e-05,
+      "loss": 0.0002,
+      "step": 6900
+    },
+    {
+      "epoch": 12.0,
+      "eval_loss": 0.0941697284579277,
+      "eval_runtime": 2.1655,
+      "eval_samples_per_second": 472.877,
+      "eval_steps_per_second": 29.555,
+      "step": 6912
+    },
+    {
+      "epoch": 12.065972222222221,
+      "grad_norm": 0.0014085551956668496,
+      "learning_rate": 1.0348611111111112e-05,
+      "loss": 0.0127,
+      "step": 6950
+    },
+    {
+      "epoch": 12.152777777777779,
+      "grad_norm": 0.0020881230011582375,
+      "learning_rate": 1.0279166666666666e-05,
+      "loss": 0.0056,
+      "step": 7000
+    },
+    {
+      "epoch": 12.239583333333334,
+      "grad_norm": 0.0022252153139561415,
+      "learning_rate": 1.0209722222222223e-05,
+      "loss": 0.0001,
+      "step": 7050
+    },
+    {
+      "epoch": 12.32638888888889,
+      "grad_norm": 0.014756396412849426,
+      "learning_rate": 1.0140277777777779e-05,
+      "loss": 0.0021,
+      "step": 7100
+    },
+    {
+      "epoch": 12.413194444444445,
+      "grad_norm": 0.0016215373761951923,
+      "learning_rate": 1.0070833333333334e-05,
+      "loss": 0.0017,
+      "step": 7150
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 8.211623191833496,
+      "learning_rate": 1.000138888888889e-05,
+      "loss": 0.0081,
+      "step": 7200
+    },
+    {
+      "epoch": 12.586805555555555,
+      "grad_norm": 0.001198956393636763,
+      "learning_rate": 9.931944444444446e-06,
+      "loss": 0.0066,
+      "step": 7250
+    },
+    {
+      "epoch": 12.67361111111111,
+      "grad_norm": 0.09066519886255264,
+      "learning_rate": 9.862500000000001e-06,
+      "loss": 0.0109,
+      "step": 7300
+    },
+    {
+      "epoch": 12.760416666666666,
+      "grad_norm": 0.006189523730427027,
+      "learning_rate": 9.793055555555555e-06,
+      "loss": 0.0133,
+      "step": 7350
+    },
+    {
+      "epoch": 12.847222222222221,
+      "grad_norm": 0.0009115805733017623,
+      "learning_rate": 9.723611111111112e-06,
+      "loss": 0.0002,
+      "step": 7400
+    },
+    {
+      "epoch": 12.934027777777779,
+      "grad_norm": 0.001321441144682467,
+      "learning_rate": 9.654166666666668e-06,
+      "loss": 0.002,
+      "step": 7450
+    },
+    {
+      "epoch": 13.0,
+      "eval_loss": 0.07037966698408127,
+      "eval_runtime": 2.158,
+      "eval_samples_per_second": 474.51,
+      "eval_steps_per_second": 29.657,
+      "step": 7488
+    },
+    {
+      "epoch": 13.020833333333334,
+      "grad_norm": 0.001222062623128295,
+      "learning_rate": 9.584722222222223e-06,
+      "loss": 0.0059,
+      "step": 7500
+    },
+    {
+      "epoch": 13.10763888888889,
+      "grad_norm": 0.0009936870774254203,
+      "learning_rate": 9.515277777777779e-06,
+      "loss": 0.0147,
+      "step": 7550
+    },
+    {
+      "epoch": 13.194444444444445,
+      "grad_norm": 0.0007664511213079095,
+      "learning_rate": 9.445833333333334e-06,
+      "loss": 0.0009,
+      "step": 7600
+    },
+    {
+      "epoch": 13.28125,
+      "grad_norm": 0.0019496489549055696,
+      "learning_rate": 9.37638888888889e-06,
+      "loss": 0.0034,
+      "step": 7650
+    },
+    {
+      "epoch": 13.368055555555555,
+      "grad_norm": 0.0010263716103509068,
+      "learning_rate": 9.306944444444446e-06,
+      "loss": 0.0157,
+      "step": 7700
+    },
+    {
+      "epoch": 13.45486111111111,
+      "grad_norm": 0.0018417923711240292,
+      "learning_rate": 9.237500000000001e-06,
+      "loss": 0.0191,
+      "step": 7750
+    },
+    {
+      "epoch": 13.541666666666666,
+      "grad_norm": 0.009017466567456722,
+      "learning_rate": 9.168055555555556e-06,
+      "loss": 0.0142,
+      "step": 7800
+    },
+    {
+      "epoch": 13.628472222222221,
+      "grad_norm": 0.00774754025042057,
+      "learning_rate": 9.098611111111112e-06,
+      "loss": 0.0213,
+      "step": 7850
+    },
+    {
+      "epoch": 13.715277777777779,
+      "grad_norm": 0.0038997160736471415,
+      "learning_rate": 9.029166666666668e-06,
+      "loss": 0.0006,
+      "step": 7900
+    },
+    {
+      "epoch": 13.802083333333334,
+      "grad_norm": 0.001874367124401033,
+      "learning_rate": 8.959722222222223e-06,
+      "loss": 0.0135,
+      "step": 7950
+    },
+    {
+      "epoch": 13.88888888888889,
+      "grad_norm": 0.0013082403456792235,
+      "learning_rate": 8.890277777777777e-06,
+      "loss": 0.005,
+      "step": 8000
+    },
+    {
+      "epoch": 13.975694444444445,
+      "grad_norm": 0.005922501441091299,
+      "learning_rate": 8.820833333333334e-06,
+      "loss": 0.008,
+      "step": 8050
+    },
+    {
+      "epoch": 14.0,
+      "eval_loss": 0.11005319654941559,
+      "eval_runtime": 2.1582,
+      "eval_samples_per_second": 474.459,
+      "eval_steps_per_second": 29.654,
+      "step": 8064
+    },
+    {
+      "epoch": 14.0625,
+      "grad_norm": 0.000921198632568121,
+      "learning_rate": 8.75138888888889e-06,
+      "loss": 0.0178,
+      "step": 8100
+    },
+    {
+      "epoch": 14.149305555555555,
+      "grad_norm": 0.0010431046830490232,
+      "learning_rate": 8.681944444444446e-06,
+      "loss": 0.0002,
+      "step": 8150
+    },
+    {
+      "epoch": 14.23611111111111,
+      "grad_norm": 0.0012669609859585762,
+      "learning_rate": 8.612500000000001e-06,
+      "loss": 0.0031,
+      "step": 8200
+    },
+    {
+      "epoch": 14.322916666666666,
+      "grad_norm": 0.0029816783498972654,
+      "learning_rate": 8.543055555555556e-06,
+      "loss": 0.025,
+      "step": 8250
+    },
+    {
+      "epoch": 14.409722222222221,
+      "grad_norm": 0.005197410471737385,
+      "learning_rate": 8.473611111111112e-06,
+      "loss": 0.0016,
+      "step": 8300
+    },
+    {
+      "epoch": 14.496527777777779,
+      "grad_norm": 0.0018497951095923781,
+      "learning_rate": 8.404166666666668e-06,
+      "loss": 0.0002,
+      "step": 8350
+    },
+    {
+      "epoch": 14.583333333333334,
+      "grad_norm": 0.0028671796899288893,
+      "learning_rate": 8.334722222222223e-06,
+      "loss": 0.0012,
+      "step": 8400
+    },
+    {
+      "epoch": 14.67013888888889,
+      "grad_norm": 0.010854833759367466,
+      "learning_rate": 8.265277777777777e-06,
+      "loss": 0.0027,
+      "step": 8450
+    },
+    {
+      "epoch": 14.756944444444445,
+      "grad_norm": 0.011619815602898598,
+      "learning_rate": 8.195833333333334e-06,
+      "loss": 0.0051,
+      "step": 8500
+    },
+    {
+      "epoch": 14.84375,
+      "grad_norm": 0.004773242399096489,
+      "learning_rate": 8.12638888888889e-06,
+      "loss": 0.0203,
+      "step": 8550
+    },
+    {
+      "epoch": 14.930555555555555,
+      "grad_norm": 0.0017284239875152707,
+      "learning_rate": 8.056944444444446e-06,
+      "loss": 0.0084,
+      "step": 8600
+    },
+    {
+      "epoch": 15.0,
+      "eval_loss": 0.09546509385108948,
+      "eval_runtime": 2.159,
+      "eval_samples_per_second": 474.301,
+      "eval_steps_per_second": 29.644,
+      "step": 8640
+    },
+    {
+      "epoch": 15.01736111111111,
+      "grad_norm": 0.003186532063409686,
+      "learning_rate": 7.987500000000001e-06,
+      "loss": 0.0044,
+      "step": 8650
+    },
+    {
+      "epoch": 15.104166666666666,
+      "grad_norm": 0.0046905651688575745,
+      "learning_rate": 7.918055555555556e-06,
+      "loss": 0.0003,
+      "step": 8700
+    },
+    {
+      "epoch": 15.190972222222221,
+      "grad_norm": 0.001585672376677394,
+      "learning_rate": 7.848611111111112e-06,
+      "loss": 0.0014,
+      "step": 8750
+    },
+    {
+      "epoch": 15.277777777777779,
+      "grad_norm": 0.001381274894811213,
+      "learning_rate": 7.779166666666668e-06,
+      "loss": 0.0002,
+      "step": 8800
+    },
+    {
+      "epoch": 15.364583333333334,
+      "grad_norm": 0.0018836313392966986,
+      "learning_rate": 7.709722222222223e-06,
+      "loss": 0.0002,
+      "step": 8850
+    },
+    {
+      "epoch": 15.45138888888889,
+      "grad_norm": 0.0009360660915262997,
+      "learning_rate": 7.640277777777778e-06,
+      "loss": 0.0009,
+      "step": 8900
+    },
+    {
+      "epoch": 15.538194444444445,
+      "grad_norm": 0.0008543449803255498,
+      "learning_rate": 7.570833333333334e-06,
+      "loss": 0.0001,
+      "step": 8950
+    },
+    {
+      "epoch": 15.625,
+      "grad_norm": 0.38564541935920715,
+      "learning_rate": 7.501388888888889e-06,
+      "loss": 0.0148,
+      "step": 9000
+    },
+    {
+      "epoch": 15.711805555555555,
+      "grad_norm": 0.0016453195130452514,
+      "learning_rate": 7.431944444444446e-06,
+      "loss": 0.0103,
+      "step": 9050
+    },
+    {
+      "epoch": 15.79861111111111,
+      "grad_norm": 0.001604390563443303,
+      "learning_rate": 7.3625e-06,
+      "loss": 0.0039,
+      "step": 9100
+    },
+    {
+      "epoch": 15.885416666666666,
+      "grad_norm": 0.0020437692292034626,
+      "learning_rate": 7.293055555555556e-06,
+      "loss": 0.0001,
+      "step": 9150
+    },
+    {
+      "epoch": 15.972222222222221,
+      "grad_norm": 0.004224094562232494,
+      "learning_rate": 7.223611111111112e-06,
+      "loss": 0.0024,
+      "step": 9200
+    },
+    {
+      "epoch": 16.0,
+      "eval_loss": 0.057210568338632584,
+      "eval_runtime": 2.1916,
+      "eval_samples_per_second": 467.23,
+      "eval_steps_per_second": 29.202,
+      "step": 9216
+    },
+    {
+      "epoch": 16.05902777777778,
+      "grad_norm": 0.008274748921394348,
+      "learning_rate": 7.1541666666666675e-06,
+      "loss": 0.0002,
+      "step": 9250
+    },
+    {
+      "epoch": 16.145833333333332,
+      "grad_norm": 0.0013211554614827037,
+      "learning_rate": 7.084722222222222e-06,
+      "loss": 0.0051,
+      "step": 9300
+    },
+    {
+      "epoch": 16.23263888888889,
+      "grad_norm": 0.0025199875235557556,
+      "learning_rate": 7.015277777777778e-06,
+      "loss": 0.0001,
+      "step": 9350
+    },
+    {
+      "epoch": 16.319444444444443,
+      "grad_norm": 0.00225885515101254,
+      "learning_rate": 6.945833333333334e-06,
+      "loss": 0.0144,
+      "step": 9400
+    },
+    {
+      "epoch": 16.40625,
+      "grad_norm": 8.875648498535156,
+      "learning_rate": 6.876388888888889e-06,
+      "loss": 0.0097,
+      "step": 9450
+    },
+    {
+      "epoch": 16.493055555555557,
+      "grad_norm": 0.0012961571337655187,
+      "learning_rate": 6.806944444444446e-06,
+      "loss": 0.0008,
+      "step": 9500
+    },
+    {
+      "epoch": 16.57986111111111,
+      "grad_norm": 0.0071833692491054535,
+      "learning_rate": 6.7375e-06,
+      "loss": 0.0107,
+      "step": 9550
+    },
+    {
+      "epoch": 16.666666666666668,
+      "grad_norm": 0.0016137856291607022,
+      "learning_rate": 6.668055555555556e-06,
+      "loss": 0.011,
+      "step": 9600
+    },
+    {
+      "epoch": 16.75347222222222,
+      "grad_norm": 0.002170548541471362,
+      "learning_rate": 6.598611111111112e-06,
+      "loss": 0.0002,
+      "step": 9650
+    },
+    {
+      "epoch": 16.84027777777778,
+      "grad_norm": 0.15523186326026917,
+      "learning_rate": 6.5291666666666675e-06,
+      "loss": 0.0001,
+      "step": 9700
+    },
+    {
+      "epoch": 16.927083333333332,
+      "grad_norm": 0.0006897877901792526,
+      "learning_rate": 6.459722222222222e-06,
+      "loss": 0.0,
+      "step": 9750
+    },
+    {
+      "epoch": 17.0,
+      "eval_loss": 0.07502129673957825,
+      "eval_runtime": 2.1485,
+      "eval_samples_per_second": 476.611,
+      "eval_steps_per_second": 29.788,
+      "step": 9792
+    },
+    {
+      "epoch": 17.01388888888889,
+      "grad_norm": 0.007862378843128681,
+      "learning_rate": 6.390277777777778e-06,
+      "loss": 0.0072,
+      "step": 9800
+    },
+    {
+      "epoch": 17.100694444444443,
+
"grad_norm": 0.0048260875046253204,
|
1523 |
+
"learning_rate": 6.320833333333334e-06,
|
1524 |
+
"loss": 0.0001,
|
1525 |
+
"step": 9850
|
1526 |
+
},
|
1527 |
+
{
|
1528 |
+
"epoch": 17.1875,
|
1529 |
+
"grad_norm": 0.001168318442068994,
|
1530 |
+
"learning_rate": 6.251388888888889e-06,
|
1531 |
+
"loss": 0.0001,
|
1532 |
+
"step": 9900
|
1533 |
+
},
|
1534 |
+
{
|
1535 |
+
"epoch": 17.274305555555557,
|
1536 |
+
"grad_norm": 0.0010404903441667557,
|
1537 |
+
"learning_rate": 6.181944444444446e-06,
|
1538 |
+
"loss": 0.0,
|
1539 |
+
"step": 9950
|
1540 |
+
},
|
1541 |
+
{
|
1542 |
+
"epoch": 17.36111111111111,
|
1543 |
+
"grad_norm": 0.0008959461702033877,
|
1544 |
+
"learning_rate": 6.1125e-06,
|
1545 |
+
"loss": 0.0,
|
1546 |
+
"step": 10000
|
1547 |
+
},
|
1548 |
+
{
|
1549 |
+
"epoch": 17.447916666666668,
|
1550 |
+
"grad_norm": 0.0006551714614033699,
|
1551 |
+
"learning_rate": 6.043055555555556e-06,
|
1552 |
+
"loss": 0.0083,
|
1553 |
+
"step": 10050
|
1554 |
+
},
|
1555 |
+
{
|
1556 |
+
"epoch": 17.53472222222222,
|
1557 |
+
"grad_norm": 0.004672654904425144,
|
1558 |
+
"learning_rate": 5.973611111111111e-06,
|
1559 |
+
"loss": 0.0053,
|
1560 |
+
"step": 10100
|
1561 |
+
},
|
1562 |
+
{
|
1563 |
+
"epoch": 17.62152777777778,
|
1564 |
+
"grad_norm": 0.012874463573098183,
|
1565 |
+
"learning_rate": 5.904166666666668e-06,
|
1566 |
+
"loss": 0.0024,
|
1567 |
+
"step": 10150
|
1568 |
+
},
|
1569 |
+
{
|
1570 |
+
"epoch": 17.708333333333332,
|
1571 |
+
"grad_norm": 0.0007252011564560235,
|
1572 |
+
"learning_rate": 5.834722222222222e-06,
|
1573 |
+
"loss": 0.0081,
|
1574 |
+
"step": 10200
|
1575 |
+
},
|
1576 |
+
{
|
1577 |
+
"epoch": 17.79513888888889,
|
1578 |
+
"grad_norm": 0.003966485150158405,
|
1579 |
+
"learning_rate": 5.765277777777778e-06,
|
1580 |
+
"loss": 0.0001,
|
1581 |
+
"step": 10250
|
1582 |
+
},
|
1583 |
+
{
|
1584 |
+
"epoch": 17.881944444444443,
|
1585 |
+
"grad_norm": 0.004643677733838558,
|
1586 |
+
"learning_rate": 5.695833333333334e-06,
|
1587 |
+
"loss": 0.0016,
|
1588 |
+
"step": 10300
|
1589 |
+
},
|
1590 |
+
{
|
1591 |
+
"epoch": 17.96875,
|
1592 |
+
"grad_norm": 0.0022119064815342426,
|
1593 |
+
"learning_rate": 5.6263888888888895e-06,
|
1594 |
+
"loss": 0.0,
|
1595 |
+
"step": 10350
|
1596 |
+
},
|
1597 |
+
{
|
1598 |
+
"epoch": 18.0,
|
1599 |
+
"eval_loss": 0.081531822681427,
|
1600 |
+
"eval_runtime": 2.1481,
|
1601 |
+
"eval_samples_per_second": 476.707,
|
1602 |
+
"eval_steps_per_second": 29.794,
|
1603 |
+
"step": 10368
|
1604 |
+
},
|
1605 |
+
{
|
1606 |
+
"epoch": 18.055555555555557,
|
1607 |
+
"grad_norm": 0.0007135890191420913,
|
1608 |
+
"learning_rate": 5.556944444444446e-06,
|
1609 |
+
"loss": 0.0008,
|
1610 |
+
"step": 10400
|
1611 |
+
},
|
1612 |
+
{
|
1613 |
+
"epoch": 18.14236111111111,
|
1614 |
+
"grad_norm": 0.0011914368951693177,
|
1615 |
+
"learning_rate": 5.4875e-06,
|
1616 |
+
"loss": 0.0029,
|
1617 |
+
"step": 10450
|
1618 |
+
},
|
1619 |
+
{
|
1620 |
+
"epoch": 18.229166666666668,
|
1621 |
+
"grad_norm": 0.000494650739710778,
|
1622 |
+
"learning_rate": 5.418055555555556e-06,
|
1623 |
+
"loss": 0.0,
|
1624 |
+
"step": 10500
|
1625 |
+
},
|
1626 |
+
{
|
1627 |
+
"epoch": 18.31597222222222,
|
1628 |
+
"grad_norm": 0.0006326820584945381,
|
1629 |
+
"learning_rate": 5.348611111111111e-06,
|
1630 |
+
"loss": 0.0005,
|
1631 |
+
"step": 10550
|
1632 |
+
},
|
1633 |
+
{
|
1634 |
+
"epoch": 18.40277777777778,
|
1635 |
+
"grad_norm": 0.0005152356461621821,
|
1636 |
+
"learning_rate": 5.279166666666668e-06,
|
1637 |
+
"loss": 0.0,
|
1638 |
+
"step": 10600
|
1639 |
+
},
|
1640 |
+
{
|
1641 |
+
"epoch": 18.489583333333332,
|
1642 |
+
"grad_norm": 0.001848602551035583,
|
1643 |
+
"learning_rate": 5.209722222222222e-06,
|
1644 |
+
"loss": 0.0001,
|
1645 |
+
"step": 10650
|
1646 |
+
},
|
1647 |
+
{
|
1648 |
+
"epoch": 18.57638888888889,
|
1649 |
+
"grad_norm": 0.00047650947817601264,
|
1650 |
+
"learning_rate": 5.140277777777778e-06,
|
1651 |
+
"loss": 0.014,
|
1652 |
+
"step": 10700
|
1653 |
+
},
|
1654 |
+
{
|
1655 |
+
"epoch": 18.663194444444443,
|
1656 |
+
"grad_norm": 0.0004927313420921564,
|
1657 |
+
"learning_rate": 5.070833333333334e-06,
|
1658 |
+
"loss": 0.0101,
|
1659 |
+
"step": 10750
|
1660 |
+
},
|
1661 |
+
{
|
1662 |
+
"epoch": 18.75,
|
1663 |
+
"grad_norm": 0.0006253819447010756,
|
1664 |
+
"learning_rate": 5.0013888888888895e-06,
|
1665 |
+
"loss": 0.0,
|
1666 |
+
"step": 10800
|
1667 |
+
},
|
1668 |
+
{
|
1669 |
+
"epoch": 18.836805555555557,
|
1670 |
+
"grad_norm": 0.0005877416697330773,
|
1671 |
+
"learning_rate": 4.931944444444445e-06,
|
1672 |
+
"loss": 0.0,
|
1673 |
+
"step": 10850
|
1674 |
+
},
|
1675 |
+
{
|
1676 |
+
"epoch": 18.92361111111111,
|
1677 |
+
"grad_norm": 0.000749260769225657,
|
1678 |
+
"learning_rate": 4.8625000000000005e-06,
|
1679 |
+
"loss": 0.0034,
|
1680 |
+
"step": 10900
|
1681 |
+
},
|
1682 |
+
{
|
1683 |
+
"epoch": 19.0,
|
1684 |
+
"eval_loss": 0.08709079027175903,
|
1685 |
+
"eval_runtime": 2.1461,
|
1686 |
+
"eval_samples_per_second": 477.135,
|
1687 |
+
"eval_steps_per_second": 29.821,
|
1688 |
+
"step": 10944
|
1689 |
+
},
|
1690 |
+
{
|
1691 |
+
"epoch": 19.010416666666668,
|
1692 |
+
"grad_norm": 0.0033058973494917154,
|
1693 |
+
"learning_rate": 4.793055555555556e-06,
|
1694 |
+
"loss": 0.0028,
|
1695 |
+
"step": 10950
|
1696 |
+
},
|
1697 |
+
{
|
1698 |
+
"epoch": 19.09722222222222,
|
1699 |
+
"grad_norm": 0.0005344200180843472,
|
1700 |
+
"learning_rate": 4.723611111111111e-06,
|
1701 |
+
"loss": 0.0012,
|
1702 |
+
"step": 11000
|
1703 |
+
},
|
1704 |
+
{
|
1705 |
+
"epoch": 19.18402777777778,
|
1706 |
+
"grad_norm": 0.0011495526414364576,
|
1707 |
+
"learning_rate": 4.654166666666667e-06,
|
1708 |
+
"loss": 0.0,
|
1709 |
+
"step": 11050
|
1710 |
+
},
|
1711 |
+
{
|
1712 |
+
"epoch": 19.270833333333332,
|
1713 |
+
"grad_norm": 0.0007638471433892846,
|
1714 |
+
"learning_rate": 4.584722222222222e-06,
|
1715 |
+
"loss": 0.0008,
|
1716 |
+
"step": 11100
|
1717 |
+
},
|
1718 |
+
{
|
1719 |
+
"epoch": 19.35763888888889,
|
1720 |
+
"grad_norm": 0.0016245943261310458,
|
1721 |
+
"learning_rate": 4.515277777777778e-06,
|
1722 |
+
"loss": 0.0,
|
1723 |
+
"step": 11150
|
1724 |
+
},
|
1725 |
+
{
|
1726 |
+
"epoch": 19.444444444444443,
|
1727 |
+
"grad_norm": 0.001350992708466947,
|
1728 |
+
"learning_rate": 4.445833333333333e-06,
|
1729 |
+
"loss": 0.0006,
|
1730 |
+
"step": 11200
|
1731 |
+
},
|
1732 |
+
{
|
1733 |
+
"epoch": 19.53125,
|
1734 |
+
"grad_norm": 0.0007238370017148554,
|
1735 |
+
"learning_rate": 4.3763888888888896e-06,
|
1736 |
+
"loss": 0.0,
|
1737 |
+
"step": 11250
|
1738 |
+
},
|
1739 |
+
{
|
1740 |
+
"epoch": 19.618055555555557,
|
1741 |
+
"grad_norm": 0.0009173430735245347,
|
1742 |
+
"learning_rate": 4.306944444444445e-06,
|
1743 |
+
"loss": 0.005,
|
1744 |
+
"step": 11300
|
1745 |
+
},
|
1746 |
+
{
|
1747 |
+
"epoch": 19.70486111111111,
|
1748 |
+
"grad_norm": 0.002093716524541378,
|
1749 |
+
"learning_rate": 4.2375000000000005e-06,
|
1750 |
+
"loss": 0.0,
|
1751 |
+
"step": 11350
|
1752 |
+
},
|
1753 |
+
{
|
1754 |
+
"epoch": 19.791666666666668,
|
1755 |
+
"grad_norm": 0.0008814275497570634,
|
1756 |
+
"learning_rate": 4.168055555555556e-06,
|
1757 |
+
"loss": 0.0004,
|
1758 |
+
"step": 11400
|
1759 |
+
},
|
1760 |
+
{
|
1761 |
+
"epoch": 19.87847222222222,
|
1762 |
+
"grad_norm": 0.000544120033737272,
|
1763 |
+
"learning_rate": 4.0986111111111114e-06,
|
1764 |
+
"loss": 0.0001,
|
1765 |
+
"step": 11450
|
1766 |
+
},
|
1767 |
+
{
|
1768 |
+
"epoch": 19.96527777777778,
|
1769 |
+
"grad_norm": 0.0006753376801498234,
|
1770 |
+
"learning_rate": 4.029166666666667e-06,
|
1771 |
+
"loss": 0.0078,
|
1772 |
+
"step": 11500
|
1773 |
+
},
|
1774 |
+
{
|
1775 |
+
"epoch": 20.0,
|
1776 |
+
"eval_loss": 0.13657595217227936,
|
1777 |
+
"eval_runtime": 2.1501,
|
1778 |
+
"eval_samples_per_second": 476.265,
|
1779 |
+
"eval_steps_per_second": 29.767,
|
1780 |
+
"step": 11520
|
1781 |
+
},
|
1782 |
+
{
|
1783 |
+
"epoch": 20.052083333333332,
|
1784 |
+
"grad_norm": 0.000810706231277436,
|
1785 |
+
"learning_rate": 3.959722222222222e-06,
|
1786 |
+
"loss": 0.0,
|
1787 |
+
"step": 11550
|
1788 |
+
},
|
1789 |
+
{
|
1790 |
+
"epoch": 20.13888888888889,
|
1791 |
+
"grad_norm": 0.0008722566999495029,
|
1792 |
+
"learning_rate": 3.890277777777778e-06,
|
1793 |
+
"loss": 0.002,
|
1794 |
+
"step": 11600
|
1795 |
+
},
|
1796 |
+
{
|
1797 |
+
"epoch": 20.225694444444443,
|
1798 |
+
"grad_norm": 0.0010332430247217417,
|
1799 |
+
"learning_rate": 3.820833333333333e-06,
|
1800 |
+
"loss": 0.0,
|
1801 |
+
"step": 11650
|
1802 |
+
},
|
1803 |
+
{
|
1804 |
+
"epoch": 20.3125,
|
1805 |
+
"grad_norm": 0.0003171579446643591,
|
1806 |
+
"learning_rate": 3.751388888888889e-06,
|
1807 |
+
"loss": 0.0001,
|
1808 |
+
"step": 11700
|
1809 |
+
},
|
1810 |
+
{
|
1811 |
+
"epoch": 20.399305555555557,
|
1812 |
+
"grad_norm": 0.0003055994166061282,
|
1813 |
+
"learning_rate": 3.6819444444444447e-06,
|
1814 |
+
"loss": 0.0,
|
1815 |
+
"step": 11750
|
1816 |
+
},
|
1817 |
+
{
|
1818 |
+
"epoch": 20.48611111111111,
|
1819 |
+
"grad_norm": 0.0005060540861450136,
|
1820 |
+
"learning_rate": 3.6125000000000006e-06,
|
1821 |
+
"loss": 0.0,
|
1822 |
+
"step": 11800
|
1823 |
+
},
|
1824 |
+
{
|
1825 |
+
"epoch": 20.572916666666668,
|
1826 |
+
"grad_norm": 0.0005677157896570861,
|
1827 |
+
"learning_rate": 3.5430555555555556e-06,
|
1828 |
+
"loss": 0.0,
|
1829 |
+
"step": 11850
|
1830 |
+
},
|
1831 |
+
{
|
1832 |
+
"epoch": 20.65972222222222,
|
1833 |
+
"grad_norm": 0.000569184310734272,
|
1834 |
+
"learning_rate": 3.4736111111111115e-06,
|
1835 |
+
"loss": 0.0011,
|
1836 |
+
"step": 11900
|
1837 |
+
},
|
1838 |
+
{
|
1839 |
+
"epoch": 20.74652777777778,
|
1840 |
+
"grad_norm": 0.0008372145821340382,
|
1841 |
+
"learning_rate": 3.4041666666666665e-06,
|
1842 |
+
"loss": 0.0,
|
1843 |
+
"step": 11950
|
1844 |
+
},
|
1845 |
+
{
|
1846 |
+
"epoch": 20.833333333333332,
|
1847 |
+
"grad_norm": 0.0009117226582020521,
|
1848 |
+
"learning_rate": 3.3347222222222224e-06,
|
1849 |
+
"loss": 0.0027,
|
1850 |
+
"step": 12000
|
1851 |
+
},
|
1852 |
+
{
|
1853 |
+
"epoch": 20.92013888888889,
|
1854 |
+
"grad_norm": 0.0006149787222966552,
|
1855 |
+
"learning_rate": 3.265277777777778e-06,
|
1856 |
+
"loss": 0.0009,
|
1857 |
+
"step": 12050
|
1858 |
+
},
|
1859 |
+
{
|
1860 |
+
"epoch": 21.0,
|
1861 |
+
"eval_loss": 0.07403512299060822,
|
1862 |
+
"eval_runtime": 2.1707,
|
1863 |
+
"eval_samples_per_second": 471.738,
|
1864 |
+
"eval_steps_per_second": 29.484,
|
1865 |
+
"step": 12096
|
1866 |
+
},
|
1867 |
+
{
|
1868 |
+
"epoch": 21.006944444444443,
|
1869 |
+
"grad_norm": 0.0009870927315205336,
|
1870 |
+
"learning_rate": 3.1958333333333334e-06,
|
1871 |
+
"loss": 0.0133,
|
1872 |
+
"step": 12100
|
1873 |
+
},
|
1874 |
+
{
|
1875 |
+
"epoch": 21.09375,
|
1876 |
+
"grad_norm": 0.008030719123780727,
|
1877 |
+
"learning_rate": 3.1263888888888893e-06,
|
1878 |
+
"loss": 0.0,
|
1879 |
+
"step": 12150
|
1880 |
+
},
|
1881 |
+
{
|
1882 |
+
"epoch": 21.180555555555557,
|
1883 |
+
"grad_norm": 0.0010351603850722313,
|
1884 |
+
"learning_rate": 3.0569444444444447e-06,
|
1885 |
+
"loss": 0.0025,
|
1886 |
+
"step": 12200
|
1887 |
+
},
|
1888 |
+
{
|
1889 |
+
"epoch": 21.26736111111111,
|
1890 |
+
"grad_norm": 0.004166141152381897,
|
1891 |
+
"learning_rate": 2.9875e-06,
|
1892 |
+
"loss": 0.0023,
|
1893 |
+
"step": 12250
|
1894 |
+
},
|
1895 |
+
{
|
1896 |
+
"epoch": 21.354166666666668,
|
1897 |
+
"grad_norm": 0.0021667364053428173,
|
1898 |
+
"learning_rate": 2.9180555555555557e-06,
|
1899 |
+
"loss": 0.005,
|
1900 |
+
"step": 12300
|
1901 |
+
},
|
1902 |
+
{
|
1903 |
+
"epoch": 21.44097222222222,
|
1904 |
+
"grad_norm": 0.005213022232055664,
|
1905 |
+
"learning_rate": 2.8486111111111116e-06,
|
1906 |
+
"loss": 0.0001,
|
1907 |
+
"step": 12350
|
1908 |
+
},
|
1909 |
+
{
|
1910 |
+
"epoch": 21.52777777777778,
|
1911 |
+
"grad_norm": 0.0016998907085508108,
|
1912 |
+
"learning_rate": 2.7791666666666666e-06,
|
1913 |
+
"loss": 0.0,
|
1914 |
+
"step": 12400
|
1915 |
+
},
|
1916 |
+
{
|
1917 |
+
"epoch": 21.614583333333332,
|
1918 |
+
"grad_norm": 0.0011233899276703596,
|
1919 |
+
"learning_rate": 2.7097222222222225e-06,
|
1920 |
+
"loss": 0.0013,
|
1921 |
+
"step": 12450
|
1922 |
+
},
|
1923 |
+
{
|
1924 |
+
"epoch": 21.70138888888889,
|
1925 |
+
"grad_norm": 0.00541352853178978,
|
1926 |
+
"learning_rate": 2.6402777777777775e-06,
|
1927 |
+
"loss": 0.0,
|
1928 |
+
"step": 12500
|
1929 |
+
},
|
1930 |
+
{
|
1931 |
+
"epoch": 21.788194444444443,
|
1932 |
+
"grad_norm": 0.0006274741608649492,
|
1933 |
+
"learning_rate": 2.5708333333333334e-06,
|
1934 |
+
"loss": 0.0007,
|
1935 |
+
"step": 12550
|
1936 |
+
},
|
1937 |
+
{
|
1938 |
+
"epoch": 21.875,
|
1939 |
+
"grad_norm": 0.000486269302200526,
|
1940 |
+
"learning_rate": 2.5013888888888893e-06,
|
1941 |
+
"loss": 0.0,
|
1942 |
+
"step": 12600
|
1943 |
+
},
|
1944 |
+
{
|
1945 |
+
"epoch": 21.961805555555557,
|
1946 |
+
"grad_norm": 0.000523523660376668,
|
1947 |
+
"learning_rate": 2.4319444444444444e-06,
|
1948 |
+
"loss": 0.0005,
|
1949 |
+
"step": 12650
|
1950 |
+
},
|
1951 |
+
{
|
1952 |
+
"epoch": 22.0,
|
1953 |
+
"eval_loss": 0.09000910818576813,
|
1954 |
+
"eval_runtime": 2.1668,
|
1955 |
+
"eval_samples_per_second": 472.584,
|
1956 |
+
"eval_steps_per_second": 29.537,
|
1957 |
+
"step": 12672
|
1958 |
+
},
|
1959 |
+
{
|
1960 |
+
"epoch": 22.04861111111111,
|
1961 |
+
"grad_norm": 0.0002973914088215679,
|
1962 |
+
"learning_rate": 2.3625000000000003e-06,
|
1963 |
+
"loss": 0.0,
|
1964 |
+
"step": 12700
|
1965 |
+
},
|
1966 |
+
{
|
1967 |
+
"epoch": 22.135416666666668,
|
1968 |
+
"grad_norm": 0.00037905474891886115,
|
1969 |
+
"learning_rate": 2.2930555555555557e-06,
|
1970 |
+
"loss": 0.0003,
|
1971 |
+
"step": 12750
|
1972 |
+
},
|
1973 |
+
{
|
1974 |
+
"epoch": 22.22222222222222,
|
1975 |
+
"grad_norm": 0.0003634969179984182,
|
1976 |
+
"learning_rate": 2.223611111111111e-06,
|
1977 |
+
"loss": 0.0,
|
1978 |
+
"step": 12800
|
1979 |
+
},
|
1980 |
+
{
|
1981 |
+
"epoch": 22.30902777777778,
|
1982 |
+
"grad_norm": 0.0003171905700583011,
|
1983 |
+
"learning_rate": 2.154166666666667e-06,
|
1984 |
+
"loss": 0.0,
|
1985 |
+
"step": 12850
|
1986 |
+
},
|
1987 |
+
{
|
1988 |
+
"epoch": 22.395833333333332,
|
1989 |
+
"grad_norm": 0.0005504607688635588,
|
1990 |
+
"learning_rate": 2.0847222222222225e-06,
|
1991 |
+
"loss": 0.0037,
|
1992 |
+
"step": 12900
|
1993 |
+
},
|
1994 |
+
{
|
1995 |
+
"epoch": 22.48263888888889,
|
1996 |
+
"grad_norm": 0.001255808281712234,
|
1997 |
+
"learning_rate": 2.015277777777778e-06,
|
1998 |
+
"loss": 0.0025,
|
1999 |
+
"step": 12950
|
2000 |
+
},
|
2001 |
+
{
|
2002 |
+
"epoch": 22.569444444444443,
|
2003 |
+
"grad_norm": 0.0004096803313586861,
|
2004 |
+
"learning_rate": 1.9458333333333335e-06,
|
2005 |
+
"loss": 0.0021,
|
2006 |
+
"step": 13000
|
2007 |
+
},
|
2008 |
+
{
|
2009 |
+
"epoch": 22.65625,
|
2010 |
+
"grad_norm": 0.001167616923339665,
|
2011 |
+
"learning_rate": 1.876388888888889e-06,
|
2012 |
+
"loss": 0.0,
|
2013 |
+
"step": 13050
|
2014 |
+
},
|
2015 |
+
{
|
2016 |
+
"epoch": 22.743055555555557,
|
2017 |
+
"grad_norm": 0.0003252147580496967,
|
2018 |
+
"learning_rate": 1.8069444444444444e-06,
|
2019 |
+
"loss": 0.0006,
|
2020 |
+
"step": 13100
|
2021 |
+
},
|
2022 |
+
{
|
2023 |
+
"epoch": 22.82986111111111,
|
2024 |
+
"grad_norm": 0.0006310039316304028,
|
2025 |
+
"learning_rate": 1.7375e-06,
|
2026 |
+
"loss": 0.0001,
|
2027 |
+
"step": 13150
|
2028 |
+
},
|
2029 |
+
{
|
2030 |
+
"epoch": 22.916666666666668,
|
2031 |
+
"grad_norm": 0.00035475249751470983,
|
2032 |
+
"learning_rate": 1.6680555555555558e-06,
|
2033 |
+
"loss": 0.0026,
|
2034 |
+
"step": 13200
|
2035 |
+
},
|
2036 |
+
{
|
2037 |
+
"epoch": 23.0,
|
2038 |
+
"eval_loss": 0.0978098213672638,
|
2039 |
+
"eval_runtime": 2.1724,
|
2040 |
+
"eval_samples_per_second": 471.367,
|
2041 |
+
"eval_steps_per_second": 29.46,
|
2042 |
+
"step": 13248
|
2043 |
+
},
|
2044 |
+
{
|
2045 |
+
"epoch": 23.00347222222222,
|
2046 |
+
"grad_norm": 0.0005200915038585663,
|
2047 |
+
"learning_rate": 1.5986111111111112e-06,
|
2048 |
+
"loss": 0.0,
|
2049 |
+
"step": 13250
|
2050 |
+
},
|
2051 |
+
{
|
2052 |
+
"epoch": 23.09027777777778,
|
2053 |
+
"grad_norm": 0.622466504573822,
|
2054 |
+
"learning_rate": 1.529166666666667e-06,
|
2055 |
+
"loss": 0.0003,
|
2056 |
+
"step": 13300
|
2057 |
+
},
|
2058 |
+
{
|
2059 |
+
"epoch": 23.177083333333332,
|
2060 |
+
"grad_norm": 0.0003293896734248847,
|
2061 |
+
"learning_rate": 1.4597222222222224e-06,
|
2062 |
+
"loss": 0.0024,
|
2063 |
+
"step": 13350
|
2064 |
+
},
|
2065 |
+
{
|
2066 |
+
"epoch": 23.26388888888889,
|
2067 |
+
"grad_norm": 0.00028986833058297634,
|
2068 |
+
"learning_rate": 1.3902777777777779e-06,
|
2069 |
+
"loss": 0.0,
|
2070 |
+
"step": 13400
|
2071 |
+
},
|
2072 |
+
{
|
2073 |
+
"epoch": 23.350694444444443,
|
2074 |
+
"grad_norm": 0.00039684175862930715,
|
2075 |
+
"learning_rate": 1.3208333333333333e-06,
|
2076 |
+
"loss": 0.0004,
|
2077 |
+
"step": 13450
|
2078 |
+
},
|
2079 |
+
{
|
2080 |
+
"epoch": 23.4375,
|
2081 |
+
"grad_norm": 0.0009431451908312738,
|
2082 |
+
"learning_rate": 1.251388888888889e-06,
|
2083 |
+
"loss": 0.0,
|
2084 |
+
"step": 13500
|
2085 |
+
},
|
2086 |
+
{
|
2087 |
+
"epoch": 23.524305555555557,
|
2088 |
+
"grad_norm": 0.0009446613257750869,
|
2089 |
+
"learning_rate": 1.1819444444444447e-06,
|
2090 |
+
"loss": 0.0,
|
2091 |
+
"step": 13550
|
2092 |
+
},
|
2093 |
+
{
|
2094 |
+
"epoch": 23.61111111111111,
|
2095 |
+
"grad_norm": 0.000527396856341511,
|
2096 |
+
"learning_rate": 1.1125000000000001e-06,
|
2097 |
+
"loss": 0.0,
|
2098 |
+
"step": 13600
|
2099 |
+
},
|
2100 |
+
{
|
2101 |
+
"epoch": 23.697916666666668,
|
2102 |
+
"grad_norm": 0.0008534564403817058,
|
2103 |
+
"learning_rate": 1.0430555555555556e-06,
|
2104 |
+
"loss": 0.0,
|
2105 |
+
"step": 13650
|
2106 |
+
},
|
2107 |
+
{
|
2108 |
+
"epoch": 23.78472222222222,
|
2109 |
+
"grad_norm": 0.000303189066471532,
|
2110 |
+
"learning_rate": 9.73611111111111e-07,
|
2111 |
+
"loss": 0.0,
|
2112 |
+
"step": 13700
|
2113 |
+
},
|
2114 |
+
{
|
2115 |
+
"epoch": 23.87152777777778,
|
2116 |
+
"grad_norm": 0.0006793912034481764,
|
2117 |
+
"learning_rate": 9.041666666666668e-07,
|
2118 |
+
"loss": 0.0069,
|
2119 |
+
"step": 13750
|
2120 |
+
},
|
2121 |
+
{
|
2122 |
+
"epoch": 23.958333333333332,
|
2123 |
+
"grad_norm": 0.00035732006654143333,
|
2124 |
+
"learning_rate": 8.347222222222223e-07,
|
2125 |
+
"loss": 0.0,
|
2126 |
+
"step": 13800
|
2127 |
+
},
|
2128 |
+
{
|
2129 |
+
"epoch": 24.0,
|
2130 |
+
"eval_loss": 0.09862537682056427,
|
2131 |
+
"eval_runtime": 2.1709,
|
2132 |
+
"eval_samples_per_second": 471.693,
|
2133 |
+
"eval_steps_per_second": 29.481,
|
2134 |
+
"step": 13824
|
2135 |
+
},
|
2136 |
+
{
|
2137 |
+
"epoch": 24.04513888888889,
|
2138 |
+
"grad_norm": 0.000883078551851213,
|
2139 |
+
"learning_rate": 7.652777777777778e-07,
|
2140 |
+
"loss": 0.0,
|
2141 |
+
"step": 13850
|
2142 |
+
},
|
2143 |
+
{
|
2144 |
+
"epoch": 24.131944444444443,
|
2145 |
+
"grad_norm": 0.00040300763794220984,
|
2146 |
+
"learning_rate": 6.958333333333334e-07,
|
2147 |
+
"loss": 0.0005,
|
2148 |
+
"step": 13900
|
2149 |
+
},
|
2150 |
+
{
|
2151 |
+
"epoch": 24.21875,
|
2152 |
+
"grad_norm": 0.001412940793670714,
|
2153 |
+
"learning_rate": 6.263888888888888e-07,
|
2154 |
+
"loss": 0.0017,
|
2155 |
+
"step": 13950
|
2156 |
+
},
|
2157 |
+
{
|
2158 |
+
"epoch": 24.305555555555557,
|
2159 |
+
"grad_norm": 0.000599319173488766,
|
2160 |
+
"learning_rate": 5.569444444444444e-07,
|
2161 |
+
"loss": 0.0,
|
2162 |
+
"step": 14000
|
2163 |
+
},
|
2164 |
+
{
|
2165 |
+
"epoch": 24.39236111111111,
|
2166 |
+
"grad_norm": 0.0007638942333869636,
|
2167 |
+
"learning_rate": 4.875000000000001e-07,
|
2168 |
+
"loss": 0.0,
|
2169 |
+
"step": 14050
|
2170 |
+
},
|
2171 |
+
{
|
2172 |
+
"epoch": 24.479166666666668,
|
2173 |
+
"grad_norm": 0.00034263054840266705,
|
2174 |
+
"learning_rate": 4.1805555555555556e-07,
|
2175 |
+
"loss": 0.0009,
|
2176 |
+
"step": 14100
|
2177 |
+
},
|
2178 |
+
{
|
2179 |
+
"epoch": 24.56597222222222,
|
2180 |
+
"grad_norm": 0.0004652358475141227,
|
2181 |
+
"learning_rate": 3.4861111111111114e-07,
|
2182 |
+
"loss": 0.0015,
|
2183 |
+
"step": 14150
|
2184 |
+
},
|
2185 |
+
{
|
2186 |
+
"epoch": 24.65277777777778,
|
2187 |
+
"grad_norm": 0.00027831687475554645,
|
2188 |
+
"learning_rate": 2.7916666666666666e-07,
|
2189 |
+
"loss": 0.0,
|
2190 |
+
"step": 14200
|
2191 |
+
},
|
2192 |
+
{
|
2193 |
+
"epoch": 24.739583333333332,
|
2194 |
+
"grad_norm": 0.0002827314310707152,
|
2195 |
+
"learning_rate": 2.0972222222222223e-07,
|
2196 |
+
"loss": 0.0015,
|
2197 |
+
"step": 14250
|
2198 |
+
},
|
2199 |
+
{
|
2200 |
+
"epoch": 24.82638888888889,
|
2201 |
+
"grad_norm": 0.00031116604804992676,
|
2202 |
+
"learning_rate": 1.4027777777777778e-07,
|
2203 |
+
"loss": 0.0,
|
2204 |
+
"step": 14300
|
2205 |
+
},
|
2206 |
+
{
|
2207 |
+
"epoch": 24.913194444444443,
|
2208 |
+
"grad_norm": 0.00027885418967343867,
|
2209 |
+
"learning_rate": 7.083333333333334e-08,
|
2210 |
+
"loss": 0.0018,
|
2211 |
+
"step": 14350
|
2212 |
+
},
|
2213 |
+
{
|
2214 |
+
"epoch": 25.0,
|
2215 |
+
"grad_norm": 0.0005058420938439667,
|
2216 |
+
"learning_rate": 1.388888888888889e-09,
|
2217 |
+
"loss": 0.0005,
|
2218 |
+
"step": 14400
|
2219 |
+
},
|
2220 |
+
{
|
2221 |
+
"epoch": 25.0,
|
2222 |
+
"eval_loss": 0.09845860302448273,
|
2223 |
+
"eval_runtime": 2.17,
|
2224 |
+
"eval_samples_per_second": 471.882,
|
2225 |
+
"eval_steps_per_second": 29.493,
|
2226 |
+
"step": 14400
|
2227 |
+
}
|
2228 |
+
],
|
2229 |
+
"logging_steps": 50,
|
2230 |
+
"max_steps": 14400,
|
2231 |
+
"num_input_tokens_seen": 0,
|
2232 |
+
"num_train_epochs": 25,
|
2233 |
+
"save_steps": 500,
|
2234 |
+
"stateful_callbacks": {
|
2235 |
+
"TrainerControl": {
|
2236 |
+
"args": {
|
2237 |
+
"should_epoch_stop": false,
|
2238 |
+
"should_evaluate": false,
|
2239 |
+
"should_log": false,
|
2240 |
+
"should_save": true,
|
2241 |
+
"should_training_stop": true
|
2242 |
+
},
|
2243 |
+
"attributes": {}
|
2244 |
+
}
|
2245 |
+
},
|
2246 |
+
"total_flos": 1377061586400000.0,
|
2247 |
+
"train_batch_size": 16,
|
2248 |
+
"trial_name": null,
|
2249 |
+
"trial_params": null
|
2250 |
+
}
|
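Note: the JSON above is the tail of the trainer_state.json "log_history" array, which interleaves training records (loss, grad_norm, learning_rate every 50 steps, matching "logging_steps": 50) with one evaluation record per epoch. Below is a minimal sketch, using only the Python standard library, of how one might pull the per-epoch eval_loss curve back out of this file; the file name and keys are exactly as uploaded above.

    import json

    # Load the trainer state uploaded in this commit.
    with open("trainer_state.json") as f:
        state = json.load(f)

    # Evaluation records are the log_history entries that carry "eval_loss";
    # ordinary training records carry "loss"/"grad_norm"/"learning_rate" instead.
    evals = [e for e in state["log_history"] if "eval_loss" in e]

    for e in evals:
        print(f"epoch {e['epoch']:5.1f}  step {e['step']:5d}  eval_loss {e['eval_loss']:.4f}")

    # Among the epochs shown in this tail, the minimum is at epoch 16
    # (step 9216, eval_loss ~0.0572); later epochs drift slightly upward.
    best = min(evals, key=lambda e: e["eval_loss"])
    print("best:", best["epoch"], best["step"], best["eval_loss"])

A listing like this makes it easy to see that the reported eval loss bottoms out mid-run rather than at the final step 14400.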
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baa56bfd358c76bf1a059c72b384e27f54f1a9ce98cb0d06dfec03d4a3cfa18b
+size 5649
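Unlike trainer_state.json, training_args.bin is not human-readable: it is the torch-serialized TrainingArguments object for the run logged above. A small sketch, assuming torch and a transformers version close to the training environment are installed, of how one might inspect it:

    import torch

    # TrainingArguments is stored as a pickle, so on recent torch versions
    # weights_only loading must be disabled; this requires trusting the file.
    args = torch.load("training_args.bin", weights_only=False)

    # A few fields that should line up with trainer_state.json
    # (25 epochs, per-device batch size 16).
    print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)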