diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4934 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9017713365539453,
+  "eval_steps": 500,
+  "global_step": 700,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0012882447665056361,
+      "grad_norm": 5.764990329742432,
+      "learning_rate": 0.0,
+      "loss": 10.1729,
+      "step": 1
+    },
+    {
+      "epoch": 0.0025764895330112722,
+      "grad_norm": 5.699520111083984,
+      "learning_rate": 4e-05,
+      "loss": 9.9457,
+      "step": 2
+    },
+    {
+      "epoch": 0.003864734299516908,
+      "grad_norm": 5.760854721069336,
+      "learning_rate": 8e-05,
+      "loss": 9.7561,
+      "step": 3
+    },
+    {
+      "epoch": 0.0051529790660225444,
+      "grad_norm": 6.234244346618652,
+      "learning_rate": 0.00012,
+      "loss": 9.0043,
+      "step": 4
+    },
+    {
+      "epoch": 0.00644122383252818,
+      "grad_norm": 6.719285011291504,
+      "learning_rate": 0.00016,
+      "loss": 7.4539,
+      "step": 5
+    },
+    {
+      "epoch": 0.007729468599033816,
+      "grad_norm": 5.1464948654174805,
+      "learning_rate": 0.0002,
+      "loss": 5.7333,
+      "step": 6
+    },
+    {
+      "epoch": 0.009017713365539453,
+      "grad_norm": 3.3606348037719727,
+      "learning_rate": 0.00019974059662775616,
+      "loss": 3.8319,
+      "step": 7
+    },
+    {
+      "epoch": 0.010305958132045089,
+      "grad_norm": 2.361740827560425,
+      "learning_rate": 0.00019948119325551234,
+      "loss": 3.3392,
+      "step": 8
+    },
+    {
+      "epoch": 0.011594202898550725,
+      "grad_norm": 2.378281831741333,
+      "learning_rate": 0.0001992217898832685,
+      "loss": 2.8643,
+      "step": 9
+    },
+    {
+      "epoch": 0.01288244766505636,
+      "grad_norm": 1.9084206819534302,
+      "learning_rate": 0.00019896238651102467,
+      "loss": 2.4692,
+      "step": 10
+    },
+    {
+      "epoch": 0.014170692431561997,
+      "grad_norm": 2.3616507053375244,
+      "learning_rate": 0.00019870298313878082,
+      "loss": 2.2057,
+      "step": 11
+    },
+    {
+      "epoch": 0.015458937198067632,
+      "grad_norm": 2.7130489349365234,
+      "learning_rate": 0.00019844357976653697,
+      "loss": 1.8781,
+      "step": 12
+    },
+    {
+      "epoch": 0.01674718196457327,
+      "grad_norm": 5.479770183563232,
+      "learning_rate": 0.00019818417639429315,
+      "loss": 1.6427,
+      "step": 13
+    },
+    {
+      "epoch": 0.018035426731078906,
+      "grad_norm": 2.0840210914611816,
+      "learning_rate": 0.0001979247730220493,
+      "loss": 1.6455,
+      "step": 14
+    },
+    {
+      "epoch": 0.01932367149758454,
+      "grad_norm": 17.294357299804688,
+      "learning_rate": 0.00019766536964980547,
+      "loss": 1.9366,
+      "step": 15
+    },
+    {
+      "epoch": 0.020611916264090178,
+      "grad_norm": 3.7959189414978027,
+      "learning_rate": 0.00019740596627756162,
+      "loss": 1.9272,
+      "step": 16
+    },
+    {
+      "epoch": 0.021900161030595812,
+      "grad_norm": 9.078225135803223,
+      "learning_rate": 0.00019714656290531778,
+      "loss": 1.8734,
+      "step": 17
+    },
+    {
+      "epoch": 0.02318840579710145,
+      "grad_norm": 2.7898125648498535,
+      "learning_rate": 0.00019688715953307395,
+      "loss": 1.8415,
+      "step": 18
+    },
+    {
+      "epoch": 0.024476650563607084,
+      "grad_norm": 5.833450794219971,
+      "learning_rate": 0.00019662775616083008,
+      "loss": 1.6641,
+      "step": 19
+    },
+    {
+      "epoch": 0.02576489533011272,
+      "grad_norm": 1.286916971206665,
+      "learning_rate": 0.00019636835278858625,
+      "loss": 1.6488,
+      "step": 20
+    },
+    {
+      "epoch": 0.02705314009661836,
+      "grad_norm": 1.4083938598632812,
+      "learning_rate": 0.0001961089494163424,
+      "loss": 1.6369,
+      "step": 21
+    },
+    {
+      "epoch": 0.028341384863123993,
+      "grad_norm": 11.11021900177002,
+      "learning_rate": 0.00019584954604409858,
+      "loss": 1.0877,
+      "step": 22
+    },
+    {
+      "epoch": 0.02962962962962963,
+      "grad_norm": 4.023814678192139,
+      "learning_rate": 0.00019559014267185473,
+      "loss": 1.172,
+      "step": 23
+    },
+    {
+      "epoch": 0.030917874396135265,
+      "grad_norm": 1.5380833148956299,
+      "learning_rate": 0.0001953307392996109,
+      "loss": 1.2489,
+      "step": 24
+    },
+    {
+      "epoch": 0.0322061191626409,
+      "grad_norm": 3.5287179946899414,
+      "learning_rate": 0.00019507133592736706,
+      "loss": 1.059,
+      "step": 25
+    },
+    {
+      "epoch": 0.03349436392914654,
+      "grad_norm": 0.4443202316761017,
+      "learning_rate": 0.0001948119325551232,
+      "loss": 1.0324,
+      "step": 26
+    },
+    {
+      "epoch": 0.034782608695652174,
+      "grad_norm": 4.4658098220825195,
+      "learning_rate": 0.0001945525291828794,
+      "loss": 0.9281,
+      "step": 27
+    },
+    {
+      "epoch": 0.03607085346215781,
+      "grad_norm": 0.6924022436141968,
+      "learning_rate": 0.00019429312581063554,
+      "loss": 0.7725,
+      "step": 28
+    },
+    {
+      "epoch": 0.03735909822866344,
+      "grad_norm": 0.39130347967147827,
+      "learning_rate": 0.00019403372243839172,
+      "loss": 0.9851,
+      "step": 29
+    },
+    {
+      "epoch": 0.03864734299516908,
+      "grad_norm": 0.478762149810791,
+      "learning_rate": 0.00019377431906614787,
+      "loss": 0.9003,
+      "step": 30
+    },
+    {
+      "epoch": 0.03993558776167472,
+      "grad_norm": 0.485379695892334,
+      "learning_rate": 0.00019351491569390402,
+      "loss": 0.9038,
+      "step": 31
+    },
+    {
+      "epoch": 0.041223832528180356,
+      "grad_norm": 0.4116724729537964,
+      "learning_rate": 0.0001932555123216602,
+      "loss": 0.7782,
+      "step": 32
+    },
+    {
+      "epoch": 0.04251207729468599,
+      "grad_norm": 0.35044676065444946,
+      "learning_rate": 0.00019299610894941635,
+      "loss": 0.7145,
+      "step": 33
+    },
+    {
+      "epoch": 0.043800322061191624,
+      "grad_norm": 0.34671103954315186,
+      "learning_rate": 0.00019273670557717253,
+      "loss": 0.838,
+      "step": 34
+    },
+    {
+      "epoch": 0.04508856682769726,
+      "grad_norm": 0.3169376850128174,
+      "learning_rate": 0.00019247730220492868,
+      "loss": 0.9283,
+      "step": 35
+    },
+    {
+      "epoch": 0.0463768115942029,
+      "grad_norm": 0.3791329860687256,
+      "learning_rate": 0.00019221789883268483,
+      "loss": 0.9332,
+      "step": 36
+    },
+    {
+      "epoch": 0.04766505636070854,
+      "grad_norm": 0.39683282375335693,
+      "learning_rate": 0.000191958495460441,
+      "loss": 0.837,
+      "step": 37
+    },
+    {
+      "epoch": 0.04895330112721417,
+      "grad_norm": 0.4130147099494934,
+      "learning_rate": 0.00019169909208819716,
+      "loss": 0.688,
+      "step": 38
+    },
+    {
+      "epoch": 0.050241545893719805,
+      "grad_norm": 0.535886824131012,
+      "learning_rate": 0.00019143968871595333,
+      "loss": 0.819,
+      "step": 39
+    },
+    {
+      "epoch": 0.05152979066022544,
+      "grad_norm": 0.41564154624938965,
+      "learning_rate": 0.00019118028534370949,
+      "loss": 1.0323,
+      "step": 40
+    },
+    {
+      "epoch": 0.05281803542673108,
+      "grad_norm": 0.38580086827278137,
+      "learning_rate": 0.00019092088197146564,
+      "loss": 0.9947,
+      "step": 41
+    },
+    {
+      "epoch": 0.05410628019323672,
+      "grad_norm": 0.3614998757839203,
+      "learning_rate": 0.00019066147859922181,
+      "loss": 0.8925,
+      "step": 42
+    },
+    {
+      "epoch": 0.05539452495974235,
+      "grad_norm": 0.3364286422729492,
+      "learning_rate": 0.00019040207522697794,
+      "loss": 0.8473,
+      "step": 43
+    },
+    {
+      "epoch": 0.056682769726247986,
+      "grad_norm": 0.3541828393936157,
+      "learning_rate": 0.00019014267185473412,
+      "loss": 0.8477,
+      "step": 44
+    },
+    {
+      "epoch": 0.057971014492753624,
+      "grad_norm": 0.35495537519454956,
+      "learning_rate": 0.00018988326848249027,
+      "loss": 0.8881,
+      "step": 45
+    },
+    {
+      "epoch": 0.05925925925925926,
+      "grad_norm": 0.43733540177345276,
+      "learning_rate": 0.00018962386511024644,
+      "loss": 0.8743,
+      "step": 46
+    },
+    {
+      "epoch": 0.06054750402576489,
+      "grad_norm": 0.3078387975692749,
+      "learning_rate": 0.0001893644617380026,
+      "loss": 0.7479,
+      "step": 47
+    },
+    {
+      "epoch": 0.06183574879227053,
+      "grad_norm": 0.36661794781684875,
+      "learning_rate": 0.00018910505836575875,
+      "loss": 0.8258,
+      "step": 48
+    },
+    {
+      "epoch": 0.06312399355877617,
+      "grad_norm": 0.34701570868492126,
+      "learning_rate": 0.00018884565499351492,
+      "loss": 0.8085,
+      "step": 49
+    },
+    {
+      "epoch": 0.0644122383252818,
+      "grad_norm": 0.30905681848526,
+      "learning_rate": 0.00018858625162127107,
+      "loss": 0.7474,
+      "step": 50
+    },
+    {
+      "epoch": 0.06570048309178744,
+      "grad_norm": 0.47441986203193665,
+      "learning_rate": 0.00018832684824902725,
+      "loss": 1.0549,
+      "step": 51
+    },
+    {
+      "epoch": 0.06698872785829307,
+      "grad_norm": 0.2966022491455078,
+      "learning_rate": 0.0001880674448767834,
+      "loss": 0.8517,
+      "step": 52
+    },
+    {
+      "epoch": 0.06827697262479872,
+      "grad_norm": 0.33785632252693176,
+      "learning_rate": 0.00018780804150453958,
+      "loss": 0.8858,
+      "step": 53
+    },
+    {
+      "epoch": 0.06956521739130435,
+      "grad_norm": 0.33717742562294006,
+      "learning_rate": 0.00018754863813229573,
+      "loss": 0.7961,
+      "step": 54
+    },
+    {
+      "epoch": 0.07085346215780998,
+      "grad_norm": 0.4235801100730896,
+      "learning_rate": 0.00018728923476005188,
+      "loss": 0.9562,
+      "step": 55
+    },
+    {
+      "epoch": 0.07214170692431562,
+      "grad_norm": 0.40099507570266724,
+      "learning_rate": 0.00018702983138780806,
+      "loss": 0.6817,
+      "step": 56
+    },
+    {
+      "epoch": 0.07342995169082125,
+      "grad_norm": 0.3041292428970337,
+      "learning_rate": 0.0001867704280155642,
+      "loss": 0.6219,
+      "step": 57
+    },
+    {
+      "epoch": 0.07471819645732689,
+      "grad_norm": 0.428120493888855,
+      "learning_rate": 0.0001865110246433204,
+      "loss": 0.7911,
+      "step": 58
+    },
+    {
+      "epoch": 0.07600644122383253,
+      "grad_norm": 0.39466729760169983,
+      "learning_rate": 0.00018625162127107654,
+      "loss": 0.7883,
+      "step": 59
+    },
+    {
+      "epoch": 0.07729468599033816,
+      "grad_norm": 0.3272225856781006,
+      "learning_rate": 0.0001859922178988327,
+      "loss": 0.736,
+      "step": 60
+    },
+    {
+      "epoch": 0.0785829307568438,
+      "grad_norm": 0.3868604898452759,
+      "learning_rate": 0.00018573281452658887,
+      "loss": 0.772,
+      "step": 61
+    },
+    {
+      "epoch": 0.07987117552334944,
+      "grad_norm": 0.4111652970314026,
+      "learning_rate": 0.00018547341115434502,
+      "loss": 0.7715,
+      "step": 62
+    },
+    {
+      "epoch": 0.08115942028985507,
+      "grad_norm": 0.367587149143219,
+      "learning_rate": 0.0001852140077821012,
+      "loss": 0.8887,
+      "step": 63
+    },
+    {
+      "epoch": 0.08244766505636071,
+      "grad_norm": 0.36358535289764404,
+      "learning_rate": 0.00018495460440985735,
+      "loss": 0.6756,
+      "step": 64
+    },
+    {
+      "epoch": 0.08373590982286634,
+      "grad_norm": 0.3693746030330658,
+      "learning_rate": 0.0001846952010376135,
+      "loss": 0.7451,
+      "step": 65
+    },
+    {
+      "epoch": 0.08502415458937199,
+      "grad_norm": 0.33801788091659546,
+      "learning_rate": 0.00018443579766536967,
+      "loss": 0.7722,
+      "step": 66
+    },
+    {
+      "epoch": 0.08631239935587762,
+      "grad_norm": 0.40920770168304443,
+      "learning_rate": 0.0001841763942931258,
+      "loss": 0.6399,
+      "step": 67
+    },
+    {
+      "epoch": 0.08760064412238325,
+      "grad_norm": 0.36758852005004883,
+      "learning_rate": 0.00018391699092088198,
+      "loss": 0.7686,
+      "step": 68
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.37189269065856934,
+      "learning_rate": 0.00018365758754863813,
+      "loss": 0.8437,
+      "step": 69
+    },
+    {
+      "epoch": 0.09017713365539452,
+      "grad_norm": 0.42477479577064514,
+      "learning_rate": 0.0001833981841763943,
+      "loss": 0.7571,
+      "step": 70
+    },
+    {
+      "epoch": 0.09146537842190017,
+      "grad_norm": 0.34100234508514404,
+      "learning_rate": 0.00018313878080415046,
+      "loss": 0.733,
+      "step": 71
+    },
+    {
+      "epoch": 0.0927536231884058,
+      "grad_norm": 0.42223483324050903,
+      "learning_rate": 0.0001828793774319066,
+      "loss": 0.7981,
+      "step": 72
+    },
+    {
+      "epoch": 0.09404186795491143,
+      "grad_norm": 0.40469273924827576,
+      "learning_rate": 0.00018261997405966278,
+      "loss": 0.8633,
+      "step": 73
+    },
+    {
+      "epoch": 0.09533011272141707,
+      "grad_norm": 0.35466790199279785,
+      "learning_rate": 0.00018236057068741893,
+      "loss": 0.7359,
+      "step": 74
+    },
+    {
+      "epoch": 0.0966183574879227,
+      "grad_norm": 0.3824892044067383,
+      "learning_rate": 0.0001821011673151751,
+      "loss": 0.846,
+      "step": 75
+    },
+    {
+      "epoch": 0.09790660225442833,
+      "grad_norm": 0.4101675748825073,
+      "learning_rate": 0.00018184176394293126,
+      "loss": 1.0094,
+      "step": 76
+    },
+    {
+      "epoch": 0.09919484702093398,
+      "grad_norm": 0.3373378813266754,
+      "learning_rate": 0.0001815823605706874,
+      "loss": 0.6908,
+      "step": 77
+    },
+    {
+      "epoch": 0.10048309178743961,
+      "grad_norm": 0.41473421454429626,
+      "learning_rate": 0.0001813229571984436,
+      "loss": 0.7753,
+      "step": 78
+    },
+    {
+      "epoch": 0.10177133655394525,
+      "grad_norm": 0.3552979826927185,
+      "learning_rate": 0.00018106355382619974,
+      "loss": 0.883,
+      "step": 79
+    },
+    {
+      "epoch": 0.10305958132045089,
+      "grad_norm": 0.3655754029750824,
+      "learning_rate": 0.00018080415045395592,
+      "loss": 0.7978,
+      "step": 80
+    },
+    {
+      "epoch": 0.10434782608695652,
+      "grad_norm": 0.398554265499115,
+      "learning_rate": 0.00018054474708171207,
+      "loss": 0.9426,
+      "step": 81
+    },
+    {
+      "epoch": 0.10563607085346216,
+      "grad_norm": 0.4098765552043915,
+      "learning_rate": 0.00018028534370946825,
+      "loss": 0.7127,
+      "step": 82
+    },
+    {
+      "epoch": 0.10692431561996779,
+      "grad_norm": 0.38591381907463074,
+      "learning_rate": 0.0001800259403372244,
+      "loss": 0.7994,
+      "step": 83
+    },
+    {
+      "epoch": 0.10821256038647344,
+      "grad_norm": 0.42177343368530273,
+      "learning_rate": 0.00017976653696498055,
+      "loss": 0.8032,
+      "step": 84
+    },
+    {
+      "epoch": 0.10950080515297907,
+      "grad_norm": 0.38358885049819946,
+      "learning_rate": 0.00017950713359273673,
+      "loss": 0.8478,
+      "step": 85
+    },
+    {
+      "epoch": 0.1107890499194847,
+      "grad_norm": 0.4549978971481323,
+      "learning_rate": 0.00017924773022049288,
+      "loss": 0.813,
+      "step": 86
+    },
+    {
+      "epoch": 0.11207729468599034,
+      "grad_norm": 0.4372895359992981,
+      "learning_rate": 0.00017898832684824906,
+      "loss": 0.8797,
+      "step": 87
+    },
+    {
+      "epoch": 0.11336553945249597,
+      "grad_norm": 0.4454326033592224,
+      "learning_rate": 0.0001787289234760052,
+      "loss": 0.8554,
+      "step": 88
+    },
+    {
+      "epoch": 0.11465378421900162,
+      "grad_norm": 0.3808746933937073,
+      "learning_rate": 0.00017846952010376136,
+      "loss": 0.5919,
+      "step": 89
+    },
+    {
+      "epoch": 0.11594202898550725,
+      "grad_norm": 0.4146284759044647,
+      "learning_rate": 0.00017821011673151754,
+      "loss": 0.8823,
+      "step": 90
+    },
+    {
+      "epoch": 0.11723027375201288,
+      "grad_norm": 0.47205957770347595,
+      "learning_rate": 0.00017795071335927366,
+      "loss": 0.6284,
+      "step": 91
+    },
+    {
+      "epoch": 0.11851851851851852,
+      "grad_norm": 0.4155535101890564,
+      "learning_rate": 0.00017769130998702984,
+      "loss": 0.8945,
+      "step": 92
+    },
+    {
+      "epoch": 0.11980676328502415,
+      "grad_norm": 0.4152592420578003,
+      "learning_rate": 0.000177431906614786,
+      "loss": 0.818,
+      "step": 93
+    },
+    {
+      "epoch": 0.12109500805152978,
+      "grad_norm": 0.4558146297931671,
+      "learning_rate": 0.00017717250324254217,
+      "loss": 0.651,
+      "step": 94
+    },
+    {
+      "epoch": 0.12238325281803543,
+      "grad_norm": 0.4004950523376465,
+      "learning_rate": 0.00017691309987029832,
+      "loss": 0.7546,
+      "step": 95
+    },
+    {
+      "epoch": 0.12367149758454106,
+      "grad_norm": 0.35895851254463196,
+      "learning_rate": 0.00017665369649805447,
+      "loss": 0.6174,
+      "step": 96
+    },
+    {
+      "epoch": 0.1249597423510467,
+      "grad_norm": 0.4626515209674835,
+      "learning_rate": 0.00017639429312581064,
+      "loss": 0.716,
+      "step": 97
+    },
+    {
+      "epoch": 0.12624798711755233,
+      "grad_norm": 0.47447800636291504,
+      "learning_rate": 0.0001761348897535668,
+      "loss": 0.9699,
+      "step": 98
+    },
+    {
+      "epoch": 0.12753623188405797,
+      "grad_norm": 0.4361920654773712,
+      "learning_rate": 0.00017587548638132297,
+      "loss": 0.9217,
+      "step": 99
+    },
+    {
+      "epoch": 0.1288244766505636,
+      "grad_norm": 0.42450228333473206,
+      "learning_rate": 0.00017561608300907912,
+      "loss": 0.6938,
+      "step": 100
+    },
+    {
+      "epoch": 0.13011272141706925,
+      "grad_norm": 0.4310356080532074,
+      "learning_rate": 0.00017535667963683527,
+      "loss": 0.7263,
+      "step": 101
+    },
+    {
+      "epoch": 0.13140096618357489,
+      "grad_norm": 0.5808001756668091,
+      "learning_rate": 0.00017509727626459145,
+      "loss": 0.9891,
+      "step": 102
+    },
+    {
+      "epoch": 0.13268921095008052,
+      "grad_norm": 0.49347755312919617,
+      "learning_rate": 0.0001748378728923476,
+      "loss": 0.7918,
+      "step": 103
+    },
+    {
+      "epoch": 0.13397745571658615,
+      "grad_norm": 0.42868706583976746,
+      "learning_rate": 0.00017457846952010378,
+      "loss": 0.7067,
+      "step": 104
+    },
+    {
+      "epoch": 0.13526570048309178,
+      "grad_norm": 0.4322398900985718,
+      "learning_rate": 0.00017431906614785993,
+      "loss": 0.6705,
+      "step": 105
+    },
+    {
+      "epoch": 0.13655394524959744,
+      "grad_norm": 0.41033244132995605,
+      "learning_rate": 0.00017405966277561608,
+      "loss": 0.6878,
+      "step": 106
+    },
+    {
+      "epoch": 0.13784219001610307,
+      "grad_norm": 0.536390483379364,
+      "learning_rate": 0.00017380025940337226,
+      "loss": 0.6961,
+      "step": 107
+    },
+    {
+      "epoch": 0.1391304347826087,
+      "grad_norm": 0.4299734830856323,
+      "learning_rate": 0.0001735408560311284,
+      "loss": 0.7065,
+      "step": 108
+    },
+    {
+      "epoch": 0.14041867954911433,
+      "grad_norm": 0.4070943593978882,
+      "learning_rate": 0.0001732814526588846,
+      "loss": 0.6975,
+      "step": 109
+    },
+    {
+      "epoch": 0.14170692431561996,
+      "grad_norm": 0.46637794375419617,
+      "learning_rate": 0.00017302204928664074,
+      "loss": 0.7583,
+      "step": 110
+    },
+    {
+      "epoch": 0.14299516908212562,
+      "grad_norm": 0.38566964864730835,
+      "learning_rate": 0.00017276264591439692,
+      "loss": 0.7765,
+      "step": 111
+    },
+    {
+      "epoch": 0.14428341384863125,
+      "grad_norm": 0.38054248690605164,
+      "learning_rate": 0.00017250324254215307,
+      "loss": 0.6291,
+      "step": 112
+    },
+    {
+      "epoch": 0.14557165861513688,
+      "grad_norm": 0.5447641015052795,
+      "learning_rate": 0.00017224383916990922,
+      "loss": 1.0189,
+      "step": 113
+    },
+    {
+      "epoch": 0.1468599033816425,
+      "grad_norm": 0.4753653109073639,
+      "learning_rate": 0.0001719844357976654,
+      "loss": 0.83,
+      "step": 114
+    },
+    {
+      "epoch": 0.14814814814814814,
+      "grad_norm": 0.4890337288379669,
+      "learning_rate": 0.00017172503242542152,
+      "loss": 0.8586,
+      "step": 115
+    },
+    {
+      "epoch": 0.14943639291465377,
+      "grad_norm": 0.42376580834388733,
+      "learning_rate": 0.0001714656290531777,
+      "loss": 0.805,
+      "step": 116
+    },
+    {
+      "epoch": 0.15072463768115943,
+      "grad_norm": 0.4509013295173645,
+      "learning_rate": 0.00017120622568093385,
+      "loss": 0.8794,
+      "step": 117
+    },
+    {
+      "epoch": 0.15201288244766506,
+      "grad_norm": 0.4562942385673523,
+      "learning_rate": 0.00017094682230869003,
+      "loss": 0.6392,
+      "step": 118
+    },
+    {
+      "epoch": 0.1533011272141707,
+      "grad_norm": 0.48996442556381226,
+      "learning_rate": 0.00017068741893644618,
+      "loss": 0.7655,
+      "step": 119
+    },
+    {
+      "epoch": 0.15458937198067632,
+      "grad_norm": 0.5451317429542542,
+      "learning_rate": 0.00017042801556420233,
+      "loss": 0.796,
+      "step": 120
+    },
+    {
+      "epoch": 0.15587761674718195,
+      "grad_norm": 0.45719748735427856,
+      "learning_rate": 0.0001701686121919585,
+      "loss": 0.7704,
+      "step": 121
+    },
+    {
+      "epoch": 0.1571658615136876,
+      "grad_norm": 0.5048899054527283,
+      "learning_rate": 0.00016990920881971466,
+      "loss": 0.7396,
+      "step": 122
+    },
+    {
+      "epoch": 0.15845410628019324,
+      "grad_norm": 0.4184553921222687,
+      "learning_rate": 0.00016964980544747083,
+      "loss": 0.9926,
+      "step": 123
+    },
+    {
+      "epoch": 0.15974235104669887,
+      "grad_norm": 0.4456348717212677,
+      "learning_rate": 0.00016939040207522698,
+      "loss": 0.7654,
+      "step": 124
+    },
+    {
+      "epoch": 0.1610305958132045,
+      "grad_norm": 0.4423070251941681,
+      "learning_rate": 0.00016913099870298313,
+      "loss": 0.7832,
+      "step": 125
+    },
+    {
+      "epoch": 0.16231884057971013,
+      "grad_norm": 0.5408623218536377,
+      "learning_rate": 0.0001688715953307393,
+      "loss": 0.9074,
+      "step": 126
+    },
+    {
+      "epoch": 0.1636070853462158,
+      "grad_norm": 0.5411691665649414,
+      "learning_rate": 0.00016861219195849546,
+      "loss": 0.9271,
+      "step": 127
+    },
+    {
+      "epoch": 0.16489533011272142,
+      "grad_norm": 0.41004684567451477,
+      "learning_rate": 0.00016835278858625164,
+      "loss": 0.5686,
+      "step": 128
+    },
+    {
+      "epoch": 0.16618357487922705,
+      "grad_norm": 0.43191105127334595,
+      "learning_rate": 0.0001680933852140078,
+      "loss": 0.7841,
+      "step": 129
+    },
+    {
+      "epoch": 0.16747181964573268,
+      "grad_norm": 0.46590304374694824,
+      "learning_rate": 0.00016783398184176394,
+      "loss": 0.6283,
+      "step": 130
+    },
+    {
+      "epoch": 0.16876006441223831,
+      "grad_norm": 0.4356256425380707,
+      "learning_rate": 0.00016757457846952012,
+      "loss": 0.5977,
+      "step": 131
+    },
+    {
+      "epoch": 0.17004830917874397,
+      "grad_norm": 0.44105201959609985,
+      "learning_rate": 0.00016731517509727627,
+      "loss": 0.8701,
+      "step": 132
+    },
+    {
+      "epoch": 0.1713365539452496,
+      "grad_norm": 0.496669739484787,
+      "learning_rate": 0.00016705577172503245,
+      "loss": 0.8613,
+      "step": 133
+    },
+    {
+      "epoch": 0.17262479871175523,
+      "grad_norm": 0.41839754581451416,
+      "learning_rate": 0.0001667963683527886,
+      "loss": 0.5693,
+      "step": 134
+    },
+    {
+      "epoch": 0.17391304347826086,
+      "grad_norm": 0.42133820056915283,
+      "learning_rate": 0.00016653696498054475,
+      "loss": 0.7622,
+      "step": 135
+    },
+    {
+      "epoch": 0.1752012882447665,
+      "grad_norm": 0.45265620946884155,
+      "learning_rate": 0.00016627756160830093,
+      "loss": 0.8501,
+      "step": 136
+    },
+    {
+      "epoch": 0.17648953301127215,
+      "grad_norm": 0.45904725790023804,
+      "learning_rate": 0.00016601815823605708,
+      "loss": 0.9041,
+      "step": 137
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.42884427309036255,
+      "learning_rate": 0.00016575875486381326,
+      "loss": 0.7386,
+      "step": 138
+    },
+    {
+      "epoch": 0.17906602254428342,
+      "grad_norm": 0.44365760684013367,
+      "learning_rate": 0.00016549935149156938,
+      "loss": 0.7196,
+      "step": 139
+    },
+    {
+      "epoch": 0.18035426731078905,
+      "grad_norm": 0.38908517360687256,
+      "learning_rate": 0.00016523994811932556,
+      "loss": 0.6251,
+      "step": 140
+    },
+    {
+      "epoch": 0.18164251207729468,
+      "grad_norm": 0.3956596255302429,
+      "learning_rate": 0.0001649805447470817,
+      "loss": 0.6839,
+      "step": 141
+    },
+    {
+      "epoch": 0.18293075684380034,
+      "grad_norm": 0.46725159883499146,
+      "learning_rate": 0.00016472114137483789,
+      "loss": 0.7637,
+      "step": 142
+    },
+    {
+      "epoch": 0.18421900161030597,
+      "grad_norm": 0.4984063506126404,
+      "learning_rate": 0.00016446173800259404,
+      "loss": 0.818,
+      "step": 143
+    },
+    {
+      "epoch": 0.1855072463768116,
+      "grad_norm": 0.40556883811950684,
+      "learning_rate": 0.0001642023346303502,
+      "loss": 0.5996,
+      "step": 144
+    },
+    {
+      "epoch": 0.18679549114331723,
+      "grad_norm": 0.4421241581439972,
+      "learning_rate": 0.00016394293125810637,
+      "loss": 0.8339,
+      "step": 145
+    },
+    {
+      "epoch": 0.18808373590982286,
+      "grad_norm": 0.4321085512638092,
+      "learning_rate": 0.00016368352788586252,
+      "loss": 0.8028,
+      "step": 146
+    },
+    {
+      "epoch": 0.18937198067632852,
+      "grad_norm": 0.4498562514781952,
+      "learning_rate": 0.0001634241245136187,
+      "loss": 0.8976,
+      "step": 147
+    },
+    {
+      "epoch": 0.19066022544283415,
+      "grad_norm": 0.45957380533218384,
+      "learning_rate": 0.00016316472114137484,
+      "loss": 0.9271,
+      "step": 148
+    },
+    {
+      "epoch": 0.19194847020933978,
+      "grad_norm": 0.4764615595340729,
+      "learning_rate": 0.000162905317769131,
+      "loss": 0.9667,
+      "step": 149
+    },
+    {
+      "epoch": 0.1932367149758454,
+      "grad_norm": 0.4241081774234772,
+      "learning_rate": 0.00016264591439688717,
+      "loss": 0.6945,
+      "step": 150
+    },
+    {
+      "epoch": 0.19452495974235104,
+      "grad_norm": 0.5130481123924255,
+      "learning_rate": 0.00016238651102464332,
+      "loss": 0.7224,
+      "step": 151
+    },
+    {
+      "epoch": 0.19581320450885667,
+      "grad_norm": 0.4727570116519928,
+      "learning_rate": 0.0001621271076523995,
+      "loss": 0.7714,
+      "step": 152
+    },
+    {
+      "epoch": 0.19710144927536233,
+      "grad_norm": 0.5420963764190674,
+      "learning_rate": 0.00016186770428015565,
+      "loss": 0.9346,
+      "step": 153
+    },
+    {
+      "epoch": 0.19838969404186796,
+      "grad_norm": 0.4338800013065338,
+      "learning_rate": 0.0001616083009079118,
+      "loss": 0.8527,
+      "step": 154
+    },
+    {
+      "epoch": 0.1996779388083736,
+      "grad_norm": 0.45830976963043213,
+      "learning_rate": 0.00016134889753566798,
+      "loss": 0.8171,
+      "step": 155
+    },
+    {
+      "epoch": 0.20096618357487922,
+      "grad_norm": 0.48107942938804626,
+      "learning_rate": 0.00016108949416342413,
+      "loss": 0.8537,
+      "step": 156
+    },
+    {
+      "epoch": 0.20225442834138485,
+      "grad_norm": 0.4447987973690033,
+      "learning_rate": 0.0001608300907911803,
+      "loss": 0.7909,
+      "step": 157
+    },
+    {
+      "epoch": 0.2035426731078905,
+      "grad_norm": 0.4311445653438568,
+      "learning_rate": 0.00016057068741893646,
+      "loss": 0.7733,
+      "step": 158
+    },
+    {
+      "epoch": 0.20483091787439614,
+      "grad_norm": 0.5173223614692688,
+      "learning_rate": 0.0001603112840466926,
+      "loss": 0.9312,
+      "step": 159
+    },
+    {
+      "epoch": 0.20611916264090177,
+      "grad_norm": 0.5143957734107971,
+      "learning_rate": 0.0001600518806744488,
+      "loss": 0.804,
+      "step": 160
+    },
+    {
+      "epoch": 0.2074074074074074,
+      "grad_norm": 0.4494340121746063,
+      "learning_rate": 0.00015979247730220494,
+      "loss": 0.7741,
+      "step": 161
+    },
+    {
+      "epoch": 0.20869565217391303,
+      "grad_norm": 0.5051131248474121,
+      "learning_rate": 0.00015953307392996112,
+      "loss": 0.8216,
+      "step": 162
+    },
+    {
+      "epoch": 0.2099838969404187,
+      "grad_norm": 0.48853760957717896,
+      "learning_rate": 0.00015927367055771724,
+      "loss": 0.7958,
+      "step": 163
+    },
+    {
+      "epoch": 0.21127214170692432,
+      "grad_norm": 0.4491981863975525,
+      "learning_rate": 0.00015901426718547342,
+      "loss": 0.8012,
+      "step": 164
+    },
+    {
+      "epoch": 0.21256038647342995,
+      "grad_norm": 0.41452592611312866,
+      "learning_rate": 0.00015875486381322957,
+      "loss": 0.6653,
+      "step": 165
+    },
+    {
+      "epoch": 0.21384863123993558,
+      "grad_norm": 0.4610249400138855,
+      "learning_rate": 0.00015849546044098572,
+      "loss": 0.7559,
+      "step": 166
+    },
+    {
+      "epoch": 0.2151368760064412,
+      "grad_norm": 0.46895861625671387,
+      "learning_rate": 0.0001582360570687419,
+      "loss": 0.7393,
+      "step": 167
+    },
+    {
+      "epoch": 0.21642512077294687,
+      "grad_norm": 0.44812971353530884,
+      "learning_rate": 0.00015797665369649805,
+      "loss": 0.7904,
+      "step": 168
+    },
+    {
+      "epoch": 0.2177133655394525,
+      "grad_norm": 0.4483109712600708,
+      "learning_rate": 0.00015771725032425423,
+      "loss": 0.7503,
+      "step": 169
+    },
+    {
+      "epoch": 0.21900161030595813,
+      "grad_norm": 0.4433995485305786,
+      "learning_rate": 0.00015745784695201038,
+      "loss": 0.7791,
+      "step": 170
+    },
+    {
+      "epoch": 0.22028985507246376,
+      "grad_norm": 0.5305430889129639,
+      "learning_rate": 0.00015719844357976655,
+      "loss": 0.9014,
+      "step": 171
+    },
+    {
+      "epoch": 0.2215780998389694,
+      "grad_norm": 0.4747445285320282,
+      "learning_rate": 0.0001569390402075227,
+      "loss": 0.6446,
+      "step": 172
+    },
+    {
+      "epoch": 0.22286634460547505,
+      "grad_norm": 0.5174173712730408,
+      "learning_rate": 0.00015667963683527886,
+      "loss": 0.6938,
+      "step": 173
+    },
+    {
+      "epoch": 0.22415458937198068,
+      "grad_norm": 0.5461775660514832,
+      "learning_rate": 0.00015642023346303503,
+      "loss": 0.9596,
+      "step": 174
+    },
+    {
+      "epoch": 0.22544283413848631,
+      "grad_norm": 0.5394182205200195,
+      "learning_rate": 0.00015616083009079118,
+      "loss": 0.8632,
+      "step": 175
+    },
+    {
+      "epoch": 0.22673107890499195,
+      "grad_norm": 0.4866770803928375,
+      "learning_rate": 0.00015590142671854736,
+      "loss": 0.8799,
+      "step": 176
+    },
+    {
+      "epoch": 0.22801932367149758,
+      "grad_norm": 0.4386501908302307,
+      "learning_rate": 0.0001556420233463035,
+      "loss": 0.7341,
+      "step": 177
+    },
+    {
+      "epoch": 0.22930756843800323,
+      "grad_norm": 0.5443551540374756,
+      "learning_rate": 0.00015538261997405966,
+      "loss": 0.771,
+      "step": 178
+    },
+    {
+      "epoch": 0.23059581320450886,
+      "grad_norm": 0.45818325877189636,
+      "learning_rate": 0.00015512321660181584,
+      "loss": 0.8682,
+      "step": 179
+    },
+    {
+      "epoch": 0.2318840579710145,
+      "grad_norm": 0.501369297504425,
+      "learning_rate": 0.000154863813229572,
+      "loss": 0.7586,
+      "step": 180
+    },
+    {
+      "epoch": 0.23317230273752013,
+      "grad_norm": 0.4658907651901245,
+      "learning_rate": 0.00015460440985732817,
+      "loss": 0.6609,
+      "step": 181
+    },
+    {
+      "epoch": 0.23446054750402576,
+      "grad_norm": 0.4543883800506592,
+      "learning_rate": 0.00015434500648508432,
+      "loss": 0.5404,
+      "step": 182
+    },
+    {
+      "epoch": 0.2357487922705314,
+      "grad_norm": 0.4215242862701416,
+      "learning_rate": 0.00015408560311284047,
+      "loss": 0.7295,
+      "step": 183
+    },
+    {
+      "epoch": 0.23703703703703705,
+      "grad_norm": 0.4865438640117645,
+      "learning_rate": 0.00015382619974059665,
+      "loss": 0.8251,
+      "step": 184
+    },
+    {
+      "epoch": 0.23832528180354268,
+      "grad_norm": 0.4978322386741638,
+      "learning_rate": 0.0001535667963683528,
+      "loss": 0.9334,
+      "step": 185
+    },
+    {
+      "epoch": 0.2396135265700483,
+      "grad_norm": 0.434435099363327,
+      "learning_rate": 0.00015330739299610898,
+      "loss": 0.9299,
+      "step": 186
+    },
+    {
+      "epoch": 0.24090177133655394,
+      "grad_norm": 0.5044904947280884,
+      "learning_rate": 0.0001530479896238651,
+      "loss": 0.7411,
+      "step": 187
+    },
+    {
+      "epoch": 0.24219001610305957,
+      "grad_norm": 0.4364910423755646,
+      "learning_rate": 0.00015278858625162128,
+      "loss": 0.8248,
+      "step": 188
+    },
+    {
+      "epoch": 0.24347826086956523,
+      "grad_norm": 0.46096572279930115,
+      "learning_rate": 0.00015252918287937743,
+      "loss": 0.8211,
+      "step": 189
+    },
+    {
+      "epoch": 0.24476650563607086,
+      "grad_norm": 0.4325025677680969,
+      "learning_rate": 0.00015226977950713358,
+      "loss": 0.7043,
+      "step": 190
+    },
+    {
+      "epoch": 0.2460547504025765,
+      "grad_norm": 0.4898943305015564,
+      "learning_rate": 0.00015201037613488976,
+      "loss": 0.7608,
+      "step": 191
+    },
+    {
+      "epoch": 0.24734299516908212,
+      "grad_norm": 0.47487872838974,
+      "learning_rate": 0.0001517509727626459,
+      "loss": 0.7175,
+      "step": 192
+    },
+    {
+      "epoch": 0.24863123993558775,
+      "grad_norm": 0.4339347779750824,
+      "learning_rate": 0.0001514915693904021,
+      "loss": 0.8499,
+      "step": 193
+    },
+    {
+      "epoch": 0.2499194847020934,
+      "grad_norm": 0.46825259923934937,
+      "learning_rate": 0.00015123216601815824,
+      "loss": 0.621,
+      "step": 194
+    },
+    {
+      "epoch": 0.25120772946859904,
+      "grad_norm": 0.4948033094406128,
+      "learning_rate": 0.0001509727626459144,
+      "loss": 0.6888,
+      "step": 195
+    },
+    {
+      "epoch": 0.25249597423510467,
+      "grad_norm": 0.4327951967716217,
+      "learning_rate": 0.00015071335927367057,
+      "loss": 0.6128,
+      "step": 196
+    },
+    {
+      "epoch": 0.2537842190016103,
+      "grad_norm": 0.569115161895752,
+      "learning_rate": 0.00015045395590142672,
+      "loss": 0.8251,
+      "step": 197
+    },
+    {
+      "epoch": 0.25507246376811593,
+      "grad_norm": 0.47008320689201355,
+      "learning_rate": 0.0001501945525291829,
+      "loss": 0.8214,
+      "step": 198
+    },
+    {
+      "epoch": 0.25636070853462156,
+      "grad_norm": 0.4881947636604309,
+      "learning_rate": 0.00014993514915693904,
+      "loss": 0.6731,
+      "step": 199
+    },
+    {
+      "epoch": 0.2576489533011272,
+      "grad_norm": 0.5395270586013794,
+      "learning_rate": 0.00014967574578469522,
+      "loss": 0.8095,
+      "step": 200
+    },
+    {
+      "epoch": 0.2589371980676328,
+      "grad_norm": 0.44902658462524414,
+      "learning_rate": 0.00014941634241245137,
+      "loss": 0.7042,
+      "step": 201
+    },
+    {
+      "epoch": 0.2602254428341385,
+      "grad_norm": 0.5789260268211365,
+      "learning_rate": 0.00014915693904020752,
+      "loss": 0.9071,
+      "step": 202
+    },
+    {
+      "epoch": 0.26151368760064414,
+      "grad_norm": 0.48466676473617554,
+      "learning_rate": 0.0001488975356679637,
+      "loss": 0.7318,
+      "step": 203
+    },
+    {
+      "epoch": 0.26280193236714977,
+      "grad_norm": 0.4419580101966858,
+      "learning_rate": 0.00014863813229571985,
+      "loss": 0.7128,
+      "step": 204
+    },
+    {
+      "epoch": 0.2640901771336554,
+      "grad_norm": 0.4542410969734192,
+      "learning_rate": 0.00014837872892347603,
+      "loss": 0.7075,
+      "step": 205
+    },
+    {
+      "epoch": 0.26537842190016103,
+      "grad_norm": 0.49915802478790283,
+      "learning_rate": 0.00014811932555123218,
+      "loss": 0.8091,
+      "step": 206
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.39728543162345886,
+      "learning_rate": 0.00014785992217898833,
+      "loss": 0.6258,
+      "step": 207
+    },
+    {
+      "epoch": 0.2679549114331723,
+      "grad_norm": 0.524169385433197,
+      "learning_rate": 0.0001476005188067445,
+      "loss": 0.73,
+      "step": 208
+    },
+    {
+      "epoch": 0.2692431561996779,
+      "grad_norm": 0.4486137330532074,
+      "learning_rate": 0.00014734111543450066,
+      "loss": 0.7607,
+      "step": 209
+    },
+    {
+      "epoch": 0.27053140096618356,
+      "grad_norm": 0.5274791717529297,
+      "learning_rate": 0.00014708171206225684,
+      "loss": 0.6731,
+      "step": 210
+    },
+    {
+      "epoch": 0.2718196457326892,
+      "grad_norm": 0.44794782996177673,
+      "learning_rate": 0.00014682230869001296,
+      "loss": 0.5291,
+      "step": 211
+    },
+    {
+      "epoch": 0.27310789049919487,
+      "grad_norm": 0.48657894134521484,
+      "learning_rate": 0.00014656290531776914,
+      "loss": 0.6754,
+      "step": 212
+    },
+    {
+      "epoch": 0.2743961352657005,
+      "grad_norm": 0.49806416034698486,
+      "learning_rate": 0.0001463035019455253,
+      "loss": 0.7096,
+      "step": 213
+    },
+    {
+      "epoch": 0.27568438003220613,
+      "grad_norm": 0.49381333589553833,
+      "learning_rate": 0.00014604409857328144,
+      "loss": 0.5939,
+      "step": 214
+    },
+    {
+      "epoch": 0.27697262479871176,
+      "grad_norm": 0.4638739824295044,
+      "learning_rate": 0.00014578469520103762,
+      "loss": 0.6444,
+      "step": 215
+    },
+    {
+      "epoch": 0.2782608695652174,
+      "grad_norm": 0.5256271362304688,
+      "learning_rate": 0.00014552529182879377,
+      "loss": 0.7595,
+      "step": 216
+    },
+    {
+      "epoch": 0.279549114331723,
+      "grad_norm": 0.47106048464775085,
+      "learning_rate": 0.00014526588845654995,
+      "loss": 0.6394,
+      "step": 217
+    },
+    {
+      "epoch": 0.28083735909822866,
+      "grad_norm": 0.5482437610626221,
+      "learning_rate": 0.0001450064850843061,
+      "loss": 0.7181,
+      "step": 218
+    },
+    {
+      "epoch": 0.2821256038647343,
+      "grad_norm": 0.4711976945400238,
+      "learning_rate": 0.00014474708171206225,
+      "loss": 0.7207,
+      "step": 219
+    },
+    {
+      "epoch": 0.2834138486312399,
+      "grad_norm": 0.5149180293083191,
+      "learning_rate": 0.00014448767833981843,
+      "loss": 0.8199,
+      "step": 220
+    },
+    {
+      "epoch": 0.28470209339774555,
+      "grad_norm": 0.452908992767334,
+      "learning_rate": 0.00014422827496757458,
+      "loss": 0.6987,
+      "step": 221
+    },
+    {
+      "epoch": 0.28599033816425123,
+      "grad_norm": 0.5486910343170166,
+      "learning_rate": 0.00014396887159533075,
+      "loss": 0.7726,
+      "step": 222
+    },
+    {
+      "epoch": 0.28727858293075687,
+      "grad_norm": 0.5290431380271912,
+      "learning_rate": 0.0001437094682230869,
+      "loss": 0.8298,
+      "step": 223
+    },
+    {
+      "epoch": 0.2885668276972625,
+      "grad_norm": 0.49307680130004883,
+      "learning_rate": 0.00014345006485084306,
+      "loss": 0.7525,
+      "step": 224
+    },
+    {
+      "epoch": 0.2898550724637681,
+      "grad_norm": 0.5979593396186829,
+      "learning_rate": 0.00014319066147859923,
+      "loss": 0.8451,
+      "step": 225
+    },
+    {
+      "epoch": 0.29114331723027376,
+      "grad_norm": 0.49994269013404846,
+      "learning_rate": 0.00014293125810635538,
+      "loss": 0.6975,
+      "step": 226
+    },
+    {
+      "epoch": 0.2924315619967794,
+      "grad_norm": 0.5523327589035034,
+      "learning_rate": 0.00014267185473411156,
+      "loss": 0.7264,
+      "step": 227
+    },
+    {
+      "epoch": 0.293719806763285,
+      "grad_norm": 0.5106574296951294,
+      "learning_rate": 0.0001424124513618677,
+      "loss": 0.7794,
+      "step": 228
+    },
+    {
+      "epoch": 0.29500805152979065,
+      "grad_norm": 0.458646297454834,
+      "learning_rate": 0.0001421530479896239,
+      "loss": 0.8118,
+      "step": 229
+    },
+    {
+      "epoch": 0.2962962962962963,
+      "grad_norm": 0.5162986516952515,
+      "learning_rate": 0.00014189364461738004,
+      "loss": 0.8167,
+      "step": 230
+    },
+    {
+      "epoch": 0.2975845410628019,
+      "grad_norm": 0.47405433654785156,
+      "learning_rate": 0.0001416342412451362,
+      "loss": 0.7852,
+      "step": 231
+    },
+    {
+      "epoch": 0.29887278582930754,
+      "grad_norm": 0.5881102681159973,
+      "learning_rate": 0.00014137483787289237,
+      "loss": 0.9897,
+      "step": 232
+    },
+    {
+      "epoch": 0.3001610305958132,
+      "grad_norm": 0.4673059582710266,
+      "learning_rate": 0.00014111543450064852,
+      "loss": 0.7341,
+      "step": 233
+    },
+    {
+      "epoch": 0.30144927536231886,
+      "grad_norm": 0.48171284794807434,
+      "learning_rate": 0.0001408560311284047,
+      "loss": 0.7156,
+      "step": 234
+    },
+    {
+      "epoch": 0.3027375201288245,
+      "grad_norm": 0.43746286630630493,
+      "learning_rate": 0.00014059662775616082,
+      "loss": 0.6003,
+      "step": 235
+    },
+    {
+      "epoch": 0.3040257648953301,
+      "grad_norm": 0.46966665983200073,
+      "learning_rate": 0.000140337224383917,
+      "loss": 0.718,
+      "step": 236
+    },
+    {
+      "epoch": 0.30531400966183575,
+      "grad_norm": 0.4956988990306854,
+      "learning_rate": 0.00014007782101167315,
+      "loss": 0.6542,
+      "step": 237
+    },
+    {
+      "epoch": 0.3066022544283414,
+      "grad_norm": 0.5336653590202332,
+      "learning_rate": 0.0001398184176394293,
+      "loss": 0.7719,
+      "step": 238
+    },
+    {
+      "epoch": 0.307890499194847,
+      "grad_norm": 0.510515034198761,
+      "learning_rate": 0.00013955901426718548,
+      "loss": 0.8369,
+      "step": 239
+    },
+    {
+      "epoch": 0.30917874396135264,
+      "grad_norm": 0.4901074469089508,
+      "learning_rate": 0.00013929961089494163,
+      "loss": 0.7973,
+      "step": 240
+    },
+    {
+      "epoch": 0.3104669887278583,
+      "grad_norm": 0.5074118375778198,
+      "learning_rate": 0.0001390402075226978,
+      "loss": 0.8418,
+      "step": 241
+    },
+    {
+      "epoch": 0.3117552334943639,
+      "grad_norm": 0.48613104224205017,
+      "learning_rate": 0.00013878080415045396,
+      "loss": 0.7661,
+      "step": 242
+    },
+    {
+      "epoch": 0.3130434782608696,
+      "grad_norm": 0.527791440486908,
+      "learning_rate": 0.0001385214007782101,
+      "loss": 0.6461,
+      "step": 243
+    },
+    {
+      "epoch": 0.3143317230273752,
+      "grad_norm": 0.539172887802124,
+      "learning_rate": 0.0001382619974059663,
+      "loss": 0.7588,
+      "step": 244
+    },
+    {
+      "epoch": 0.31561996779388085,
+      "grad_norm": 0.4465171694755554,
+      "learning_rate": 0.00013800259403372244,
+      "loss": 0.6897,
+      "step": 245
+    },
+    {
+      "epoch": 0.3169082125603865,
+      "grad_norm": 0.44620922207832336,
+      "learning_rate": 0.00013774319066147862,
+      "loss": 0.591,
+      "step": 246
+    },
+    {
+      "epoch": 0.3181964573268921,
+      "grad_norm": 0.44383737444877625,
+      "learning_rate": 0.00013748378728923477,
+      "loss": 0.822,
+      "step": 247
+    },
+    {
+      "epoch": 0.31948470209339774,
+      "grad_norm": 0.5062816739082336,
+      "learning_rate": 0.00013722438391699092,
+      "loss": 0.7657,
+      "step": 248
+    },
+    {
+      "epoch": 0.3207729468599034,
+      "grad_norm": 0.4794199764728546,
+      "learning_rate": 0.0001369649805447471,
+      "loss": 0.6533,
+      "step": 249
+    },
+    {
+      "epoch": 0.322061191626409,
+      "grad_norm": 0.506678581237793,
+      "learning_rate": 0.00013670557717250325,
+      "loss": 0.6881,
+      "step": 250
+    },
+    {
+      "epoch": 0.32334943639291464,
+      "grad_norm": 0.5363421440124512,
+      "learning_rate": 0.00013644617380025942,
+      "loss": 0.7263,
+      "step": 251
+    },
+    {
+      "epoch": 0.32463768115942027,
+      "grad_norm": 0.4600725769996643,
+      "learning_rate": 0.00013618677042801557,
+      "loss": 0.6522,
+      "step": 252
+    },
+    {
+      "epoch": 0.32592592592592595,
+      "grad_norm": 0.4250006377696991,
+      "learning_rate": 0.00013592736705577172,
+      "loss": 0.5492,
+      "step": 253
+    },
+    {
+      "epoch": 0.3272141706924316,
+      "grad_norm": 0.5984755158424377,
+      "learning_rate": 0.0001356679636835279,
+      "loss": 0.7152,
+      "step": 254
+    },
+    {
+      "epoch": 0.3285024154589372,
+      "grad_norm": 0.4653768241405487,
+      "learning_rate": 0.00013540856031128405,
+      "loss": 0.6651,
+      "step": 255
+    },
+    {
+      "epoch": 0.32979066022544284,
+      "grad_norm": 0.5344521403312683,
+      "learning_rate": 0.00013514915693904023,
+      "loss": 0.736,
+      "step": 256
+    },
+    {
+      "epoch": 0.3310789049919485,
+      "grad_norm": 0.469061017036438,
+      "learning_rate": 0.00013488975356679638,
+      "loss": 0.5771,
+      "step": 257
+    },
+    {
+      "epoch": 0.3323671497584541,
+      "grad_norm": 0.46232855319976807,
+      "learning_rate": 0.00013463035019455256,
+      "loss": 0.6887,
+      "step": 258
+    },
+    {
+      "epoch": 0.33365539452495974,
+      "grad_norm": 0.4812975525856018,
+      "learning_rate": 0.00013437094682230868,
+      "loss": 0.8316,
+      "step": 259
+    },
+    {
+      "epoch": 0.33494363929146537,
+      "grad_norm": 0.5068632960319519,
+      "learning_rate": 0.00013411154345006486,
+      "loss": 0.7372,
+      "step": 260
+    },
+    {
+      "epoch": 0.336231884057971,
+      "grad_norm": 0.42497095465660095,
+      "learning_rate": 0.000133852140077821,
+      "loss": 0.7469,
+      "step": 261
+    },
+    {
+      "epoch": 0.33752012882447663,
+      "grad_norm": 0.49439537525177,
+      "learning_rate": 0.00013359273670557716,
+      "loss": 0.6429,
+      "step": 262
+    },
+    {
+      "epoch": 0.33880837359098226,
+      "grad_norm": 0.4804583787918091,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.7772,
+      "step": 263
+    },
+    {
+      "epoch": 0.34009661835748795,
+      "grad_norm": 0.46911564469337463,
+      "learning_rate": 0.0001330739299610895,
+      "loss": 0.5994,
+      "step": 264
+    },
+    {
+      "epoch": 0.3413848631239936,
+      "grad_norm": 0.5286073088645935,
+      "learning_rate": 0.00013281452658884567,
+      "loss": 0.6459,
+      "step": 265
+    },
+    {
+      "epoch": 0.3426731078904992,
+      "grad_norm": 0.48704788088798523,
+      "learning_rate": 0.00013255512321660182,
+      "loss": 0.6466,
+      "step": 266
+    },
+    {
+      "epoch": 0.34396135265700484,
+      "grad_norm": 0.5040203332901001,
+      "learning_rate": 0.00013229571984435797,
+      "loss": 0.7436,
+      "step": 267
+    },
+    {
+      "epoch": 0.34524959742351047,
+      "grad_norm": 0.48882773518562317,
+      "learning_rate": 0.00013203631647211415,
+      "loss": 0.7009,
+      "step": 268
+    },
+    {
+      "epoch": 0.3465378421900161,
+      "grad_norm": 0.5158678889274597,
+      "learning_rate": 0.0001317769130998703,
+      "loss": 0.6862,
+      "step": 269
+    },
+    {
+      "epoch": 0.34782608695652173,
+      "grad_norm": 0.489501416683197,
+      "learning_rate": 0.00013151750972762648,
+      "loss": 0.6378,
+      "step": 270
+    },
+    {
+      "epoch": 0.34911433172302736,
+      "grad_norm": 0.42305371165275574,
+      "learning_rate": 0.00013125810635538263,
+      "loss": 0.593,
+      "step": 271
+    },
+    {
+      "epoch": 0.350402576489533,
+      "grad_norm": 0.5226255059242249,
+      "learning_rate": 0.00013099870298313878,
+      "loss": 0.7828,
+      "step": 272
+    },
+    {
+      "epoch": 0.3516908212560386,
+      "grad_norm": 0.4217074513435364,
+      "learning_rate": 0.00013073929961089496,
+      "loss": 0.6397,
+      "step": 273
+    },
+    {
+      "epoch": 0.3529790660225443,
+      "grad_norm": 0.46896272897720337,
+      "learning_rate": 0.0001304798962386511,
+      "loss": 0.614,
+      "step": 274
+    },
+    {
+      "epoch": 0.35426731078904994,
+      "grad_norm": 0.47062304615974426,
+      "learning_rate": 0.00013022049286640728,
+      "loss": 0.6892,
+      "step": 275
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.4669751822948456,
+      "learning_rate": 0.00012996108949416343,
+      "loss": 0.8675,
+      "step": 276
+    },
+    {
+      "epoch": 0.3568438003220612,
+      "grad_norm": 0.4246136546134949,
+      "learning_rate": 0.00012970168612191958,
+      "loss": 0.6467,
+      "step": 277
+    },
+    {
+      "epoch": 0.35813204508856683,
+      "grad_norm": 0.42293113470077515,
+      "learning_rate": 0.00012944228274967576,
+      "loss": 0.6006,
+      "step": 278
+    },
+    {
+      "epoch": 0.35942028985507246,
+      "grad_norm": 0.44599637389183044,
+      "learning_rate": 0.0001291828793774319,
+      "loss": 0.6241,
+      "step": 279
+    },
+    {
+      "epoch": 0.3607085346215781,
+      "grad_norm": 0.4490668773651123,
+      "learning_rate": 0.0001289234760051881,
+      "loss": 0.5644,
+      "step": 280
+    },
+    {
+      "epoch": 0.3619967793880837,
+      "grad_norm": 0.5100782513618469,
+      "learning_rate": 0.00012866407263294424,
+      "loss": 0.7378,
+      "step": 281
+    },
+    {
+      "epoch": 0.36328502415458935,
+      "grad_norm": 0.4394833445549011,
+      "learning_rate": 0.0001284046692607004,
+      "loss": 0.6662,
+      "step": 282
+    },
+    {
+      "epoch": 0.364573268921095,
+      "grad_norm": 0.49244457483291626,
+      "learning_rate": 0.00012814526588845657,
+      "loss": 0.7512,
+      "step": 283
+    },
+    {
+      "epoch": 0.36586151368760067,
+      "grad_norm": 0.4558521807193756,
+      "learning_rate": 0.0001278858625162127,
+      "loss": 0.7213,
+      "step": 284
+    },
+    {
+      "epoch": 0.3671497584541063,
+      "grad_norm": 0.6079721450805664,
+      "learning_rate": 0.00012762645914396887,
+      "loss": 0.7615,
+      "step": 285
+    },
+    {
+      "epoch": 0.36843800322061193,
+      "grad_norm": 0.5249935984611511,
+      "learning_rate": 0.00012736705577172502,
+      "loss": 0.8172,
+      "step": 286
+    },
+    {
+      "epoch": 0.36972624798711756,
+      "grad_norm": 0.5798977613449097,
+      "learning_rate": 0.0001271076523994812,
+      "loss": 0.8244,
+      "step": 287
+    },
+    {
+      "epoch": 0.3710144927536232,
+      "grad_norm": 0.496056467294693,
+      "learning_rate": 0.00012684824902723735,
+      "loss": 0.8799,
+      "step": 288
+    },
+    {
+      "epoch": 0.3723027375201288,
+      "grad_norm": 0.47068995237350464,
+      "learning_rate": 0.00012658884565499353,
+      "loss": 0.8069,
+      "step": 289
+    },
+    {
+      "epoch": 0.37359098228663445,
+      "grad_norm": 0.5302271842956543,
+      "learning_rate": 0.00012632944228274968,
+      "loss": 0.7593,
+      "step": 290
+    },
+    {
+      "epoch": 0.3748792270531401,
+      "grad_norm": 0.5044103860855103,
+      "learning_rate": 0.00012607003891050583,
+      "loss": 0.7462,
+      "step": 291
+    },
+    {
+      "epoch": 0.3761674718196457,
+      "grad_norm": 0.4707060158252716,
+      "learning_rate": 0.000125810635538262,
+      "loss": 0.6593,
+      "step": 292
+    },
+    {
+      "epoch": 0.37745571658615135,
+      "grad_norm": 0.5337527394294739,
+      "learning_rate": 0.00012555123216601816,
+      "loss": 0.6138,
+      "step": 293
+    },
+    {
+      "epoch": 0.37874396135265703,
+      "grad_norm": 0.5467652082443237,
+      "learning_rate": 0.00012529182879377434,
+      "loss": 0.8375,
+      "step": 294
+    },
+    {
+      "epoch": 0.38003220611916266,
+      "grad_norm": 0.48266416788101196,
+      "learning_rate": 0.0001250324254215305,
+      "loss": 0.6897,
+      "step": 295
+    },
+    {
+      "epoch": 0.3813204508856683,
+      "grad_norm": 0.49726054072380066,
+      "learning_rate": 0.00012477302204928664,
+      "loss": 0.8202,
+      "step": 296
+    },
+    {
+      "epoch": 0.3826086956521739,
+      "grad_norm": 0.5109860301017761,
+      "learning_rate": 0.00012451361867704282,
+      "loss": 0.7937,
+      "step": 297
+    },
+    {
+      "epoch": 0.38389694041867956,
+      "grad_norm": 0.44613054394721985,
+      "learning_rate": 0.00012425421530479897,
+      "loss": 0.7205,
+      "step": 298
+    },
+    {
+      "epoch": 0.3851851851851852,
+      "grad_norm": 0.5678048729896545,
+      "learning_rate": 0.00012399481193255514,
+      "loss": 0.8164,
+      "step": 299
+    },
+    {
+      "epoch": 0.3864734299516908,
+      "grad_norm": 0.4355293810367584,
+      "learning_rate": 0.0001237354085603113,
+      "loss": 0.5967,
+      "step": 300
+    },
+    {
+      "epoch": 0.38776167471819645,
+      "grad_norm": 0.5225346088409424,
+      "learning_rate": 0.00012347600518806745,
+      "loss": 0.7688,
+      "step": 301
+    },
+    {
+      "epoch": 0.3890499194847021,
+      "grad_norm": 0.47630950808525085,
+      "learning_rate": 0.00012321660181582362,
+      "loss": 0.6535,
+      "step": 302
+    },
+    {
+      "epoch": 0.3903381642512077,
+      "grad_norm": 0.48992452025413513,
+      "learning_rate": 0.00012295719844357977,
+      "loss": 0.652,
+      "step": 303
+    },
+    {
+      "epoch": 0.39162640901771334,
+      "grad_norm": 0.4927466809749603,
+      "learning_rate": 0.00012269779507133595,
+      "loss": 0.6116,
+      "step": 304
+    },
+    {
+      "epoch": 0.392914653784219,
+      "grad_norm": 0.4766499400138855,
+      "learning_rate": 0.0001224383916990921,
+      "loss": 0.6457,
+      "step": 305
+    },
+    {
+      "epoch": 0.39420289855072466,
+      "grad_norm": 0.49338245391845703,
+      "learning_rate": 0.00012217898832684825,
+      "loss": 0.6211,
+      "step": 306
+    },
+    {
+      "epoch": 0.3954911433172303,
+      "grad_norm": 0.5238732099533081,
+      "learning_rate": 0.00012191958495460443,
+      "loss": 0.7313,
+      "step": 307
+    },
+    {
+      "epoch": 0.3967793880837359,
+      "grad_norm": 0.494093656539917,
+      "learning_rate": 0.00012166018158236057,
+      "loss": 0.7583,
+      "step": 308
+    },
+    {
+      "epoch": 0.39806763285024155,
+      "grad_norm": 0.46139660477638245,
+      "learning_rate": 0.00012140077821011673,
+      "loss": 0.6841,
+      "step": 309
+    },
+    {
+      "epoch": 0.3993558776167472,
+      "grad_norm": 0.4901793897151947,
+      "learning_rate": 0.00012114137483787288,
+      "loss": 0.6862,
+      "step": 310
+    },
+    {
+      "epoch": 0.4006441223832528,
+      "grad_norm": 0.4695977568626404,
+      "learning_rate": 0.00012088197146562905,
+      "loss": 0.6428,
+      "step": 311
+    },
+    {
+      "epoch": 0.40193236714975844,
+      "grad_norm": 0.4964921772480011,
+      "learning_rate": 0.00012062256809338521,
+      "loss": 0.6061,
+      "step": 312
+    },
+    {
+      "epoch": 0.40322061191626407,
+      "grad_norm": 0.5101466178894043,
+      "learning_rate": 0.00012036316472114138,
+      "loss": 0.8195,
+      "step": 313
+    },
+    {
+      "epoch": 0.4045088566827697,
+      "grad_norm": 0.470225989818573,
+      "learning_rate": 0.00012010376134889754,
+      "loss": 0.681,
+      "step": 314
+    },
+    {
+      "epoch": 0.4057971014492754,
+      "grad_norm": 0.4532884955406189,
+      "learning_rate": 0.0001198443579766537,
+      "loss": 0.7239,
+      "step": 315
+    },
+    {
+      "epoch": 0.407085346215781,
+      "grad_norm": 0.4604836106300354,
+      "learning_rate": 0.00011958495460440985,
+      "loss": 0.7433,
+      "step": 316
+    },
+    {
+      "epoch": 0.40837359098228665,
+      "grad_norm": 0.4511779546737671,
+      "learning_rate": 0.00011932555123216602,
+      "loss": 0.7404,
+      "step": 317
+    },
+    {
+      "epoch": 0.4096618357487923,
+      "grad_norm": 0.5277577042579651,
+      "learning_rate": 0.00011906614785992218,
+      "loss": 0.8757,
+      "step": 318
+    },
+    {
+      "epoch": 0.4109500805152979,
+      "grad_norm": 0.444564551115036,
+      "learning_rate": 0.00011880674448767835,
+      "loss": 0.6465,
+      "step": 319
+    },
+    {
+      "epoch": 0.41223832528180354,
+      "grad_norm": 0.4861951470375061,
+      "learning_rate": 0.00011854734111543451,
+      "loss": 0.8336,
+      "step": 320
+    },
+    {
+      "epoch": 0.41352657004830917,
+      "grad_norm": 0.4412696957588196,
+      "learning_rate": 0.00011828793774319066,
+      "loss": 0.7586,
+      "step": 321
+    },
+    {
+      "epoch": 0.4148148148148148,
+      "grad_norm": 0.5230206251144409,
+      "learning_rate": 0.00011802853437094683,
+      "loss": 0.7423,
+      "step": 322
+    },
+    {
+      "epoch": 0.41610305958132043,
+      "grad_norm": 0.4539431631565094,
+      "learning_rate": 0.00011776913099870299,
+      "loss": 0.6849,
+      "step": 323
+    },
+    {
+      "epoch": 0.41739130434782606,
+      "grad_norm": 0.5001434683799744,
+      "learning_rate": 0.00011750972762645916,
+      "loss": 0.6527,
+      "step": 324
+    },
+    {
+      "epoch": 0.41867954911433175,
+      "grad_norm": 0.5230083465576172,
+      "learning_rate": 0.00011725032425421532,
+      "loss": 0.6829,
+      "step": 325
+    },
+    {
+      "epoch": 0.4199677938808374,
+      "grad_norm": 0.5428875684738159,
+      "learning_rate": 0.00011699092088197148,
+      "loss": 0.6075,
+      "step": 326
+    },
+    {
+      "epoch": 0.421256038647343,
+      "grad_norm": 0.49785757064819336,
+      "learning_rate": 0.00011673151750972763,
+      "loss": 0.6696,
+      "step": 327
+    },
+    {
+      "epoch": 0.42254428341384864,
+      "grad_norm": 0.5448641180992126,
+      "learning_rate": 0.0001164721141374838,
+      "loss": 0.7752,
+      "step": 328
+    },
+    {
+      "epoch": 0.4238325281803543,
+      "grad_norm": 0.6280490159988403,
+      "learning_rate": 0.00011621271076523996,
+      "loss": 0.8681,
+      "step": 329
+    },
+    {
+      "epoch": 0.4251207729468599,
+      "grad_norm": 0.5525287389755249,
+      "learning_rate": 0.00011595330739299613,
+      "loss": 0.8434,
+      "step": 330
+    },
+    {
+      "epoch": 0.42640901771336553,
+      "grad_norm": 0.4954991042613983,
+      "learning_rate": 0.00011569390402075229,
+      "loss": 0.7923,
+      "step": 331
+    },
+    {
+      "epoch": 0.42769726247987117,
+      "grad_norm": 0.46500164270401,
+      "learning_rate": 0.00011543450064850843,
+      "loss": 0.7084,
+      "step": 332
+    },
+    {
+      "epoch": 0.4289855072463768,
+      "grad_norm": 0.5183458924293518,
+      "learning_rate": 0.00011517509727626459,
+      "loss": 0.754,
+      "step": 333
+    },
+    {
+      "epoch": 0.4302737520128824,
+      "grad_norm": 0.521300733089447,
+      "learning_rate": 0.00011491569390402074,
+      "loss": 0.7481,
+      "step": 334
+    },
+    {
+      "epoch": 0.43156199677938806,
+      "grad_norm": 0.46088019013404846,
+      "learning_rate": 0.00011465629053177691,
+      "loss": 0.5601,
+      "step": 335
+    },
+    {
+      "epoch": 0.43285024154589374,
+      "grad_norm": 0.5142108798027039,
+      "learning_rate": 0.00011439688715953307,
+      "loss": 0.8001,
+      "step": 336
+    },
+    {
+      "epoch": 0.4341384863123994,
+      "grad_norm": 0.41947636008262634,
+      "learning_rate": 0.00011413748378728924,
+      "loss": 0.6669,
+      "step": 337
+    },
+    {
+      "epoch": 0.435426731078905,
+      "grad_norm": 0.4584703743457794,
+      "learning_rate": 0.0001138780804150454,
+      "loss": 0.702,
+      "step": 338
+    },
+    {
+      "epoch": 0.43671497584541064,
+      "grad_norm": 0.4480314254760742,
+      "learning_rate": 0.00011361867704280155,
+      "loss": 0.6379,
+      "step": 339
+    },
+    {
+      "epoch": 0.43800322061191627,
+      "grad_norm": 0.49402984976768494,
+      "learning_rate": 0.00011335927367055772,
+      "loss": 0.7751,
+      "step": 340
+    },
+    {
+      "epoch": 0.4392914653784219,
+      "grad_norm": 0.5001116991043091,
+      "learning_rate": 0.00011309987029831388,
+      "loss": 0.7157,
+      "step": 341
+    },
+    {
+      "epoch": 0.4405797101449275,
+      "grad_norm": 0.4650849401950836,
+      "learning_rate": 0.00011284046692607004,
+      "loss": 0.5801,
+      "step": 342
+    },
+    {
+      "epoch": 0.44186795491143316,
+      "grad_norm": 0.5000032186508179,
+      "learning_rate": 0.00011258106355382621,
+      "loss": 0.8127,
+      "step": 343
+    },
+    {
+      "epoch": 0.4431561996779388,
+      "grad_norm": 0.5941475033760071,
+      "learning_rate": 0.00011232166018158237,
+      "loss": 0.8227,
+      "step": 344
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.49535176157951355,
+      "learning_rate": 0.00011206225680933852,
+      "loss": 0.6376,
+      "step": 345
+    },
+    {
+      "epoch": 0.4457326892109501,
+      "grad_norm": 0.46945926547050476,
+      "learning_rate": 0.00011180285343709469,
+      "loss": 0.6801,
+      "step": 346
+    },
+    {
+      "epoch": 0.44702093397745574,
+      "grad_norm": 0.47991520166397095,
+      "learning_rate": 0.00011154345006485085,
+      "loss": 0.6071,
+      "step": 347
+    },
+    {
+      "epoch": 0.44830917874396137,
+      "grad_norm": 0.45372679829597473,
+      "learning_rate": 0.00011128404669260702,
+      "loss": 0.7388,
+      "step": 348
+    },
+    {
+      "epoch": 0.449597423510467,
+      "grad_norm": 0.5295307636260986,
+      "learning_rate": 0.00011102464332036318,
+      "loss": 0.776,
+      "step": 349
+    },
+    {
+      "epoch": 0.45088566827697263,
+      "grad_norm": 0.516298770904541,
+      "learning_rate": 0.00011076523994811933,
+      "loss": 0.7546,
+      "step": 350
+    },
+    {
+      "epoch": 0.45217391304347826,
+      "grad_norm": 0.4629455804824829,
+      "learning_rate": 0.0001105058365758755,
+      "loss": 0.5773,
+      "step": 351
+    },
+    {
+      "epoch": 0.4534621578099839,
+      "grad_norm": 0.4974667727947235,
+      "learning_rate": 0.00011024643320363166,
+      "loss": 0.6464,
+      "step": 352
+    },
+    {
+      "epoch": 0.4547504025764895,
+      "grad_norm": 0.47429102659225464,
+      "learning_rate": 0.00010998702983138782,
+      "loss": 0.7503,
+      "step": 353
+    },
+    {
+      "epoch": 0.45603864734299515,
+      "grad_norm": 0.5169098377227783,
+      "learning_rate": 0.00010972762645914399,
+      "loss": 0.7697,
+      "step": 354
+    },
+    {
+      "epoch": 0.4573268921095008,
+      "grad_norm": 0.6083032488822937,
+      "learning_rate": 0.00010946822308690015,
+      "loss": 0.7145,
+      "step": 355
+    },
+    {
+      "epoch": 0.45861513687600647,
+      "grad_norm": 0.6092599034309387,
+      "learning_rate": 0.00010920881971465629,
+      "loss": 0.9839,
+      "step": 356
+    },
+    {
+      "epoch": 0.4599033816425121,
+      "grad_norm": 0.47699296474456787,
+      "learning_rate": 0.00010894941634241245,
+      "loss": 0.7013,
+      "step": 357
+    },
+    {
+      "epoch": 0.46119162640901773,
+      "grad_norm": 0.44026511907577515,
+      "learning_rate": 0.0001086900129701686,
+      "loss": 0.7314,
+      "step": 358
+    },
+    {
+      "epoch": 0.46247987117552336,
+      "grad_norm": 0.5326471328735352,
+      "learning_rate": 0.00010843060959792477,
+      "loss": 0.8708,
+      "step": 359
+    },
+    {
+      "epoch": 0.463768115942029,
+      "grad_norm": 0.5188657641410828,
+      "learning_rate": 0.00010817120622568093,
+      "loss": 0.6573,
+      "step": 360
+    },
+    {
+      "epoch": 0.4650563607085346,
+      "grad_norm": 0.5846801400184631,
+      "learning_rate": 0.0001079118028534371,
+      "loss": 0.7319,
+      "step": 361
+    },
+    {
+      "epoch": 0.46634460547504025,
+      "grad_norm": 0.5272177457809448,
+      "learning_rate": 0.00010765239948119326,
+      "loss": 0.6251,
+      "step": 362
+    },
+    {
+      "epoch": 0.4676328502415459,
+      "grad_norm": 0.5060721635818481,
+      "learning_rate": 0.00010739299610894941,
+      "loss": 0.6675,
+      "step": 363
+    },
+    {
+      "epoch": 0.4689210950080515,
+      "grad_norm": 0.5200803279876709,
+      "learning_rate": 0.00010713359273670558,
+      "loss": 0.6373,
+      "step": 364
+    },
+    {
+      "epoch": 0.47020933977455714,
+      "grad_norm": 0.5527567863464355,
+      "learning_rate": 0.00010687418936446174,
+      "loss": 0.9144,
+      "step": 365
+    },
+    {
+      "epoch": 0.4714975845410628,
+      "grad_norm": 0.5247730016708374,
+      "learning_rate": 0.0001066147859922179,
+      "loss": 0.7283,
+      "step": 366
+    },
+    {
+      "epoch": 0.47278582930756846,
+      "grad_norm": 0.482681006193161,
+      "learning_rate": 0.00010635538261997407,
+      "loss": 0.8382,
+      "step": 367
+    },
+    {
+      "epoch": 0.4740740740740741,
+      "grad_norm": 0.5045844316482544,
+      "learning_rate": 0.00010609597924773022,
+      "loss": 0.8324,
+      "step": 368
+    },
+    {
+      "epoch": 0.4753623188405797,
+      "grad_norm": 0.500696063041687,
+      "learning_rate": 0.00010583657587548638,
+      "loss": 0.6749,
+      "step": 369
+    },
+    {
+      "epoch": 0.47665056360708535,
+      "grad_norm": 0.49296805262565613,
+      "learning_rate": 0.00010557717250324255,
+      "loss": 0.8396,
+      "step": 370
+    },
+    {
+      "epoch": 0.477938808373591,
+      "grad_norm": 0.5083613395690918,
+      "learning_rate": 0.00010531776913099871,
+      "loss": 0.596,
+      "step": 371
+    },
+    {
+      "epoch": 0.4792270531400966,
+      "grad_norm": 0.6000961065292358,
+      "learning_rate": 0.00010505836575875488,
+      "loss": 0.7097,
+      "step": 372
+    },
+    {
+      "epoch": 0.48051529790660225,
+      "grad_norm": 0.47504574060440063,
+      "learning_rate": 0.00010479896238651104,
+      "loss": 0.676,
+      "step": 373
+    },
+    {
+      "epoch": 0.4818035426731079,
+      "grad_norm": 0.4866791069507599,
+      "learning_rate": 0.00010453955901426719,
+      "loss": 0.6026,
+      "step": 374
+    },
+    {
+      "epoch": 0.4830917874396135,
+      "grad_norm": 0.5388527512550354,
+      "learning_rate": 0.00010428015564202336,
+      "loss": 0.78,
+      "step": 375
+    },
+    {
+      "epoch": 0.48438003220611914,
+      "grad_norm": 0.5430642366409302,
+      "learning_rate": 0.00010402075226977952,
+      "loss": 0.7898,
+      "step": 376
+    },
+    {
+      "epoch": 0.4856682769726248,
+      "grad_norm": 0.5378901362419128,
+      "learning_rate": 0.00010376134889753568,
+      "loss": 0.8215,
+      "step": 377
+    },
+    {
+      "epoch": 0.48695652173913045,
+      "grad_norm": 0.46278834342956543,
+      "learning_rate": 0.00010350194552529185,
+      "loss": 0.786,
+      "step": 378
+    },
+    {
+      "epoch": 0.4882447665056361,
+      "grad_norm": 0.5695458650588989,
+      "learning_rate": 0.000103242542153048,
+      "loss": 0.7929,
+      "step": 379
+    },
+    {
+      "epoch": 0.4895330112721417,
+      "grad_norm": 0.5052254796028137,
+      "learning_rate": 0.00010298313878080415,
+      "loss": 0.8047,
+      "step": 380
+    },
+    {
+      "epoch": 0.49082125603864735,
+      "grad_norm": 0.45410144329071045,
+      "learning_rate": 0.0001027237354085603,
+      "loss": 0.6309,
+      "step": 381
+    },
+    {
+      "epoch": 0.492109500805153,
+      "grad_norm": 0.5507941842079163,
+      "learning_rate": 0.00010246433203631646,
+      "loss": 0.7374,
+      "step": 382
+    },
+    {
+      "epoch": 0.4933977455716586,
+      "grad_norm": 0.4703005850315094,
+      "learning_rate": 0.00010220492866407263,
+      "loss": 0.5724,
+      "step": 383
+    },
+    {
+      "epoch": 0.49468599033816424,
+      "grad_norm": 0.5034976601600647,
+      "learning_rate": 0.0001019455252918288,
+      "loss": 0.6829,
+      "step": 384
+    },
+    {
+      "epoch": 0.49597423510466987,
+      "grad_norm": 0.5183707475662231,
+      "learning_rate": 0.00010168612191958496,
+      "loss": 0.6716,
+      "step": 385
+    },
+    {
+      "epoch": 0.4972624798711755,
+      "grad_norm": 0.5549296736717224,
+      "learning_rate": 0.00010142671854734112,
+      "loss": 0.7464,
+      "step": 386
+    },
+    {
+      "epoch": 0.4985507246376812,
+      "grad_norm": 0.48852047324180603,
+      "learning_rate": 0.00010116731517509727,
+      "loss": 0.649,
+      "step": 387
+    },
+    {
+      "epoch": 0.4998389694041868,
+      "grad_norm": 0.5118862986564636,
+      "learning_rate": 0.00010090791180285344,
+      "loss": 0.6043,
+      "step": 388
+    },
+    {
+      "epoch": 0.5011272141706924,
+      "grad_norm": 0.5366110801696777,
+      "learning_rate": 0.0001006485084306096,
+      "loss": 0.7139,
+      "step": 389
+    },
+    {
+      "epoch": 0.5024154589371981,
+      "grad_norm": 0.5275729894638062,
+      "learning_rate": 0.00010038910505836577,
+      "loss": 0.7035,
+      "step": 390
+    },
+    {
+      "epoch": 0.5037037037037037,
+      "grad_norm": 0.5201203227043152,
+      "learning_rate": 0.00010012970168612193,
+      "loss": 0.716,
+      "step": 391
+    },
+    {
+      "epoch": 0.5049919484702093,
+      "grad_norm": 0.5168887376785278,
+      "learning_rate": 9.987029831387808e-05,
+      "loss": 0.8432,
+      "step": 392
+    },
+    {
+      "epoch": 0.506280193236715,
+      "grad_norm": 0.5083385109901428,
+      "learning_rate": 9.961089494163424e-05,
+      "loss": 0.7078,
+      "step": 393
+    },
+    {
+      "epoch": 0.5075684380032206,
+      "grad_norm": 0.5033498406410217,
+      "learning_rate": 9.935149156939041e-05,
+      "loss": 0.6846,
+      "step": 394
+    },
+    {
+      "epoch": 0.5088566827697263,
+      "grad_norm": 0.5229712128639221,
+      "learning_rate": 9.909208819714657e-05,
+      "loss": 0.7517,
+      "step": 395
+    },
+    {
+      "epoch": 0.5101449275362319,
+      "grad_norm": 0.4493921399116516,
+      "learning_rate": 9.883268482490274e-05,
+      "loss": 0.5185,
+      "step": 396
+    },
+    {
+      "epoch": 0.5114331723027375,
+      "grad_norm": 0.4618862569332123,
+      "learning_rate": 9.857328145265889e-05,
+      "loss": 0.6914,
+      "step": 397
+    },
+    {
+      "epoch": 0.5127214170692431,
+      "grad_norm": 0.5105440020561218,
+      "learning_rate": 9.831387808041504e-05,
+      "loss": 0.7408,
+      "step": 398
+    },
+    {
+      "epoch": 0.5140096618357488,
+      "grad_norm": 0.4876827001571655,
+      "learning_rate": 9.80544747081712e-05,
+      "loss": 0.7642,
+      "step": 399
+    },
+    {
+      "epoch": 0.5152979066022544,
+      "grad_norm": 0.5248561501502991,
+      "learning_rate": 9.779507133592737e-05,
+      "loss": 0.6578,
+      "step": 400
+    },
+    {
+      "epoch": 0.5165861513687601,
+      "grad_norm": 0.4495491087436676,
+      "learning_rate": 9.753566796368353e-05,
+      "loss": 0.6296,
+      "step": 401
+    },
+    {
+      "epoch": 0.5178743961352656,
+      "grad_norm": 0.4628872573375702,
+      "learning_rate": 9.72762645914397e-05,
+      "loss": 0.5686,
+      "step": 402
+    },
+    {
+      "epoch": 0.5191626409017713,
+      "grad_norm": 0.5524469017982483,
+      "learning_rate": 9.701686121919586e-05,
+      "loss": 0.8243,
+      "step": 403
+    },
+    {
+      "epoch": 0.520450885668277,
+      "grad_norm": 0.5526472926139832,
+      "learning_rate": 9.675745784695201e-05,
+      "loss": 0.7644,
+      "step": 404
+    },
+    {
+      "epoch": 0.5217391304347826,
+      "grad_norm": 0.5220494270324707,
+      "learning_rate": 9.649805447470817e-05,
+      "loss": 0.7113,
+      "step": 405
+    },
+    {
+      "epoch": 0.5230273752012883,
+      "grad_norm": 0.4727495610713959,
+      "learning_rate": 9.623865110246434e-05,
+      "loss": 0.5613,
+      "step": 406
+    },
+    {
+      "epoch": 0.5243156199677939,
+      "grad_norm": 0.440445214509964,
+      "learning_rate": 9.59792477302205e-05,
+      "loss": 0.5719,
+      "step": 407
+    },
+    {
+      "epoch": 0.5256038647342995,
+      "grad_norm": 0.520539402961731,
+      "learning_rate": 9.571984435797667e-05,
+      "loss": 0.6716,
+      "step": 408
+    },
+    {
+      "epoch": 0.5268921095008051,
+      "grad_norm": 0.5473395586013794,
+      "learning_rate": 9.546044098573282e-05,
+      "loss": 0.6881,
+      "step": 409
+    },
+    {
+      "epoch": 0.5281803542673108,
+      "grad_norm": 0.5728646516799927,
+      "learning_rate": 9.520103761348897e-05,
+      "loss": 0.694,
+      "step": 410
+    },
+    {
+      "epoch": 0.5294685990338164,
+      "grad_norm": 0.5672905445098877,
+      "learning_rate": 9.494163424124513e-05,
+      "loss": 0.7893,
+      "step": 411
+    },
+    {
+      "epoch": 0.5307568438003221,
+      "grad_norm": 0.5057477355003357,
+      "learning_rate": 9.46822308690013e-05,
+      "loss": 0.7957,
+      "step": 412
+    },
+    {
+      "epoch": 0.5320450885668278,
+      "grad_norm": 0.5638203620910645,
+      "learning_rate": 9.442282749675746e-05,
+      "loss": 0.8208,
+      "step": 413
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.4758095145225525,
+      "learning_rate": 9.416342412451363e-05,
+      "loss": 0.6516,
+      "step": 414
+    },
+    {
+      "epoch": 0.534621578099839,
+      "grad_norm": 0.5819146037101746,
+      "learning_rate": 9.390402075226979e-05,
+      "loss": 0.7839,
+      "step": 415
+    },
+    {
+      "epoch": 0.5359098228663446,
+      "grad_norm": 0.5698294639587402,
+      "learning_rate": 9.364461738002594e-05,
+      "loss": 0.6847,
+      "step": 416
+    },
+    {
+      "epoch": 0.5371980676328503,
+      "grad_norm": 0.5539317727088928,
+      "learning_rate": 9.33852140077821e-05,
+      "loss": 0.6868,
+      "step": 417
+    },
+    {
+      "epoch": 0.5384863123993558,
+      "grad_norm": 0.5531253218650818,
+      "learning_rate": 9.312581063553827e-05,
+      "loss": 0.6416,
+      "step": 418
+    },
+    {
+      "epoch": 0.5397745571658615,
+      "grad_norm": 0.5280610918998718,
+      "learning_rate": 9.286640726329443e-05,
+      "loss": 0.6711,
+      "step": 419
+    },
+    {
+      "epoch": 0.5410628019323671,
+      "grad_norm": 0.5485169291496277,
+      "learning_rate": 9.26070038910506e-05,
+      "loss": 0.67,
+      "step": 420
+    },
+    {
+      "epoch": 0.5423510466988728,
+      "grad_norm": 0.632940948009491,
+      "learning_rate": 9.234760051880675e-05,
+      "loss": 1.0122,
+      "step": 421
+    },
+    {
+      "epoch": 0.5436392914653784,
+      "grad_norm": 0.5226237177848816,
+      "learning_rate": 9.20881971465629e-05,
+      "loss": 0.6517,
+      "step": 422
+    },
+    {
+      "epoch": 0.5449275362318841,
+      "grad_norm": 0.5100864768028259,
+      "learning_rate": 9.182879377431906e-05,
+      "loss": 0.5886,
+      "step": 423
+    },
+    {
+      "epoch": 0.5462157809983897,
+      "grad_norm": 0.5091288685798645,
+      "learning_rate": 9.156939040207523e-05,
+      "loss": 0.759,
+      "step": 424
+    },
+    {
+      "epoch": 0.5475040257648953,
+      "grad_norm": 0.5094250440597534,
+      "learning_rate": 9.130998702983139e-05,
+      "loss": 0.6407,
+      "step": 425
+    },
+    {
+      "epoch": 0.548792270531401,
+      "grad_norm": 0.4518897533416748,
+      "learning_rate": 9.105058365758756e-05,
+      "loss": 0.5463,
+      "step": 426
+    },
+    {
+      "epoch": 0.5500805152979066,
+      "grad_norm": 0.5876538753509521,
+      "learning_rate": 9.07911802853437e-05,
+      "loss": 0.8909,
+      "step": 427
+    },
+    {
+      "epoch": 0.5513687600644123,
+      "grad_norm": 0.5553408265113831,
+      "learning_rate": 9.053177691309987e-05,
+      "loss": 0.9111,
+      "step": 428
+    },
+    {
+      "epoch": 0.5526570048309178,
+      "grad_norm": 0.6221159100532532,
+      "learning_rate": 9.027237354085604e-05,
+      "loss": 0.8558,
+      "step": 429
+    },
+    {
+      "epoch": 0.5539452495974235,
+      "grad_norm": 0.5404058694839478,
+      "learning_rate": 9.00129701686122e-05,
+      "loss": 0.7609,
+      "step": 430
+    },
+    {
+      "epoch": 0.5552334943639291,
+      "grad_norm": 0.43805137276649475,
+      "learning_rate": 8.975356679636836e-05,
+      "loss": 0.5286,
+      "step": 431
+    },
+    {
+      "epoch": 0.5565217391304348,
+      "grad_norm": 0.493563175201416,
+      "learning_rate": 8.949416342412453e-05,
+      "loss": 0.5727,
+      "step": 432
+    },
+    {
+      "epoch": 0.5578099838969404,
+      "grad_norm": 0.5368430614471436,
+      "learning_rate": 8.923476005188068e-05,
+      "loss": 0.7623,
+      "step": 433
+    },
+    {
+      "epoch": 0.559098228663446,
+      "grad_norm": 0.4323422312736511,
+      "learning_rate": 8.897535667963683e-05,
+      "loss": 0.5958,
+      "step": 434
+    },
+    {
+      "epoch": 0.5603864734299517,
+      "grad_norm": 0.49179717898368835,
+      "learning_rate": 8.8715953307393e-05,
+      "loss": 0.681,
+      "step": 435
+    },
+    {
+      "epoch": 0.5616747181964573,
+      "grad_norm": 0.40715619921684265,
+      "learning_rate": 8.845654993514916e-05,
+      "loss": 0.6298,
+      "step": 436
+    },
+    {
+      "epoch": 0.562962962962963,
+      "grad_norm": 0.6095149517059326,
+      "learning_rate": 8.819714656290532e-05,
+      "loss": 1.0039,
+      "step": 437
+    },
+    {
+      "epoch": 0.5642512077294686,
+      "grad_norm": 0.5469616055488586,
+      "learning_rate": 8.793774319066149e-05,
+      "loss": 0.7941,
+      "step": 438
+    },
+    {
+      "epoch": 0.5655394524959743,
+      "grad_norm": 0.5149989128112793,
+      "learning_rate": 8.767833981841764e-05,
+      "loss": 0.7819,
+      "step": 439
+    },
+    {
+      "epoch": 0.5668276972624798,
+      "grad_norm": 0.479438453912735,
+      "learning_rate": 8.74189364461738e-05,
+      "loss": 0.4806,
+      "step": 440
+    },
+    {
+      "epoch": 0.5681159420289855,
+      "grad_norm": 0.562567412853241,
+      "learning_rate": 8.715953307392997e-05,
+      "loss": 0.7942,
+      "step": 441
+    },
+    {
+      "epoch": 0.5694041867954911,
+      "grad_norm": 0.5192587375640869,
+      "learning_rate": 8.690012970168613e-05,
+      "loss": 0.6479,
+      "step": 442
+    },
+    {
+      "epoch": 0.5706924315619968,
+      "grad_norm": 0.4897756576538086,
+      "learning_rate": 8.66407263294423e-05,
+      "loss": 0.6539,
+      "step": 443
+    },
+    {
+      "epoch": 0.5719806763285025,
+      "grad_norm": 0.45649632811546326,
+      "learning_rate": 8.638132295719846e-05,
+      "loss": 0.657,
+      "step": 444
+    },
+    {
+      "epoch": 0.573268921095008,
+      "grad_norm": 0.5581417679786682,
+      "learning_rate": 8.612191958495461e-05,
+      "loss": 0.7415,
+      "step": 445
+    },
+    {
+      "epoch": 0.5745571658615137,
+      "grad_norm": 0.4822051525115967,
+      "learning_rate": 8.586251621271076e-05,
+      "loss": 0.7249,
+      "step": 446
+    },
+    {
+      "epoch": 0.5758454106280193,
+      "grad_norm": 0.6398015022277832,
+      "learning_rate": 8.560311284046692e-05,
+      "loss": 0.7328,
+      "step": 447
+    },
+    {
+      "epoch": 0.577133655394525,
+      "grad_norm": 0.5618659257888794,
+      "learning_rate": 8.534370946822309e-05,
+      "loss": 0.8104,
+      "step": 448
+    },
+    {
+      "epoch": 0.5784219001610306,
+      "grad_norm": 0.49202972650527954,
+      "learning_rate": 8.508430609597925e-05,
+      "loss": 0.6797,
+      "step": 449
+    },
+    {
+      "epoch": 0.5797101449275363,
+      "grad_norm": 0.5291930437088013,
+      "learning_rate": 8.482490272373542e-05,
+      "loss": 0.6015,
+      "step": 450
+    },
+    {
+      "epoch": 0.5809983896940418,
+      "grad_norm": 0.5322192907333374,
+      "learning_rate": 8.456549935149157e-05,
+      "loss": 0.7246,
+      "step": 451
+    },
+    {
+      "epoch": 0.5822866344605475,
+      "grad_norm": 0.5172200798988342,
+      "learning_rate": 8.430609597924773e-05,
+      "loss": 0.6873,
+      "step": 452
+    },
+    {
+      "epoch": 0.5835748792270531,
+      "grad_norm": 0.5367067456245422,
+      "learning_rate": 8.40466926070039e-05,
+      "loss": 0.7349,
+      "step": 453
+    },
+    {
+      "epoch": 0.5848631239935588,
+      "grad_norm": 0.5243058204650879,
+      "learning_rate": 8.378728923476006e-05,
+      "loss": 0.6441,
+      "step": 454
+    },
+    {
+      "epoch": 0.5861513687600645,
+      "grad_norm": 0.5509822964668274,
+      "learning_rate": 8.352788586251622e-05,
+      "loss": 0.7456,
+      "step": 455
+    },
+    {
+      "epoch": 0.58743961352657,
+      "grad_norm": 0.5376744866371155,
+      "learning_rate": 8.326848249027238e-05,
+      "loss": 0.6808,
+      "step": 456
+    },
+    {
+      "epoch": 0.5887278582930757,
+      "grad_norm": 0.5412257313728333,
+      "learning_rate": 8.300907911802854e-05,
+      "loss": 0.6135,
+      "step": 457
+    },
+    {
+      "epoch": 0.5900161030595813,
+      "grad_norm": 0.5956122279167175,
+      "learning_rate": 8.274967574578469e-05,
+      "loss": 0.7419,
+      "step": 458
+    },
+    {
+      "epoch": 0.591304347826087,
+      "grad_norm": 0.5524086952209473,
+      "learning_rate": 8.249027237354085e-05,
+      "loss": 0.5655,
+      "step": 459
+    },
+    {
+      "epoch": 0.5925925925925926,
+      "grad_norm": 0.5783061981201172,
+      "learning_rate": 8.223086900129702e-05,
+      "loss": 0.6528,
+      "step": 460
+    },
+    {
+      "epoch": 0.5938808373590982,
+      "grad_norm": 0.5542893409729004,
+      "learning_rate": 8.197146562905318e-05,
+      "loss": 0.6988,
+      "step": 461
+    },
+    {
+      "epoch": 0.5951690821256038,
+      "grad_norm": 0.5710337162017822,
+      "learning_rate": 8.171206225680935e-05,
+      "loss": 0.7465,
+      "step": 462
+    },
+    {
+      "epoch": 0.5964573268921095,
+      "grad_norm": 0.5694112181663513,
+      "learning_rate": 8.14526588845655e-05,
+      "loss": 0.7005,
+      "step": 463
+    },
+    {
+      "epoch": 0.5977455716586151,
+      "grad_norm": 0.5017877221107483,
+      "learning_rate": 8.119325551232166e-05,
+      "loss": 0.5978,
+      "step": 464
+    },
+    {
+      "epoch": 0.5990338164251208,
+      "grad_norm": 0.5797461271286011,
+      "learning_rate": 8.093385214007783e-05,
+      "loss": 0.8,
+      "step": 465
+    },
+    {
+      "epoch": 0.6003220611916265,
+      "grad_norm": 0.597811222076416,
+      "learning_rate": 8.067444876783399e-05,
+      "loss": 0.8356,
+      "step": 466
+    },
+    {
+      "epoch": 0.601610305958132,
+      "grad_norm": 0.5971367955207825,
+      "learning_rate": 8.041504539559015e-05,
+      "loss": 1.0245,
+      "step": 467
+    },
+    {
+      "epoch": 0.6028985507246377,
+      "grad_norm": 0.5506448745727539,
+      "learning_rate": 8.01556420233463e-05,
+      "loss": 0.6065,
+      "step": 468
+    },
+    {
+      "epoch": 0.6041867954911433,
+      "grad_norm": 0.5866613984107971,
+      "learning_rate": 7.989623865110247e-05,
+      "loss": 0.7841,
+      "step": 469
+    },
+    {
+      "epoch": 0.605475040257649,
+      "grad_norm": 0.5632089376449585,
+      "learning_rate": 7.963683527885862e-05,
+      "loss": 0.8046,
+      "step": 470
+    },
+    {
+      "epoch": 0.6067632850241546,
+      "grad_norm": 0.5145373940467834,
+      "learning_rate": 7.937743190661478e-05,
+      "loss": 0.5978,
+      "step": 471
+    },
+    {
+      "epoch": 0.6080515297906602,
+      "grad_norm": 0.48332056403160095,
+      "learning_rate": 7.911802853437095e-05,
+      "loss": 0.6119,
+      "step": 472
+    },
+    {
+      "epoch": 0.6093397745571658,
+      "grad_norm": 0.522520899772644,
+      "learning_rate": 7.885862516212711e-05,
+      "loss": 0.6623,
+      "step": 473
+    },
+    {
+      "epoch": 0.6106280193236715,
+      "grad_norm": 0.5305100679397583,
+      "learning_rate": 7.859922178988328e-05,
+      "loss": 0.7882,
+      "step": 474
+    },
+    {
+      "epoch": 0.6119162640901772,
+      "grad_norm": 0.4909839630126953,
+      "learning_rate": 7.833981841763943e-05,
+      "loss": 0.625,
+      "step": 475
+    },
+    {
+      "epoch": 0.6132045088566828,
+      "grad_norm": 0.5770312547683716,
+      "learning_rate": 7.808041504539559e-05,
+      "loss": 0.7479,
+      "step": 476
+    },
+    {
+      "epoch": 0.6144927536231884,
+      "grad_norm": 0.556817889213562,
+      "learning_rate": 7.782101167315176e-05,
+      "loss": 0.8317,
+      "step": 477
+    },
+    {
+      "epoch": 0.615780998389694,
+      "grad_norm": 0.5197098255157471,
+      "learning_rate": 7.756160830090792e-05,
+      "loss": 0.761,
+      "step": 478
+    },
+    {
+      "epoch": 0.6170692431561997,
+      "grad_norm": 0.5032650828361511,
+      "learning_rate": 7.730220492866408e-05,
+      "loss": 0.7149,
+      "step": 479
+    },
+    {
+      "epoch": 0.6183574879227053,
+      "grad_norm": 0.5901761651039124,
+      "learning_rate": 7.704280155642024e-05,
+      "loss": 0.723,
+      "step": 480
+    },
+    {
+      "epoch": 0.619645732689211,
+      "grad_norm": 0.5224949717521667,
+      "learning_rate": 7.67833981841764e-05,
+      "loss": 0.7275,
+      "step": 481
+    },
+    {
+      "epoch": 0.6209339774557165,
+      "grad_norm": 0.47279688715934753,
+      "learning_rate": 7.652399481193255e-05,
+      "loss": 0.5791,
+      "step": 482
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.49582868814468384,
+      "learning_rate": 7.626459143968871e-05,
+      "loss": 0.6728,
+      "step": 483
+    },
+    {
+      "epoch": 0.6235104669887278,
+      "grad_norm": 0.4722840189933777,
+      "learning_rate": 7.600518806744488e-05,
+      "loss": 0.5838,
+      "step": 484
+    },
+    {
+      "epoch": 0.6247987117552335,
+      "grad_norm": 0.5800105333328247,
+      "learning_rate": 7.574578469520104e-05,
+      "loss": 0.595,
+      "step": 485
+    },
+    {
+      "epoch": 0.6260869565217392,
+      "grad_norm": 0.5518195033073425,
+      "learning_rate": 7.54863813229572e-05,
+      "loss": 0.7302,
+      "step": 486
+    },
+    {
+      "epoch": 0.6273752012882448,
+      "grad_norm": 0.44109973311424255,
+      "learning_rate": 7.522697795071336e-05,
+      "loss": 0.5433,
+      "step": 487
+    },
+    {
+      "epoch": 0.6286634460547504,
+      "grad_norm": 0.5839915871620178,
+      "learning_rate": 7.496757457846952e-05,
+      "loss": 0.7484,
+      "step": 488
+    },
+    {
+      "epoch": 0.629951690821256,
+      "grad_norm": 0.6299886107444763,
+      "learning_rate": 7.470817120622569e-05,
+      "loss": 0.7771,
+      "step": 489
+    },
+    {
+      "epoch": 0.6312399355877617,
+      "grad_norm": 0.48367929458618164,
+      "learning_rate": 7.444876783398185e-05,
+      "loss": 0.5674,
+      "step": 490
+    },
+    {
+      "epoch": 0.6325281803542673,
+      "grad_norm": 0.5867652893066406,
+      "learning_rate": 7.418936446173802e-05,
+      "loss": 0.7733,
+      "step": 491
+    },
+    {
+      "epoch": 0.633816425120773,
+      "grad_norm": 0.4677927494049072,
+      "learning_rate": 7.392996108949417e-05,
+      "loss": 0.6418,
+      "step": 492
+    },
+    {
+      "epoch": 0.6351046698872785,
+      "grad_norm": 0.5139054656028748,
+      "learning_rate": 7.367055771725033e-05,
+      "loss": 0.7922,
+      "step": 493
+    },
+    {
+      "epoch": 0.6363929146537842,
+      "grad_norm": 0.4561646282672882,
+      "learning_rate": 7.341115434500648e-05,
+      "loss": 0.5885,
+      "step": 494
+    },
+    {
+      "epoch": 0.6376811594202898,
+      "grad_norm": 0.5079929828643799,
+      "learning_rate": 7.315175097276265e-05,
+      "loss": 0.6827,
+      "step": 495
+    },
+    {
+      "epoch": 0.6389694041867955,
+      "grad_norm": 0.5590360164642334,
+      "learning_rate": 7.289234760051881e-05,
+      "loss": 0.6681,
+      "step": 496
+    },
+    {
+      "epoch": 0.6402576489533012,
+      "grad_norm": 0.585269033908844,
+      "learning_rate": 7.263294422827497e-05,
+      "loss": 0.7604,
+      "step": 497
+    },
+    {
+      "epoch": 0.6415458937198067,
+      "grad_norm": 0.5380440950393677,
+      "learning_rate": 7.237354085603112e-05,
+      "loss": 0.7946,
+      "step": 498
+    },
+    {
+      "epoch": 0.6428341384863124,
+      "grad_norm": 0.4413246214389801,
+      "learning_rate": 7.211413748378729e-05,
+      "loss": 0.5847,
+      "step": 499
+    },
+    {
+      "epoch": 0.644122383252818,
+      "grad_norm": 0.536934494972229,
+      "learning_rate": 7.185473411154345e-05,
+      "loss": 0.745,
+      "step": 500
+    },
+    {
+      "epoch": 0.6454106280193237,
+      "grad_norm": 0.46904176473617554,
+      "learning_rate": 7.159533073929962e-05,
+      "loss": 0.6846,
+      "step": 501
+    },
+    {
+      "epoch": 0.6466988727858293,
+      "grad_norm": 0.5345873832702637,
+      "learning_rate": 7.133592736705578e-05,
+      "loss": 0.7499,
+      "step": 502
+    },
+    {
+      "epoch": 0.647987117552335,
+      "grad_norm": 0.5083842873573303,
+      "learning_rate": 7.107652399481195e-05,
+      "loss": 0.7829,
+      "step": 503
+    },
+    {
+      "epoch": 0.6492753623188405,
+      "grad_norm": 0.49629780650138855,
+      "learning_rate": 7.08171206225681e-05,
+      "loss": 0.6308,
+      "step": 504
+    },
+    {
+      "epoch": 0.6505636070853462,
+      "grad_norm": 0.5113663077354431,
+      "learning_rate": 7.055771725032426e-05,
+      "loss": 0.7062,
+      "step": 505
+    },
+    {
+      "epoch": 0.6518518518518519,
+      "grad_norm": 0.5348049402236938,
+      "learning_rate": 7.029831387808041e-05,
+      "loss": 0.7495,
+      "step": 506
+    },
+    {
+      "epoch": 0.6531400966183575,
+      "grad_norm": 0.5834509134292603,
+      "learning_rate": 7.003891050583658e-05,
+      "loss": 0.81,
+      "step": 507
+    },
+    {
+      "epoch": 0.6544283413848632,
+      "grad_norm": 0.5517732501029968,
+      "learning_rate": 6.977950713359274e-05,
+      "loss": 0.7376,
+      "step": 508
+    },
+    {
+      "epoch": 0.6557165861513687,
+      "grad_norm": 0.5555460453033447,
+      "learning_rate": 6.95201037613489e-05,
+      "loss": 0.6707,
+      "step": 509
+    },
+    {
+      "epoch": 0.6570048309178744,
+      "grad_norm": 0.5952188968658447,
+      "learning_rate": 6.926070038910505e-05,
+      "loss": 0.8308,
+      "step": 510
+    },
+    {
+      "epoch": 0.65829307568438,
+      "grad_norm": 0.46281638741493225,
+      "learning_rate": 6.900129701686122e-05,
+      "loss": 0.5855,
+      "step": 511
+    },
+    {
+      "epoch": 0.6595813204508857,
+      "grad_norm": 0.5051981210708618,
+      "learning_rate": 6.874189364461738e-05,
+      "loss": 0.7197,
+      "step": 512
+    },
+    {
+      "epoch": 0.6608695652173913,
+      "grad_norm": 0.5460030436515808,
+      "learning_rate": 6.848249027237355e-05,
+      "loss": 0.6863,
+      "step": 513
+    },
+    {
+      "epoch": 0.662157809983897,
+      "grad_norm": 0.504718542098999,
+      "learning_rate": 6.822308690012971e-05,
+      "loss": 0.5749,
+      "step": 514
+    },
+    {
+      "epoch": 0.6634460547504025,
+      "grad_norm": 0.5503727793693542,
+      "learning_rate": 6.796368352788586e-05,
+      "loss": 0.6802,
+      "step": 515
+    },
+    {
+      "epoch": 0.6647342995169082,
+      "grad_norm": 0.559354305267334,
+      "learning_rate": 6.770428015564203e-05,
+      "loss": 0.6774,
+      "step": 516
+    },
+    {
+      "epoch": 0.6660225442834139,
+      "grad_norm": 0.5191950798034668,
+      "learning_rate": 6.744487678339819e-05,
+      "loss": 0.5853,
+      "step": 517
+    },
+    {
+      "epoch": 0.6673107890499195,
+      "grad_norm": 0.5837051868438721,
+      "learning_rate": 6.718547341115434e-05,
+      "loss": 0.7629,
+      "step": 518
+    },
+    {
+      "epoch": 0.6685990338164252,
+      "grad_norm": 0.49824637174606323,
+      "learning_rate": 6.69260700389105e-05,
+      "loss": 0.6449,
+      "step": 519
+    },
+    {
+      "epoch": 0.6698872785829307,
+      "grad_norm": 0.5827267169952393,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.6425,
+      "step": 520
+    },
+    {
+      "epoch": 0.6711755233494364,
+      "grad_norm": 0.5547000169754028,
+      "learning_rate": 6.640726329442283e-05,
+      "loss": 0.8105,
+      "step": 521
+    },
+    {
+      "epoch": 0.672463768115942,
+      "grad_norm": 0.5251694321632385,
+      "learning_rate": 6.614785992217898e-05,
+      "loss": 0.6392,
+      "step": 522
+    },
+    {
+      "epoch": 0.6737520128824477,
+      "grad_norm": 0.577367901802063,
+      "learning_rate": 6.588845654993515e-05,
+      "loss": 0.7776,
+      "step": 523
+    },
+    {
+      "epoch": 0.6750402576489533,
+      "grad_norm": 0.5495286583900452,
+      "learning_rate": 6.562905317769131e-05,
+      "loss": 0.6673,
+      "step": 524
+    },
+    {
+      "epoch": 0.6763285024154589,
+      "grad_norm": 0.6513116955757141,
+      "learning_rate": 6.536964980544748e-05,
+      "loss": 0.8314,
+      "step": 525
+    },
+    {
+      "epoch": 0.6776167471819645,
+      "grad_norm": 0.5346915125846863,
+      "learning_rate": 6.511024643320364e-05,
+      "loss": 0.703,
+      "step": 526
+    },
+    {
+      "epoch": 0.6789049919484702,
+      "grad_norm": 0.5663869380950928,
+      "learning_rate": 6.485084306095979e-05,
+      "loss": 0.6595,
+      "step": 527
+    },
+    {
+      "epoch": 0.6801932367149759,
+      "grad_norm": 0.5390554070472717,
+      "learning_rate": 6.459143968871596e-05,
+      "loss": 0.7031,
+      "step": 528
+    },
+    {
+      "epoch": 0.6814814814814815,
+      "grad_norm": 0.5291828513145447,
+      "learning_rate": 6.433203631647212e-05,
+      "loss": 0.7384,
+      "step": 529
+    },
+    {
+      "epoch": 0.6827697262479872,
+      "grad_norm": 0.507726788520813,
+      "learning_rate": 6.407263294422829e-05,
+      "loss": 0.6169,
+      "step": 530
+    },
+    {
+      "epoch": 0.6840579710144927,
+      "grad_norm": 0.524138331413269,
+      "learning_rate": 6.381322957198444e-05,
+      "loss": 0.6784,
+      "step": 531
+    },
+    {
+      "epoch": 0.6853462157809984,
+      "grad_norm": 0.5644485950469971,
+      "learning_rate": 6.35538261997406e-05,
+      "loss": 0.8255,
+      "step": 532
+    },
+    {
+      "epoch": 0.686634460547504,
+      "grad_norm": 0.5468744039535522,
+      "learning_rate": 6.329442282749676e-05,
+      "loss": 0.7893,
+      "step": 533
+    },
+    {
+      "epoch": 0.6879227053140097,
+      "grad_norm": 0.4952101409435272,
+      "learning_rate": 6.303501945525292e-05,
+      "loss": 0.6192,
+      "step": 534
+    },
+    {
+      "epoch": 0.6892109500805152,
+      "grad_norm": 0.5614569187164307,
+      "learning_rate": 6.277561608300908e-05,
+      "loss": 0.7055,
+      "step": 535
+    },
+    {
+      "epoch": 0.6904991948470209,
+      "grad_norm": 0.5651270151138306,
+      "learning_rate": 6.251621271076524e-05,
+      "loss": 0.7327,
+      "step": 536
+    },
+    {
+      "epoch": 0.6917874396135266,
+      "grad_norm": 0.5416032075881958,
+      "learning_rate": 6.225680933852141e-05,
+      "loss": 0.64,
+      "step": 537
+    },
+    {
+      "epoch": 0.6930756843800322,
+      "grad_norm": 0.6302821636199951,
+      "learning_rate": 6.199740596627757e-05,
+      "loss": 0.8542,
+      "step": 538
+    },
+    {
+      "epoch": 0.6943639291465379,
+      "grad_norm": 0.5361074805259705,
+      "learning_rate": 6.173800259403372e-05,
+      "loss": 0.6282,
+      "step": 539
+    },
+    {
+      "epoch": 0.6956521739130435,
+      "grad_norm": 0.5210204124450684,
+      "learning_rate": 6.147859922178989e-05,
+      "loss": 0.8354,
+      "step": 540
+    },
+    {
+      "epoch": 0.6969404186795491,
+      "grad_norm": 0.5401708483695984,
+      "learning_rate": 6.121919584954605e-05,
+      "loss": 0.6165,
+      "step": 541
+    },
+    {
+      "epoch": 0.6982286634460547,
+      "grad_norm": 0.516559362411499,
+      "learning_rate": 6.0959792477302215e-05,
+      "loss": 0.5498,
+      "step": 542
+    },
+    {
+      "epoch": 0.6995169082125604,
+      "grad_norm": 0.5983400344848633,
+      "learning_rate": 6.0700389105058366e-05,
+      "loss": 0.7538,
+      "step": 543
+    },
+    {
+      "epoch": 0.700805152979066,
+      "grad_norm": 0.5111982226371765,
+      "learning_rate": 6.0440985732814524e-05,
+      "loss": 0.6156,
+      "step": 544
+    },
+    {
+      "epoch": 0.7020933977455717,
+      "grad_norm": 0.5821353793144226,
+      "learning_rate": 6.018158236057069e-05,
+      "loss": 0.6417,
+      "step": 545
+    },
+    {
+      "epoch": 0.7033816425120772,
+      "grad_norm": 0.4738411009311676,
+      "learning_rate": 5.992217898832685e-05,
+      "loss": 0.6541,
+      "step": 546
+    },
+    {
+      "epoch": 0.7046698872785829,
+      "grad_norm": 0.6165397763252258,
+      "learning_rate": 5.966277561608301e-05,
+      "loss": 0.7366,
+      "step": 547
+    },
+    {
+      "epoch": 0.7059581320450886,
+      "grad_norm": 0.5883972644805908,
+      "learning_rate": 5.9403372243839174e-05,
+      "loss": 0.7371,
+      "step": 548
+    },
+    {
+      "epoch": 0.7072463768115942,
+      "grad_norm": 0.5415938496589661,
+      "learning_rate": 5.914396887159533e-05,
+      "loss": 0.6334,
+      "step": 549
+    },
+    {
+      "epoch": 0.7085346215780999,
+      "grad_norm": 0.5565886497497559,
+      "learning_rate": 5.8884565499351496e-05,
+      "loss": 0.7425,
+      "step": 550
+    },
+    {
+      "epoch": 0.7098228663446055,
+      "grad_norm": 0.6447110772132874,
+      "learning_rate": 5.862516212710766e-05,
+      "loss": 0.8405,
+      "step": 551
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.6419034004211426,
+      "learning_rate": 5.836575875486382e-05,
+      "loss": 0.8779,
+      "step": 552
+    },
+    {
+      "epoch": 0.7123993558776167,
+      "grad_norm": 0.4611152708530426,
+      "learning_rate": 5.810635538261998e-05,
+      "loss": 0.5832,
+      "step": 553
+    },
+    {
+      "epoch": 0.7136876006441224,
+      "grad_norm": 0.6436396837234497,
+      "learning_rate": 5.7846952010376146e-05,
+      "loss": 0.8172,
+      "step": 554
+    },
+    {
+      "epoch": 0.714975845410628,
+      "grad_norm": 0.5647209286689758,
+      "learning_rate": 5.7587548638132296e-05,
+      "loss": 0.6637,
+      "step": 555
+    },
+    {
+      "epoch": 0.7162640901771337,
+      "grad_norm": 0.5272210240364075,
+      "learning_rate": 5.7328145265888454e-05,
+      "loss": 0.6103,
+      "step": 556
+    },
+    {
+      "epoch": 0.7175523349436392,
+      "grad_norm": 0.5229634046554565,
+      "learning_rate": 5.706874189364462e-05,
+      "loss": 0.5514,
+      "step": 557
+    },
+    {
+      "epoch": 0.7188405797101449,
+      "grad_norm": 0.6116520166397095,
+      "learning_rate": 5.6809338521400776e-05,
+      "loss": 0.8535,
+      "step": 558
+    },
+    {
+      "epoch": 0.7201288244766506,
+      "grad_norm": 0.5706294178962708,
+      "learning_rate": 5.654993514915694e-05,
+      "loss": 0.7606,
+      "step": 559
+    },
+    {
+      "epoch": 0.7214170692431562,
+      "grad_norm": 0.6013360619544983,
+      "learning_rate": 5.6290531776913104e-05,
+      "loss": 0.7887,
+      "step": 560
+    },
+    {
+      "epoch": 0.7227053140096619,
+      "grad_norm": 0.5661988258361816,
+      "learning_rate": 5.603112840466926e-05,
+      "loss": 0.9302,
+      "step": 561
+    },
+    {
+      "epoch": 0.7239935587761674,
+      "grad_norm": 0.5267884135246277,
+      "learning_rate": 5.5771725032425426e-05,
+      "loss": 0.7534,
+      "step": 562
+    },
+    {
+      "epoch": 0.7252818035426731,
+      "grad_norm": 0.4822220504283905,
+      "learning_rate": 5.551232166018159e-05,
+      "loss": 0.5666,
+      "step": 563
+    },
+    {
+      "epoch": 0.7265700483091787,
+      "grad_norm": 0.5841349363327026,
+      "learning_rate": 5.525291828793775e-05,
+      "loss": 0.7755,
+      "step": 564
+    },
+    {
+      "epoch": 0.7278582930756844,
+      "grad_norm": 0.5259692072868347,
+      "learning_rate": 5.499351491569391e-05,
+      "loss": 0.7599,
+      "step": 565
+    },
+    {
+      "epoch": 0.72914653784219,
+      "grad_norm": 0.5511097311973572,
+      "learning_rate": 5.4734111543450076e-05,
+      "loss": 0.6198,
+      "step": 566
+    },
+    {
+      "epoch": 0.7304347826086957,
+      "grad_norm": 0.5707940459251404,
+      "learning_rate": 5.447470817120623e-05,
+      "loss": 0.7669,
+      "step": 567
+    },
+    {
+      "epoch": 0.7317230273752013,
+      "grad_norm": 0.6099474430084229,
+      "learning_rate": 5.4215304798962384e-05,
+      "loss": 0.7636,
+      "step": 568
+    },
+    {
+      "epoch": 0.7330112721417069,
+      "grad_norm": 0.4825986623764038,
+      "learning_rate": 5.395590142671855e-05,
+      "loss": 0.5758,
+      "step": 569
+    },
+    {
+      "epoch": 0.7342995169082126,
+      "grad_norm": 0.457233190536499,
+      "learning_rate": 5.3696498054474706e-05,
+      "loss": 0.5534,
+      "step": 570
+    },
+    {
+      "epoch": 0.7355877616747182,
+      "grad_norm": 0.5602165460586548,
+      "learning_rate": 5.343709468223087e-05,
+      "loss": 0.5802,
+      "step": 571
+    },
+    {
+      "epoch": 0.7368760064412239,
+      "grad_norm": 0.6400203108787537,
+      "learning_rate": 5.3177691309987034e-05,
+      "loss": 0.9281,
+      "step": 572
+    },
+    {
+      "epoch": 0.7381642512077294,
+      "grad_norm": 0.4856846332550049,
+      "learning_rate": 5.291828793774319e-05,
+      "loss": 0.6179,
+      "step": 573
+    },
+    {
+      "epoch": 0.7394524959742351,
+      "grad_norm": 0.5459800958633423,
+      "learning_rate": 5.2658884565499356e-05,
+      "loss": 0.7225,
+      "step": 574
+    },
+    {
+      "epoch": 0.7407407407407407,
+      "grad_norm": 0.5425988435745239,
+      "learning_rate": 5.239948119325552e-05,
+      "loss": 0.7091,
+      "step": 575
+    },
+    {
+      "epoch": 0.7420289855072464,
+      "grad_norm": 0.6519750356674194,
+      "learning_rate": 5.214007782101168e-05,
+      "loss": 0.7549,
+      "step": 576
+    },
+    {
+      "epoch": 0.743317230273752,
+      "grad_norm": 0.5276802778244019,
+      "learning_rate": 5.188067444876784e-05,
+      "loss": 0.5972,
+      "step": 577
+    },
+    {
+      "epoch": 0.7446054750402576,
+      "grad_norm": 0.5245682001113892,
+      "learning_rate": 5.1621271076524e-05,
+      "loss": 0.631,
+      "step": 578
+    },
+    {
+      "epoch": 0.7458937198067633,
+      "grad_norm": 0.574123203754425,
+      "learning_rate": 5.136186770428015e-05,
+      "loss": 0.8617,
+      "step": 579
+    },
+    {
+      "epoch": 0.7471819645732689,
+      "grad_norm": 0.5302646160125732,
+      "learning_rate": 5.1102464332036315e-05,
+      "loss": 0.682,
+      "step": 580
+    },
+    {
+      "epoch": 0.7484702093397746,
+      "grad_norm": 0.528005063533783,
+      "learning_rate": 5.084306095979248e-05,
+      "loss": 0.5872,
+      "step": 581
+    },
+    {
+      "epoch": 0.7497584541062802,
+      "grad_norm": 0.5490335822105408,
+      "learning_rate": 5.0583657587548636e-05,
+      "loss": 0.571,
+      "step": 582
+    },
+    {
+      "epoch": 0.7510466988727859,
+      "grad_norm": 0.5383925437927246,
+      "learning_rate": 5.03242542153048e-05,
+      "loss": 0.5694,
+      "step": 583
+    },
+    {
+      "epoch": 0.7523349436392914,
+      "grad_norm": 0.5377727150917053,
+      "learning_rate": 5.0064850843060965e-05,
+      "loss": 0.7569,
+      "step": 584
+    },
+    {
+      "epoch": 0.7536231884057971,
+      "grad_norm": 0.5144835710525513,
+      "learning_rate": 4.980544747081712e-05,
+      "loss": 0.6205,
+      "step": 585
+    },
+    {
+      "epoch": 0.7549114331723027,
+      "grad_norm": 0.5208680033683777,
+      "learning_rate": 4.9546044098573286e-05,
+      "loss": 0.6205,
+      "step": 586
+    },
+    {
+      "epoch": 0.7561996779388084,
+      "grad_norm": 0.6118026375770569,
+      "learning_rate": 4.9286640726329444e-05,
+      "loss": 0.8551,
+      "step": 587
+    },
+    {
+      "epoch": 0.7574879227053141,
+      "grad_norm": 0.5483299493789673,
+      "learning_rate": 4.90272373540856e-05,
+      "loss": 0.6421,
+      "step": 588
+    },
+    {
+      "epoch": 0.7587761674718196,
+      "grad_norm": 0.5909067392349243,
+      "learning_rate": 4.8767833981841766e-05,
+      "loss": 0.7131,
+      "step": 589
+    },
+    {
+      "epoch": 0.7600644122383253,
+      "grad_norm": 0.46308815479278564,
+      "learning_rate": 4.850843060959793e-05,
+      "loss": 0.472,
+      "step": 590
+    },
+    {
+      "epoch": 0.7613526570048309,
+      "grad_norm": 0.6142207384109497,
+      "learning_rate": 4.824902723735409e-05,
+      "loss": 0.6328,
+      "step": 591
+    },
+    {
+      "epoch": 0.7626409017713366,
+      "grad_norm": 0.6880104541778564,
+      "learning_rate": 4.798962386511025e-05,
+      "loss": 0.838,
+      "step": 592
+    },
+    {
+      "epoch": 0.7639291465378422,
+      "grad_norm": 0.5680267214775085,
+      "learning_rate": 4.773022049286641e-05,
+      "loss": 0.7676,
+      "step": 593
+    },
+    {
+      "epoch": 0.7652173913043478,
+      "grad_norm": 0.5472928881645203,
+      "learning_rate": 4.7470817120622567e-05,
+      "loss": 0.7345,
+      "step": 594
+    },
+    {
+      "epoch": 0.7665056360708534,
+      "grad_norm": 0.6231438517570496,
+      "learning_rate": 4.721141374837873e-05,
+      "loss": 0.7582,
+      "step": 595
+    },
+    {
+      "epoch": 0.7677938808373591,
+      "grad_norm": 0.5030277371406555,
+      "learning_rate": 4.6952010376134895e-05,
+      "loss": 0.5928,
+      "step": 596
+    },
+    {
+      "epoch": 0.7690821256038647,
+      "grad_norm": 0.6041036248207092,
+      "learning_rate": 4.669260700389105e-05,
+      "loss": 0.6439,
+      "step": 597
+    },
+    {
+      "epoch": 0.7703703703703704,
+      "grad_norm": 0.5044084191322327,
+      "learning_rate": 4.643320363164722e-05,
+      "loss": 0.5618,
+      "step": 598
+    },
+    {
+      "epoch": 0.7716586151368761,
+      "grad_norm": 0.5399697422981262,
+      "learning_rate": 4.6173800259403374e-05,
+      "loss": 0.6366,
+      "step": 599
+    },
+    {
+      "epoch": 0.7729468599033816,
+      "grad_norm": 0.496896892786026,
+      "learning_rate": 4.591439688715953e-05,
+      "loss": 0.5864,
+      "step": 600
+    },
+    {
+      "epoch": 0.7742351046698873,
+      "grad_norm": 0.46158090233802795,
+      "learning_rate": 4.5654993514915696e-05,
+      "loss": 0.609,
+      "step": 601
+    },
+    {
+      "epoch": 0.7755233494363929,
+      "grad_norm": 0.5886946320533752,
+      "learning_rate": 4.539559014267185e-05,
+      "loss": 0.7699,
+      "step": 602
+    },
+    {
+      "epoch": 0.7768115942028986,
+      "grad_norm": 0.5680760145187378,
+      "learning_rate": 4.513618677042802e-05,
+      "loss": 0.8665,
+      "step": 603
+    },
+    {
+      "epoch": 0.7780998389694042,
+      "grad_norm": 0.5787962675094604,
+      "learning_rate": 4.487678339818418e-05,
+      "loss": 0.6942,
+      "step": 604
+    },
+    {
+      "epoch": 0.7793880837359098,
+      "grad_norm": 0.6179983615875244,
+      "learning_rate": 4.461738002594034e-05,
+      "loss": 0.7403,
+      "step": 605
+    },
+    {
+      "epoch": 0.7806763285024154,
+      "grad_norm": 0.5327017903327942,
+      "learning_rate": 4.43579766536965e-05,
+      "loss": 0.6714,
+      "step": 606
+    },
+    {
+      "epoch": 0.7819645732689211,
+      "grad_norm": 0.5620171427726746,
+      "learning_rate": 4.409857328145266e-05,
+      "loss": 0.6706,
+      "step": 607
+    },
+    {
+      "epoch": 0.7832528180354267,
+      "grad_norm": 0.5355799794197083,
+      "learning_rate": 4.383916990920882e-05,
+      "loss": 0.6042,
+      "step": 608
+    },
+    {
+      "epoch": 0.7845410628019324,
+      "grad_norm": 0.692477285861969,
+      "learning_rate": 4.357976653696498e-05,
+      "loss": 0.7384,
+      "step": 609
+    },
+    {
+      "epoch": 0.785829307568438,
+      "grad_norm": 0.5491352081298828,
+      "learning_rate": 4.332036316472115e-05,
+      "loss": 0.6315,
+      "step": 610
+    },
+    {
+      "epoch": 0.7871175523349436,
+      "grad_norm": 0.6350588202476501,
+      "learning_rate": 4.3060959792477304e-05,
+      "loss": 0.7889,
+      "step": 611
+    },
+    {
+      "epoch": 0.7884057971014493,
+      "grad_norm": 0.5784136652946472,
+      "learning_rate": 4.280155642023346e-05,
+      "loss": 0.7421,
+      "step": 612
+    },
+    {
+      "epoch": 0.7896940418679549,
+      "grad_norm": 0.55226069688797,
+      "learning_rate": 4.2542153047989626e-05,
+      "loss": 0.7752,
+      "step": 613
+    },
+    {
+      "epoch": 0.7909822866344606,
+      "grad_norm": 0.541728138923645,
+      "learning_rate": 4.2282749675745784e-05,
+      "loss": 0.6681,
+      "step": 614
+    },
+    {
+      "epoch": 0.7922705314009661,
+      "grad_norm": 0.4921126067638397,
+      "learning_rate": 4.202334630350195e-05,
+      "loss": 0.5087,
+      "step": 615
+    },
+    {
+      "epoch": 0.7935587761674718,
+      "grad_norm": 0.5723814368247986,
+      "learning_rate": 4.176394293125811e-05,
+      "loss": 0.7275,
+      "step": 616
+    },
+    {
+      "epoch": 0.7948470209339774,
+      "grad_norm": 0.5064358115196228,
+      "learning_rate": 4.150453955901427e-05,
+      "loss": 0.6857,
+      "step": 617
+    },
+    {
+      "epoch": 0.7961352657004831,
+      "grad_norm": 0.495473176240921,
+      "learning_rate": 4.124513618677043e-05,
+      "loss": 0.5766,
+      "step": 618
+    },
+    {
+      "epoch": 0.7974235104669888,
+      "grad_norm": 0.47758999466896057,
+      "learning_rate": 4.098573281452659e-05,
+      "loss": 0.5152,
+      "step": 619
+    },
+    {
+      "epoch": 0.7987117552334944,
+      "grad_norm": 0.5546131730079651,
+      "learning_rate": 4.072632944228275e-05,
+      "loss": 0.5477,
+      "step": 620
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.637289822101593,
+      "learning_rate": 4.046692607003891e-05,
+      "loss": 0.8712,
+      "step": 621
+    },
+    {
+      "epoch": 0.8012882447665056,
+      "grad_norm": 0.6441432237625122,
+      "learning_rate": 4.020752269779508e-05,
+      "loss": 0.751,
+      "step": 622
+    },
+    {
+      "epoch": 0.8025764895330113,
+      "grad_norm": 0.5846959352493286,
+      "learning_rate": 3.9948119325551235e-05,
+      "loss": 0.764,
+      "step": 623
+    },
+    {
+      "epoch": 0.8038647342995169,
+      "grad_norm": 0.5156934261322021,
+      "learning_rate": 3.968871595330739e-05,
+      "loss": 0.6254,
+      "step": 624
+    },
+    {
+      "epoch": 0.8051529790660226,
+      "grad_norm": 0.5897034406661987,
+      "learning_rate": 3.9429312581063556e-05,
+      "loss": 0.6232,
+      "step": 625
+    },
+    {
+      "epoch": 0.8064412238325281,
+      "grad_norm": 0.6254003643989563,
+      "learning_rate": 3.9169909208819714e-05,
+      "loss": 0.7375,
+      "step": 626
+    },
+    {
+      "epoch": 0.8077294685990338,
+      "grad_norm": 0.5816264152526855,
+      "learning_rate": 3.891050583657588e-05,
+      "loss": 0.752,
+      "step": 627
+    },
+    {
+      "epoch": 0.8090177133655394,
+      "grad_norm": 0.570949912071228,
+      "learning_rate": 3.865110246433204e-05,
+      "loss": 0.6546,
+      "step": 628
+    },
+    {
+      "epoch": 0.8103059581320451,
+      "grad_norm": 0.5094951391220093,
+      "learning_rate": 3.83916990920882e-05,
+      "loss": 0.5009,
+      "step": 629
+    },
+    {
+      "epoch": 0.8115942028985508,
+      "grad_norm": 0.6055474281311035,
+      "learning_rate": 3.813229571984436e-05,
+      "loss": 0.7338,
+      "step": 630
+    },
+    {
+      "epoch": 0.8128824476650564,
+      "grad_norm": 0.5392929911613464,
+      "learning_rate": 3.787289234760052e-05,
+      "loss": 0.563,
+      "step": 631
+    },
+    {
+      "epoch": 0.814170692431562,
+      "grad_norm": 0.6130269765853882,
+      "learning_rate": 3.761348897535668e-05,
+      "loss": 0.8099,
+      "step": 632
+    },
+    {
+      "epoch": 0.8154589371980676,
+      "grad_norm": 0.6383023262023926,
+      "learning_rate": 3.735408560311284e-05,
+      "loss": 0.5776,
+      "step": 633
+    },
+    {
+      "epoch": 0.8167471819645733,
+      "grad_norm": 0.4606671929359436,
+      "learning_rate": 3.709468223086901e-05,
+      "loss": 0.5352,
+      "step": 634
+    },
+    {
+      "epoch": 0.8180354267310789,
+      "grad_norm": 0.5078199505805969,
+      "learning_rate": 3.6835278858625165e-05,
+      "loss": 0.5113,
+      "step": 635
+    },
+    {
+      "epoch": 0.8193236714975846,
+      "grad_norm": 0.5447183847427368,
+      "learning_rate": 3.657587548638132e-05,
+      "loss": 0.6092,
+      "step": 636
+    },
+    {
+      "epoch": 0.8206119162640901,
+      "grad_norm": 0.632024884223938,
+      "learning_rate": 3.631647211413749e-05,
+      "loss": 0.7237,
+      "step": 637
+    },
+    {
+      "epoch": 0.8219001610305958,
+      "grad_norm": 0.5381270051002502,
+      "learning_rate": 3.6057068741893644e-05,
+      "loss": 0.6391,
+      "step": 638
+    },
+    {
+      "epoch": 0.8231884057971014,
+      "grad_norm": 0.5342917442321777,
+      "learning_rate": 3.579766536964981e-05,
+      "loss": 0.672,
+      "step": 639
+    },
+    {
+      "epoch": 0.8244766505636071,
+      "grad_norm": 0.6121646761894226,
+      "learning_rate": 3.553826199740597e-05,
+      "loss": 0.8898,
+      "step": 640
+    },
+    {
+      "epoch": 0.8257648953301128,
+      "grad_norm": 0.6056507229804993,
+      "learning_rate": 3.527885862516213e-05,
+      "loss": 0.585,
+      "step": 641
+    },
+    {
+      "epoch": 0.8270531400966183,
+      "grad_norm": 0.5895273685455322,
+      "learning_rate": 3.501945525291829e-05,
+      "loss": 0.6135,
+      "step": 642
+    },
+    {
+      "epoch": 0.828341384863124,
+      "grad_norm": 0.5063283443450928,
+      "learning_rate": 3.476005188067445e-05,
+      "loss": 0.4939,
+      "step": 643
+    },
+    {
+      "epoch": 0.8296296296296296,
+      "grad_norm": 0.5781770348548889,
+      "learning_rate": 3.450064850843061e-05,
+      "loss": 0.7322,
+      "step": 644
+    },
+    {
+      "epoch": 0.8309178743961353,
+      "grad_norm": 0.5424814820289612,
+      "learning_rate": 3.4241245136186774e-05,
+      "loss": 0.6702,
+      "step": 645
+    },
+    {
+      "epoch": 0.8322061191626409,
+      "grad_norm": 0.5998700857162476,
+      "learning_rate": 3.398184176394293e-05,
+      "loss": 0.6788,
+      "step": 646
+    },
+    {
+      "epoch": 0.8334943639291466,
+      "grad_norm": 0.614637017250061,
+      "learning_rate": 3.3722438391699095e-05,
+      "loss": 0.7391,
+      "step": 647
+    },
+    {
+      "epoch": 0.8347826086956521,
+      "grad_norm": 0.6503768563270569,
+      "learning_rate": 3.346303501945525e-05,
+      "loss": 0.7648,
+      "step": 648
+    },
+    {
+      "epoch": 0.8360708534621578,
+      "grad_norm": 0.5270184874534607,
+      "learning_rate": 3.320363164721142e-05,
+      "loss": 0.504,
+      "step": 649
+    },
+    {
+      "epoch": 0.8373590982286635,
+      "grad_norm": 0.5014241337776184,
+      "learning_rate": 3.2944228274967575e-05,
+      "loss": 0.5252,
+      "step": 650
+    },
+    {
+      "epoch": 0.8386473429951691,
+      "grad_norm": 0.5668673515319824,
+      "learning_rate": 3.268482490272374e-05,
+      "loss": 0.6537,
+      "step": 651
+    },
+    {
+      "epoch": 0.8399355877616748,
+      "grad_norm": 0.5789865255355835,
+      "learning_rate": 3.2425421530479896e-05,
+      "loss": 0.8012,
+      "step": 652
+    },
+    {
+      "epoch": 0.8412238325281803,
+      "grad_norm": 0.6261132955551147,
+      "learning_rate": 3.216601815823606e-05,
+      "loss": 0.6832,
+      "step": 653
+    },
+    {
+      "epoch": 0.842512077294686,
+      "grad_norm": 0.5914183855056763,
+      "learning_rate": 3.190661478599222e-05,
+      "loss": 0.5653,
+      "step": 654
+    },
+    {
+      "epoch": 0.8438003220611916,
+      "grad_norm": 0.5597856044769287,
+      "learning_rate": 3.164721141374838e-05,
+      "loss": 0.557,
+      "step": 655
+    },
+    {
+      "epoch": 0.8450885668276973,
+      "grad_norm": 0.5060226917266846,
+      "learning_rate": 3.138780804150454e-05,
+      "loss": 0.6559,
+      "step": 656
+    },
+    {
+      "epoch": 0.8463768115942029,
+      "grad_norm": 0.6236748695373535,
+      "learning_rate": 3.1128404669260704e-05,
+      "loss": 0.7372,
+      "step": 657
+    },
+    {
+      "epoch": 0.8476650563607085,
+      "grad_norm": 0.5138527750968933,
+      "learning_rate": 3.086900129701686e-05,
+      "loss": 0.6031,
+      "step": 658
+    },
+    {
+      "epoch": 0.8489533011272141,
+      "grad_norm": 0.5962822437286377,
+      "learning_rate": 3.0609597924773026e-05,
+      "loss": 0.7374,
+      "step": 659
+    },
+    {
+      "epoch": 0.8502415458937198,
+      "grad_norm": 0.4833110272884369,
+      "learning_rate": 3.0350194552529183e-05,
+      "loss": 0.4742,
+      "step": 660
+    },
+    {
+      "epoch": 0.8515297906602255,
+      "grad_norm": 0.5980967283248901,
+      "learning_rate": 3.0090791180285344e-05,
+      "loss": 0.7025,
+      "step": 661
+    },
+    {
+      "epoch": 0.8528180354267311,
+      "grad_norm": 0.6031454801559448,
+      "learning_rate": 2.9831387808041505e-05,
+      "loss": 0.8479,
+      "step": 662
+    },
+    {
+      "epoch": 0.8541062801932368,
+      "grad_norm": 0.5824582576751709,
+      "learning_rate": 2.9571984435797666e-05,
+      "loss": 0.7073,
+      "step": 663
+    },
+    {
+      "epoch": 0.8553945249597423,
+      "grad_norm": 0.6369014978408813,
+      "learning_rate": 2.931258106355383e-05,
+      "loss": 0.864,
+      "step": 664
+    },
+    {
+      "epoch": 0.856682769726248,
+      "grad_norm": 0.554784893989563,
+      "learning_rate": 2.905317769130999e-05,
+      "loss": 0.6705,
+      "step": 665
+    },
+    {
+      "epoch": 0.8579710144927536,
+      "grad_norm": 0.5656050443649292,
+      "learning_rate": 2.8793774319066148e-05,
+      "loss": 0.6375,
+      "step": 666
+    },
+    {
+      "epoch": 0.8592592592592593,
+      "grad_norm": 0.6191110014915466,
+      "learning_rate": 2.853437094682231e-05,
+      "loss": 0.7277,
+      "step": 667
+    },
+    {
+      "epoch": 0.8605475040257649,
+      "grad_norm": 0.6224331855773926,
+      "learning_rate": 2.827496757457847e-05,
+      "loss": 0.6809,
+      "step": 668
+    },
+    {
+      "epoch": 0.8618357487922705,
+      "grad_norm": 0.6049439311027527,
+      "learning_rate": 2.801556420233463e-05,
+      "loss": 0.8089,
+      "step": 669
+    },
+    {
+      "epoch": 0.8631239935587761,
+      "grad_norm": 0.5969856977462769,
+      "learning_rate": 2.7756160830090795e-05,
+      "loss": 0.8102,
+      "step": 670
+    },
+    {
+      "epoch": 0.8644122383252818,
+      "grad_norm": 0.6787256002426147,
+      "learning_rate": 2.7496757457846956e-05,
+      "loss": 0.7602,
+      "step": 671
+    },
+    {
+      "epoch": 0.8657004830917875,
+      "grad_norm": 0.6535263061523438,
+      "learning_rate": 2.7237354085603113e-05,
+      "loss": 0.6816,
+      "step": 672
+    },
+    {
+      "epoch": 0.8669887278582931,
+      "grad_norm": 0.6893251538276672,
+      "learning_rate": 2.6977950713359274e-05,
+      "loss": 0.7271,
+      "step": 673
+    },
+    {
+      "epoch": 0.8682769726247987,
+      "grad_norm": 0.6239253282546997,
+      "learning_rate": 2.6718547341115435e-05,
+      "loss": 0.7995,
+      "step": 674
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.5103424787521362,
+      "learning_rate": 2.6459143968871596e-05,
+      "loss": 0.5603,
+      "step": 675
+    },
+    {
+      "epoch": 0.87085346215781,
+      "grad_norm": 0.558005690574646,
+      "learning_rate": 2.619974059662776e-05,
+      "loss": 0.5951,
+      "step": 676
+    },
+    {
+      "epoch": 0.8721417069243156,
+      "grad_norm": 0.5600588917732239,
+      "learning_rate": 2.594033722438392e-05,
+      "loss": 0.7646,
+      "step": 677
+    },
+    {
+      "epoch": 0.8734299516908213,
+      "grad_norm": 0.48512476682662964,
+      "learning_rate": 2.5680933852140075e-05,
+      "loss": 0.5015,
+      "step": 678
+    },
+    {
+      "epoch": 0.8747181964573268,
+      "grad_norm": 0.6091920733451843,
+      "learning_rate": 2.542153047989624e-05,
+      "loss": 0.8244,
+      "step": 679
+    },
+    {
+      "epoch": 0.8760064412238325,
+      "grad_norm": 0.49356287717819214,
+      "learning_rate": 2.51621271076524e-05,
+      "loss": 0.4808,
+      "step": 680
+    },
+    {
+      "epoch": 0.8772946859903382,
+      "grad_norm": 0.5376326441764832,
+      "learning_rate": 2.490272373540856e-05,
+      "loss": 0.5296,
+      "step": 681
+    },
+    {
+      "epoch": 0.8785829307568438,
+      "grad_norm": 0.611382782459259,
+      "learning_rate": 2.4643320363164722e-05,
+      "loss": 0.5765,
+      "step": 682
+    },
+    {
+      "epoch": 0.8798711755233495,
+      "grad_norm": 0.5653994083404541,
+      "learning_rate": 2.4383916990920883e-05,
+      "loss": 0.5864,
+      "step": 683
+    },
+    {
+      "epoch": 0.881159420289855,
+      "grad_norm": 0.48044463992118835,
+      "learning_rate": 2.4124513618677044e-05,
+      "loss": 0.6073,
+      "step": 684
+    },
+    {
+      "epoch": 0.8824476650563607,
+      "grad_norm": 0.6067565679550171,
+      "learning_rate": 2.3865110246433205e-05,
+      "loss": 0.6312,
+      "step": 685
+    },
+    {
+      "epoch": 0.8837359098228663,
+      "grad_norm": 0.5126189589500427,
+      "learning_rate": 2.3605706874189365e-05,
+      "loss": 0.5501,
+      "step": 686
+    },
+    {
+      "epoch": 0.885024154589372,
+      "grad_norm": 0.551137387752533,
+      "learning_rate": 2.3346303501945526e-05,
+      "loss": 0.578,
+      "step": 687
+    },
+    {
+      "epoch": 0.8863123993558776,
+      "grad_norm": 0.7072709202766418,
+      "learning_rate": 2.3086900129701687e-05,
+      "loss": 0.614,
+      "step": 688
+    },
+    {
+      "epoch": 0.8876006441223833,
+      "grad_norm": 0.6444385051727295,
+      "learning_rate": 2.2827496757457848e-05,
+      "loss": 0.6824,
+      "step": 689
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.5593189597129822,
+      "learning_rate": 2.256809338521401e-05,
+      "loss": 0.5942,
+      "step": 690
+    },
+    {
+      "epoch": 0.8901771336553945,
+      "grad_norm": 0.6002535223960876,
+      "learning_rate": 2.230869001297017e-05,
+      "loss": 0.7655,
+      "step": 691
+    },
+    {
+      "epoch": 0.8914653784219002,
+      "grad_norm": 0.6385635137557983,
+      "learning_rate": 2.204928664072633e-05,
+      "loss": 0.749,
+      "step": 692
+    },
+    {
+      "epoch": 0.8927536231884058,
+      "grad_norm": 0.5951741337776184,
+      "learning_rate": 2.178988326848249e-05,
+      "loss": 0.6708,
+      "step": 693
+    },
+    {
+      "epoch": 0.8940418679549115,
+      "grad_norm": 0.6050885915756226,
+      "learning_rate": 2.1530479896238652e-05,
+      "loss": 0.7473,
+      "step": 694
+    },
+    {
+      "epoch": 0.895330112721417,
+      "grad_norm": 0.570475161075592,
+      "learning_rate": 2.1271076523994813e-05,
+      "loss": 0.5881,
+      "step": 695
+    },
+    {
+      "epoch": 0.8966183574879227,
+      "grad_norm": 0.5623670816421509,
+      "learning_rate": 2.1011673151750974e-05,
+      "loss": 0.6613,
+      "step": 696
+    },
+    {
+      "epoch": 0.8979066022544283,
+      "grad_norm": 0.6884156465530396,
+      "learning_rate": 2.0752269779507135e-05,
+      "loss": 0.7917,
+      "step": 697
+    },
+    {
+      "epoch": 0.899194847020934,
+      "grad_norm": 0.6603716611862183,
+      "learning_rate": 2.0492866407263296e-05,
+      "loss": 0.6958,
+      "step": 698
+    },
+    {
+      "epoch": 0.9004830917874396,
+      "grad_norm": 0.6588467359542847,
+      "learning_rate": 2.0233463035019457e-05,
+      "loss": 0.5612,
+      "step": 699
+    },
+    {
+      "epoch": 0.9017713365539453,
+      "grad_norm": 0.5613631010055542,
+      "learning_rate": 1.9974059662775617e-05,
+      "loss": 0.6064,
+      "step": 700
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 776,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.98827660663808e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}