{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011111111111111111,
"grad_norm": 5.2032976150512695,
"learning_rate": 4.999984769144476e-05,
"loss": 5.3058,
"step": 1
},
{
"epoch": 0.0022222222222222222,
"grad_norm": 6.879838466644287,
"learning_rate": 4.999939076763487e-05,
"loss": 5.0848,
"step": 2
},
{
"epoch": 0.0033333333333333335,
"grad_norm": 6.035982131958008,
"learning_rate": 4.999862923413781e-05,
"loss": 5.5976,
"step": 3
},
{
"epoch": 0.0044444444444444444,
"grad_norm": 7.264829635620117,
"learning_rate": 4.999756310023261e-05,
"loss": 5.007,
"step": 4
},
{
"epoch": 0.005555555555555556,
"grad_norm": 4.736705303192139,
"learning_rate": 4.9996192378909786e-05,
"loss": 4.6123,
"step": 5
},
{
"epoch": 0.006666666666666667,
"grad_norm": 6.610605239868164,
"learning_rate": 4.999451708687114e-05,
"loss": 4.7884,
"step": 6
},
{
"epoch": 0.0077777777777777776,
"grad_norm": 6.082452774047852,
"learning_rate": 4.999253724452958e-05,
"loss": 4.913,
"step": 7
},
{
"epoch": 0.008888888888888889,
"grad_norm": 4.39306116104126,
"learning_rate": 4.999025287600886e-05,
"loss": 5.2733,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 4.614330291748047,
"learning_rate": 4.998766400914329e-05,
"loss": 4.6074,
"step": 9
},
{
"epoch": 0.011111111111111112,
"grad_norm": 5.944769859313965,
"learning_rate": 4.99847706754774e-05,
"loss": 4.9337,
"step": 10
},
{
"epoch": 0.012222222222222223,
"grad_norm": 7.276998519897461,
"learning_rate": 4.998157291026553e-05,
"loss": 5.0143,
"step": 11
},
{
"epoch": 0.013333333333333334,
"grad_norm": 5.569228172302246,
"learning_rate": 4.997807075247146e-05,
"loss": 5.1253,
"step": 12
},
{
"epoch": 0.014444444444444444,
"grad_norm": 5.123626232147217,
"learning_rate": 4.997426424476787e-05,
"loss": 4.4759,
"step": 13
},
{
"epoch": 0.015555555555555555,
"grad_norm": 4.314916133880615,
"learning_rate": 4.997015343353585e-05,
"loss": 5.193,
"step": 14
},
{
"epoch": 0.016666666666666666,
"grad_norm": 4.61911153793335,
"learning_rate": 4.996573836886435e-05,
"loss": 4.3899,
"step": 15
},
{
"epoch": 0.017777777777777778,
"grad_norm": 6.143403053283691,
"learning_rate": 4.996101910454953e-05,
"loss": 4.2176,
"step": 16
},
{
"epoch": 0.01888888888888889,
"grad_norm": 6.195430278778076,
"learning_rate": 4.995599569809414e-05,
"loss": 4.1796,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 5.779390335083008,
"learning_rate": 4.995066821070679e-05,
"loss": 5.0214,
"step": 18
},
{
"epoch": 0.021111111111111112,
"grad_norm": 5.847035884857178,
"learning_rate": 4.994503670730125e-05,
"loss": 4.5121,
"step": 19
},
{
"epoch": 0.022222222222222223,
"grad_norm": 5.528200626373291,
"learning_rate": 4.993910125649561e-05,
"loss": 4.2415,
"step": 20
},
{
"epoch": 0.023333333333333334,
"grad_norm": 5.237406253814697,
"learning_rate": 4.9932861930611454e-05,
"loss": 5.0282,
"step": 21
},
{
"epoch": 0.024444444444444446,
"grad_norm": 5.065497875213623,
"learning_rate": 4.992631880567301e-05,
"loss": 4.525,
"step": 22
},
{
"epoch": 0.025555555555555557,
"grad_norm": 5.5612688064575195,
"learning_rate": 4.991947196140618e-05,
"loss": 4.9982,
"step": 23
},
{
"epoch": 0.02666666666666667,
"grad_norm": 5.090909481048584,
"learning_rate": 4.991232148123761e-05,
"loss": 4.5534,
"step": 24
},
{
"epoch": 0.027777777777777776,
"grad_norm": 5.165072441101074,
"learning_rate": 4.990486745229364e-05,
"loss": 4.6862,
"step": 25
},
{
"epoch": 0.028888888888888888,
"grad_norm": 4.630911827087402,
"learning_rate": 4.989710996539926e-05,
"loss": 4.8492,
"step": 26
},
{
"epoch": 0.03,
"grad_norm": 3.68540358543396,
"learning_rate": 4.9889049115077005e-05,
"loss": 5.1254,
"step": 27
},
{
"epoch": 0.03111111111111111,
"grad_norm": 5.599917888641357,
"learning_rate": 4.988068499954578e-05,
"loss": 4.9527,
"step": 28
},
{
"epoch": 0.03222222222222222,
"grad_norm": 5.534451007843018,
"learning_rate": 4.987201772071971e-05,
"loss": 4.912,
"step": 29
},
{
"epoch": 0.03333333333333333,
"grad_norm": 4.299800395965576,
"learning_rate": 4.9863047384206835e-05,
"loss": 5.1243,
"step": 30
},
{
"epoch": 0.034444444444444444,
"grad_norm": 3.687239646911621,
"learning_rate": 4.985377409930789e-05,
"loss": 4.5257,
"step": 31
},
{
"epoch": 0.035555555555555556,
"grad_norm": 5.489537239074707,
"learning_rate": 4.984419797901491e-05,
"loss": 4.9116,
"step": 32
},
{
"epoch": 0.03666666666666667,
"grad_norm": 4.619030475616455,
"learning_rate": 4.983431914000991e-05,
"loss": 4.718,
"step": 33
},
{
"epoch": 0.03777777777777778,
"grad_norm": 5.1001200675964355,
"learning_rate": 4.982413770266342e-05,
"loss": 5.0285,
"step": 34
},
{
"epoch": 0.03888888888888889,
"grad_norm": 4.231574058532715,
"learning_rate": 4.9813653791033057e-05,
"loss": 4.7938,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 3.560554027557373,
"learning_rate": 4.980286753286195e-05,
"loss": 4.962,
"step": 36
},
{
"epoch": 0.04111111111111111,
"grad_norm": 3.8664653301239014,
"learning_rate": 4.979177905957726e-05,
"loss": 4.9856,
"step": 37
},
{
"epoch": 0.042222222222222223,
"grad_norm": 4.1073784828186035,
"learning_rate": 4.978038850628854e-05,
"loss": 5.2019,
"step": 38
},
{
"epoch": 0.043333333333333335,
"grad_norm": 4.941130638122559,
"learning_rate": 4.976869601178609e-05,
"loss": 4.6499,
"step": 39
},
{
"epoch": 0.044444444444444446,
"grad_norm": 5.749270915985107,
"learning_rate": 4.975670171853926e-05,
"loss": 4.1511,
"step": 40
},
{
"epoch": 0.04555555555555556,
"grad_norm": 3.7464685440063477,
"learning_rate": 4.9744405772694725e-05,
"loss": 4.9937,
"step": 41
},
{
"epoch": 0.04666666666666667,
"grad_norm": 4.391846656799316,
"learning_rate": 4.9731808324074717e-05,
"loss": 4.9573,
"step": 42
},
{
"epoch": 0.04777777777777778,
"grad_norm": 4.163111209869385,
"learning_rate": 4.971890952617515e-05,
"loss": 4.8546,
"step": 43
},
{
"epoch": 0.04888888888888889,
"grad_norm": 3.859717607498169,
"learning_rate": 4.9705709536163824e-05,
"loss": 4.8448,
"step": 44
},
{
"epoch": 0.05,
"grad_norm": 4.045307636260986,
"learning_rate": 4.9692208514878444e-05,
"loss": 4.8979,
"step": 45
},
{
"epoch": 0.051111111111111114,
"grad_norm": 3.083608627319336,
"learning_rate": 4.96784066268247e-05,
"loss": 4.6191,
"step": 46
},
{
"epoch": 0.052222222222222225,
"grad_norm": 3.6996843814849854,
"learning_rate": 4.966430404017424e-05,
"loss": 4.1142,
"step": 47
},
{
"epoch": 0.05333333333333334,
"grad_norm": 5.001142501831055,
"learning_rate": 4.964990092676263e-05,
"loss": 4.673,
"step": 48
},
{
"epoch": 0.05444444444444444,
"grad_norm": 4.148028373718262,
"learning_rate": 4.963519746208726e-05,
"loss": 4.4178,
"step": 49
},
{
"epoch": 0.05555555555555555,
"grad_norm": 3.529871940612793,
"learning_rate": 4.962019382530521e-05,
"loss": 5.134,
"step": 50
},
{
"epoch": 0.056666666666666664,
"grad_norm": 3.791576385498047,
"learning_rate": 4.960489019923105e-05,
"loss": 4.5824,
"step": 51
},
{
"epoch": 0.057777777777777775,
"grad_norm": 3.236461877822876,
"learning_rate": 4.9589286770334654e-05,
"loss": 4.9126,
"step": 52
},
{
"epoch": 0.058888888888888886,
"grad_norm": 3.4092698097229004,
"learning_rate": 4.957338372873886e-05,
"loss": 4.9913,
"step": 53
},
{
"epoch": 0.06,
"grad_norm": 4.24392557144165,
"learning_rate": 4.9557181268217227e-05,
"loss": 4.3492,
"step": 54
},
{
"epoch": 0.06111111111111111,
"grad_norm": 3.5253679752349854,
"learning_rate": 4.9540679586191605e-05,
"loss": 4.5665,
"step": 55
},
{
"epoch": 0.06222222222222222,
"grad_norm": 4.3137688636779785,
"learning_rate": 4.952387888372979e-05,
"loss": 4.3782,
"step": 56
},
{
"epoch": 0.06333333333333334,
"grad_norm": 3.4922027587890625,
"learning_rate": 4.9506779365543046e-05,
"loss": 4.4069,
"step": 57
},
{
"epoch": 0.06444444444444444,
"grad_norm": 3.7192225456237793,
"learning_rate": 4.94893812399836e-05,
"loss": 4.6152,
"step": 58
},
{
"epoch": 0.06555555555555556,
"grad_norm": 3.398974895477295,
"learning_rate": 4.947168471904213e-05,
"loss": 4.8951,
"step": 59
},
{
"epoch": 0.06666666666666667,
"grad_norm": 2.9628076553344727,
"learning_rate": 4.9453690018345144e-05,
"loss": 4.5419,
"step": 60
},
{
"epoch": 0.06777777777777778,
"grad_norm": 2.703320026397705,
"learning_rate": 4.94353973571524e-05,
"loss": 4.9154,
"step": 61
},
{
"epoch": 0.06888888888888889,
"grad_norm": 2.9003796577453613,
"learning_rate": 4.94168069583542e-05,
"loss": 4.8565,
"step": 62
},
{
"epoch": 0.07,
"grad_norm": 2.6896684169769287,
"learning_rate": 4.939791904846869e-05,
"loss": 4.6401,
"step": 63
},
{
"epoch": 0.07111111111111111,
"grad_norm": 3.679429292678833,
"learning_rate": 4.937873385763908e-05,
"loss": 4.5216,
"step": 64
},
{
"epoch": 0.07222222222222222,
"grad_norm": 3.837848424911499,
"learning_rate": 4.9359251619630886e-05,
"loss": 4.7913,
"step": 65
},
{
"epoch": 0.07333333333333333,
"grad_norm": 4.7550368309021,
"learning_rate": 4.933947257182901e-05,
"loss": 4.6804,
"step": 66
},
{
"epoch": 0.07444444444444444,
"grad_norm": 3.387397289276123,
"learning_rate": 4.931939695523492e-05,
"loss": 5.1575,
"step": 67
},
{
"epoch": 0.07555555555555556,
"grad_norm": 2.715179204940796,
"learning_rate": 4.929902501446366e-05,
"loss": 4.8116,
"step": 68
},
{
"epoch": 0.07666666666666666,
"grad_norm": 3.598045587539673,
"learning_rate": 4.9278356997740904e-05,
"loss": 4.8033,
"step": 69
},
{
"epoch": 0.07777777777777778,
"grad_norm": 3.2445831298828125,
"learning_rate": 4.925739315689991e-05,
"loss": 5.0033,
"step": 70
},
{
"epoch": 0.07888888888888888,
"grad_norm": 3.411445379257202,
"learning_rate": 4.9236133747378475e-05,
"loss": 4.7147,
"step": 71
},
{
"epoch": 0.08,
"grad_norm": 7.331087589263916,
"learning_rate": 4.9214579028215776e-05,
"loss": 4.3199,
"step": 72
},
{
"epoch": 0.0811111111111111,
"grad_norm": 5.33408784866333,
"learning_rate": 4.919272926204929e-05,
"loss": 4.882,
"step": 73
},
{
"epoch": 0.08222222222222222,
"grad_norm": 2.8994922637939453,
"learning_rate": 4.917058471511149e-05,
"loss": 4.678,
"step": 74
},
{
"epoch": 0.08333333333333333,
"grad_norm": 2.394202709197998,
"learning_rate": 4.914814565722671e-05,
"loss": 4.7618,
"step": 75
},
{
"epoch": 0.08444444444444445,
"grad_norm": 3.3278257846832275,
"learning_rate": 4.912541236180779e-05,
"loss": 4.5066,
"step": 76
},
{
"epoch": 0.08555555555555555,
"grad_norm": 5.1034836769104,
"learning_rate": 4.910238510585276e-05,
"loss": 4.9339,
"step": 77
},
{
"epoch": 0.08666666666666667,
"grad_norm": 3.776923179626465,
"learning_rate": 4.907906416994146e-05,
"loss": 4.264,
"step": 78
},
{
"epoch": 0.08777777777777777,
"grad_norm": 3.5819032192230225,
"learning_rate": 4.905544983823214e-05,
"loss": 4.6317,
"step": 79
},
{
"epoch": 0.08888888888888889,
"grad_norm": 4.019664764404297,
"learning_rate": 4.9031542398457974e-05,
"loss": 4.2868,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 3.2063353061676025,
"learning_rate": 4.900734214192358e-05,
"loss": 4.7617,
"step": 81
},
{
"epoch": 0.09111111111111111,
"grad_norm": 3.4615073204040527,
"learning_rate": 4.898284936350144e-05,
"loss": 4.6781,
"step": 82
},
{
"epoch": 0.09222222222222222,
"grad_norm": 3.8503334522247314,
"learning_rate": 4.895806436162833e-05,
"loss": 5.0211,
"step": 83
},
{
"epoch": 0.09333333333333334,
"grad_norm": 3.9291231632232666,
"learning_rate": 4.893298743830168e-05,
"loss": 4.8865,
"step": 84
},
{
"epoch": 0.09444444444444444,
"grad_norm": 3.537541389465332,
"learning_rate": 4.890761889907589e-05,
"loss": 4.5888,
"step": 85
},
{
"epoch": 0.09555555555555556,
"grad_norm": 2.426281690597534,
"learning_rate": 4.888195905305859e-05,
"loss": 4.3387,
"step": 86
},
{
"epoch": 0.09666666666666666,
"grad_norm": 3.3084747791290283,
"learning_rate": 4.8856008212906925e-05,
"loss": 4.9159,
"step": 87
},
{
"epoch": 0.09777777777777778,
"grad_norm": 4.331256866455078,
"learning_rate": 4.882976669482367e-05,
"loss": 4.7531,
"step": 88
},
{
"epoch": 0.09888888888888889,
"grad_norm": 3.6446895599365234,
"learning_rate": 4.880323481855347e-05,
"loss": 4.2317,
"step": 89
},
{
"epoch": 0.1,
"grad_norm": 4.512236595153809,
"learning_rate": 4.877641290737884e-05,
"loss": 4.5889,
"step": 90
},
{
"epoch": 0.10111111111111111,
"grad_norm": 4.778031349182129,
"learning_rate": 4.874930128811631e-05,
"loss": 4.7279,
"step": 91
},
{
"epoch": 0.10222222222222223,
"grad_norm": 2.602832794189453,
"learning_rate": 4.8721900291112415e-05,
"loss": 4.9481,
"step": 92
},
{
"epoch": 0.10333333333333333,
"grad_norm": 2.8278868198394775,
"learning_rate": 4.869421025023965e-05,
"loss": 4.5763,
"step": 93
},
{
"epoch": 0.10444444444444445,
"grad_norm": 3.5263729095458984,
"learning_rate": 4.8666231502892415e-05,
"loss": 4.3702,
"step": 94
},
{
"epoch": 0.10555555555555556,
"grad_norm": 3.6424851417541504,
"learning_rate": 4.8637964389982926e-05,
"loss": 4.0502,
"step": 95
},
{
"epoch": 0.10666666666666667,
"grad_norm": 3.5338454246520996,
"learning_rate": 4.860940925593703e-05,
"loss": 4.7823,
"step": 96
},
{
"epoch": 0.10777777777777778,
"grad_norm": 3.6265504360198975,
"learning_rate": 4.858056644869002e-05,
"loss": 4.7303,
"step": 97
},
{
"epoch": 0.10888888888888888,
"grad_norm": 2.4503519535064697,
"learning_rate": 4.855143631968242e-05,
"loss": 4.4291,
"step": 98
},
{
"epoch": 0.11,
"grad_norm": 3.9208950996398926,
"learning_rate": 4.852201922385564e-05,
"loss": 4.876,
"step": 99
},
{
"epoch": 0.1111111111111111,
"grad_norm": 2.9791765213012695,
"learning_rate": 4.849231551964771e-05,
"loss": 4.8165,
"step": 100
},
{
"epoch": 0.11222222222222222,
"grad_norm": 3.589217185974121,
"learning_rate": 4.84623255689889e-05,
"loss": 4.7452,
"step": 101
},
{
"epoch": 0.11333333333333333,
"grad_norm": 3.037071943283081,
"learning_rate": 4.843204973729729e-05,
"loss": 4.7103,
"step": 102
},
{
"epoch": 0.11444444444444445,
"grad_norm": 2.5937793254852295,
"learning_rate": 4.840148839347434e-05,
"loss": 4.4314,
"step": 103
},
{
"epoch": 0.11555555555555555,
"grad_norm": 2.879254102706909,
"learning_rate": 4.837064190990036e-05,
"loss": 4.4885,
"step": 104
},
{
"epoch": 0.11666666666666667,
"grad_norm": 3.404500722885132,
"learning_rate": 4.8339510662430046e-05,
"loss": 4.4227,
"step": 105
},
{
"epoch": 0.11777777777777777,
"grad_norm": 3.4791483879089355,
"learning_rate": 4.830809503038781e-05,
"loss": 4.8543,
"step": 106
},
{
"epoch": 0.11888888888888889,
"grad_norm": 3.072810649871826,
"learning_rate": 4.827639539656321e-05,
"loss": 4.7271,
"step": 107
},
{
"epoch": 0.12,
"grad_norm": 3.365445375442505,
"learning_rate": 4.8244412147206284e-05,
"loss": 4.8491,
"step": 108
},
{
"epoch": 0.12111111111111111,
"grad_norm": 3.6025092601776123,
"learning_rate": 4.8212145672022844e-05,
"loss": 4.7209,
"step": 109
},
{
"epoch": 0.12222222222222222,
"grad_norm": 4.458660125732422,
"learning_rate": 4.817959636416969e-05,
"loss": 4.1645,
"step": 110
},
{
"epoch": 0.12333333333333334,
"grad_norm": 3.9988503456115723,
"learning_rate": 4.814676462024988e-05,
"loss": 4.6813,
"step": 111
},
{
"epoch": 0.12444444444444444,
"grad_norm": 2.8452532291412354,
"learning_rate": 4.8113650840307834e-05,
"loss": 5.0675,
"step": 112
},
{
"epoch": 0.12555555555555556,
"grad_norm": 2.9468061923980713,
"learning_rate": 4.808025542782453e-05,
"loss": 4.8562,
"step": 113
},
{
"epoch": 0.12666666666666668,
"grad_norm": 3.0511226654052734,
"learning_rate": 4.8046578789712515e-05,
"loss": 4.5727,
"step": 114
},
{
"epoch": 0.12777777777777777,
"grad_norm": 2.0922510623931885,
"learning_rate": 4.8012621336311016e-05,
"loss": 4.7914,
"step": 115
},
{
"epoch": 0.1288888888888889,
"grad_norm": 2.8942031860351562,
"learning_rate": 4.797838348138086e-05,
"loss": 4.6763,
"step": 116
},
{
"epoch": 0.13,
"grad_norm": 3.84708571434021,
"learning_rate": 4.794386564209953e-05,
"loss": 4.2561,
"step": 117
},
{
"epoch": 0.13111111111111112,
"grad_norm": 2.471663236618042,
"learning_rate": 4.790906823905599e-05,
"loss": 4.4677,
"step": 118
},
{
"epoch": 0.1322222222222222,
"grad_norm": 2.5082037448883057,
"learning_rate": 4.7873991696245624e-05,
"loss": 4.56,
"step": 119
},
{
"epoch": 0.13333333333333333,
"grad_norm": 2.900052309036255,
"learning_rate": 4.783863644106502e-05,
"loss": 4.8909,
"step": 120
},
{
"epoch": 0.13444444444444445,
"grad_norm": 3.5951879024505615,
"learning_rate": 4.780300290430682e-05,
"loss": 4.6476,
"step": 121
},
{
"epoch": 0.13555555555555557,
"grad_norm": 4.468568325042725,
"learning_rate": 4.776709152015443e-05,
"loss": 4.3256,
"step": 122
},
{
"epoch": 0.13666666666666666,
"grad_norm": 3.0081839561462402,
"learning_rate": 4.773090272617672e-05,
"loss": 4.7223,
"step": 123
},
{
"epoch": 0.13777777777777778,
"grad_norm": 3.8555331230163574,
"learning_rate": 4.769443696332272e-05,
"loss": 4.4773,
"step": 124
},
{
"epoch": 0.1388888888888889,
"grad_norm": 3.729095697402954,
"learning_rate": 4.765769467591625e-05,
"loss": 4.7924,
"step": 125
},
{
"epoch": 0.14,
"grad_norm": 2.2823543548583984,
"learning_rate": 4.762067631165049e-05,
"loss": 4.5892,
"step": 126
},
{
"epoch": 0.1411111111111111,
"grad_norm": 3.335906982421875,
"learning_rate": 4.758338232158252e-05,
"loss": 4.8221,
"step": 127
},
{
"epoch": 0.14222222222222222,
"grad_norm": 5.226222038269043,
"learning_rate": 4.754581316012785e-05,
"loss": 4.5129,
"step": 128
},
{
"epoch": 0.14333333333333334,
"grad_norm": 3.1001462936401367,
"learning_rate": 4.7507969285054845e-05,
"loss": 4.4719,
"step": 129
},
{
"epoch": 0.14444444444444443,
"grad_norm": 3.3555104732513428,
"learning_rate": 4.7469851157479177e-05,
"loss": 4.5403,
"step": 130
},
{
"epoch": 0.14555555555555555,
"grad_norm": 2.935755968093872,
"learning_rate": 4.743145924185821e-05,
"loss": 4.8928,
"step": 131
},
{
"epoch": 0.14666666666666667,
"grad_norm": 2.488250970840454,
"learning_rate": 4.7392794005985326e-05,
"loss": 4.2008,
"step": 132
},
{
"epoch": 0.14777777777777779,
"grad_norm": 3.4012887477874756,
"learning_rate": 4.73538559209842e-05,
"loss": 4.6079,
"step": 133
},
{
"epoch": 0.14888888888888888,
"grad_norm": 2.7918901443481445,
"learning_rate": 4.731464546130314e-05,
"loss": 4.7116,
"step": 134
},
{
"epoch": 0.15,
"grad_norm": 3.9989566802978516,
"learning_rate": 4.72751631047092e-05,
"loss": 4.3616,
"step": 135
},
{
"epoch": 0.1511111111111111,
"grad_norm": 3.592566967010498,
"learning_rate": 4.723540933228244e-05,
"loss": 4.7092,
"step": 136
},
{
"epoch": 0.15222222222222223,
"grad_norm": 2.825819730758667,
"learning_rate": 4.719538462841003e-05,
"loss": 4.8076,
"step": 137
},
{
"epoch": 0.15333333333333332,
"grad_norm": 3.5768320560455322,
"learning_rate": 4.715508948078037e-05,
"loss": 4.2689,
"step": 138
},
{
"epoch": 0.15444444444444444,
"grad_norm": 2.7928998470306396,
"learning_rate": 4.71145243803771e-05,
"loss": 4.5123,
"step": 139
},
{
"epoch": 0.15555555555555556,
"grad_norm": 3.065845251083374,
"learning_rate": 4.707368982147318e-05,
"loss": 4.5658,
"step": 140
},
{
"epoch": 0.15666666666666668,
"grad_norm": 3.1111562252044678,
"learning_rate": 4.70325863016248e-05,
"loss": 4.6722,
"step": 141
},
{
"epoch": 0.15777777777777777,
"grad_norm": 3.132770299911499,
"learning_rate": 4.6991214321665414e-05,
"loss": 4.3566,
"step": 142
},
{
"epoch": 0.15888888888888889,
"grad_norm": 3.0841097831726074,
"learning_rate": 4.694957438569951e-05,
"loss": 4.9723,
"step": 143
},
{
"epoch": 0.16,
"grad_norm": 4.105175018310547,
"learning_rate": 4.690766700109659e-05,
"loss": 4.4099,
"step": 144
},
{
"epoch": 0.16111111111111112,
"grad_norm": 4.112144470214844,
"learning_rate": 4.6865492678484895e-05,
"loss": 4.2418,
"step": 145
},
{
"epoch": 0.1622222222222222,
"grad_norm": 2.671475648880005,
"learning_rate": 4.682305193174524e-05,
"loss": 4.823,
"step": 146
},
{
"epoch": 0.16333333333333333,
"grad_norm": 3.42596697807312,
"learning_rate": 4.678034527800474e-05,
"loss": 4.6529,
"step": 147
},
{
"epoch": 0.16444444444444445,
"grad_norm": 3.2327771186828613,
"learning_rate": 4.6737373237630476e-05,
"loss": 4.6662,
"step": 148
},
{
"epoch": 0.16555555555555557,
"grad_norm": 3.2889630794525146,
"learning_rate": 4.669413633422322e-05,
"loss": 4.3048,
"step": 149
},
{
"epoch": 0.16666666666666666,
"grad_norm": 2.366293430328369,
"learning_rate": 4.665063509461097e-05,
"loss": 4.7887,
"step": 150
},
{
"epoch": 0.16777777777777778,
"grad_norm": 2.6844308376312256,
"learning_rate": 4.6606870048842624e-05,
"loss": 4.954,
"step": 151
},
{
"epoch": 0.1688888888888889,
"grad_norm": 3.2190423011779785,
"learning_rate": 4.656284173018144e-05,
"loss": 5.189,
"step": 152
},
{
"epoch": 0.17,
"grad_norm": 3.640512466430664,
"learning_rate": 4.65185506750986e-05,
"loss": 4.4657,
"step": 153
},
{
"epoch": 0.1711111111111111,
"grad_norm": 2.7704906463623047,
"learning_rate": 4.6473997423266614e-05,
"loss": 4.6634,
"step": 154
},
{
"epoch": 0.17222222222222222,
"grad_norm": 2.7830865383148193,
"learning_rate": 4.642918251755281e-05,
"loss": 4.5943,
"step": 155
},
{
"epoch": 0.17333333333333334,
"grad_norm": 2.327153444290161,
"learning_rate": 4.638410650401267e-05,
"loss": 4.8456,
"step": 156
},
{
"epoch": 0.17444444444444446,
"grad_norm": 3.3280811309814453,
"learning_rate": 4.6338769931883185e-05,
"loss": 4.6068,
"step": 157
},
{
"epoch": 0.17555555555555555,
"grad_norm": 3.1970295906066895,
"learning_rate": 4.629317335357619e-05,
"loss": 4.6854,
"step": 158
},
{
"epoch": 0.17666666666666667,
"grad_norm": 2.481355667114258,
"learning_rate": 4.6247317324671605e-05,
"loss": 4.5796,
"step": 159
},
{
"epoch": 0.17777777777777778,
"grad_norm": 2.445061683654785,
"learning_rate": 4.620120240391065e-05,
"loss": 4.3907,
"step": 160
},
{
"epoch": 0.17888888888888888,
"grad_norm": 3.381376028060913,
"learning_rate": 4.615482915318911e-05,
"loss": 4.6822,
"step": 161
},
{
"epoch": 0.18,
"grad_norm": 2.4204893112182617,
"learning_rate": 4.610819813755038e-05,
"loss": 4.436,
"step": 162
},
{
"epoch": 0.1811111111111111,
"grad_norm": 2.725168228149414,
"learning_rate": 4.606130992517869e-05,
"loss": 5.0643,
"step": 163
},
{
"epoch": 0.18222222222222223,
"grad_norm": 3.7455644607543945,
"learning_rate": 4.601416508739211e-05,
"loss": 4.7741,
"step": 164
},
{
"epoch": 0.18333333333333332,
"grad_norm": 2.5998661518096924,
"learning_rate": 4.5966764198635606e-05,
"loss": 4.8321,
"step": 165
},
{
"epoch": 0.18444444444444444,
"grad_norm": 4.380634784698486,
"learning_rate": 4.591910783647404e-05,
"loss": 4.6678,
"step": 166
},
{
"epoch": 0.18555555555555556,
"grad_norm": 2.3288722038269043,
"learning_rate": 4.5871196581585166e-05,
"loss": 4.8369,
"step": 167
},
{
"epoch": 0.18666666666666668,
"grad_norm": 2.959716320037842,
"learning_rate": 4.5823031017752485e-05,
"loss": 4.0906,
"step": 168
},
{
"epoch": 0.18777777777777777,
"grad_norm": 2.6955947875976562,
"learning_rate": 4.577461173185821e-05,
"loss": 4.6623,
"step": 169
},
{
"epoch": 0.18888888888888888,
"grad_norm": 4.677174091339111,
"learning_rate": 4.572593931387604e-05,
"loss": 4.9871,
"step": 170
},
{
"epoch": 0.19,
"grad_norm": 2.5706987380981445,
"learning_rate": 4.567701435686404e-05,
"loss": 4.8683,
"step": 171
},
{
"epoch": 0.19111111111111112,
"grad_norm": 3.341663122177124,
"learning_rate": 4.562783745695738e-05,
"loss": 4.6751,
"step": 172
},
{
"epoch": 0.1922222222222222,
"grad_norm": 2.941930055618286,
"learning_rate": 4.557840921336105e-05,
"loss": 4.8538,
"step": 173
},
{
"epoch": 0.19333333333333333,
"grad_norm": 2.8567423820495605,
"learning_rate": 4.5528730228342605e-05,
"loss": 4.5703,
"step": 174
},
{
"epoch": 0.19444444444444445,
"grad_norm": 2.6613831520080566,
"learning_rate": 4.54788011072248e-05,
"loss": 4.4107,
"step": 175
},
{
"epoch": 0.19555555555555557,
"grad_norm": 2.5689127445220947,
"learning_rate": 4.542862245837821e-05,
"loss": 4.7408,
"step": 176
},
{
"epoch": 0.19666666666666666,
"grad_norm": 3.576414108276367,
"learning_rate": 4.537819489321386e-05,
"loss": 4.5211,
"step": 177
},
{
"epoch": 0.19777777777777777,
"grad_norm": 3.1265640258789062,
"learning_rate": 4.532751902617569e-05,
"loss": 4.0729,
"step": 178
},
{
"epoch": 0.1988888888888889,
"grad_norm": 3.3458447456359863,
"learning_rate": 4.527659547473317e-05,
"loss": 5.1058,
"step": 179
},
{
"epoch": 0.2,
"grad_norm": 4.459259033203125,
"learning_rate": 4.522542485937369e-05,
"loss": 4.3809,
"step": 180
},
{
"epoch": 0.2011111111111111,
"grad_norm": 2.7210464477539062,
"learning_rate": 4.5174007803595055e-05,
"loss": 4.5236,
"step": 181
},
{
"epoch": 0.20222222222222222,
"grad_norm": 3.285710334777832,
"learning_rate": 4.512234493389785e-05,
"loss": 4.6732,
"step": 182
},
{
"epoch": 0.20333333333333334,
"grad_norm": 3.063709020614624,
"learning_rate": 4.5070436879777865e-05,
"loss": 4.2399,
"step": 183
},
{
"epoch": 0.20444444444444446,
"grad_norm": 2.4527218341827393,
"learning_rate": 4.5018284273718336e-05,
"loss": 4.8007,
"step": 184
},
{
"epoch": 0.20555555555555555,
"grad_norm": 3.8102920055389404,
"learning_rate": 4.496588775118232e-05,
"loss": 4.8862,
"step": 185
},
{
"epoch": 0.20666666666666667,
"grad_norm": 4.2287139892578125,
"learning_rate": 4.491324795060491e-05,
"loss": 4.3253,
"step": 186
},
{
"epoch": 0.20777777777777778,
"grad_norm": 3.0381033420562744,
"learning_rate": 4.4860365513385456e-05,
"loss": 4.5634,
"step": 187
},
{
"epoch": 0.2088888888888889,
"grad_norm": 3.7136149406433105,
"learning_rate": 4.480724108387977e-05,
"loss": 5.318,
"step": 188
},
{
"epoch": 0.21,
"grad_norm": 2.9353251457214355,
"learning_rate": 4.4753875309392266e-05,
"loss": 4.6681,
"step": 189
},
{
"epoch": 0.2111111111111111,
"grad_norm": 2.4687249660491943,
"learning_rate": 4.4700268840168045e-05,
"loss": 4.7589,
"step": 190
},
{
"epoch": 0.21222222222222223,
"grad_norm": 2.1315102577209473,
"learning_rate": 4.464642232938505e-05,
"loss": 4.737,
"step": 191
},
{
"epoch": 0.21333333333333335,
"grad_norm": 3.893827438354492,
"learning_rate": 4.4592336433146e-05,
"loss": 4.1764,
"step": 192
},
{
"epoch": 0.21444444444444444,
"grad_norm": 2.8332619667053223,
"learning_rate": 4.453801181047047e-05,
"loss": 4.7298,
"step": 193
},
{
"epoch": 0.21555555555555556,
"grad_norm": 2.2660417556762695,
"learning_rate": 4.448344912328686e-05,
"loss": 4.6808,
"step": 194
},
{
"epoch": 0.21666666666666667,
"grad_norm": 3.361409902572632,
"learning_rate": 4.442864903642428e-05,
"loss": 4.6071,
"step": 195
},
{
"epoch": 0.21777777777777776,
"grad_norm": 2.6645731925964355,
"learning_rate": 4.4373612217604496e-05,
"loss": 4.552,
"step": 196
},
{
"epoch": 0.21888888888888888,
"grad_norm": 3.0278093814849854,
"learning_rate": 4.431833933743378e-05,
"loss": 4.9144,
"step": 197
},
{
"epoch": 0.22,
"grad_norm": 3.6761045455932617,
"learning_rate": 4.426283106939474e-05,
"loss": 4.6239,
"step": 198
},
{
"epoch": 0.22111111111111112,
"grad_norm": 3.8143444061279297,
"learning_rate": 4.420708808983809e-05,
"loss": 4.5675,
"step": 199
},
{
"epoch": 0.2222222222222222,
"grad_norm": 3.473196506500244,
"learning_rate": 4.415111107797445e-05,
"loss": 4.6748,
"step": 200
},
{
"epoch": 0.22333333333333333,
"grad_norm": 3.977616310119629,
"learning_rate": 4.4094900715866064e-05,
"loss": 4.3232,
"step": 201
},
{
"epoch": 0.22444444444444445,
"grad_norm": 3.6012306213378906,
"learning_rate": 4.403845768841842e-05,
"loss": 4.975,
"step": 202
},
{
"epoch": 0.22555555555555556,
"grad_norm": 3.368455171585083,
"learning_rate": 4.3981782683372016e-05,
"loss": 4.5316,
"step": 203
},
{
"epoch": 0.22666666666666666,
"grad_norm": 4.002109050750732,
"learning_rate": 4.3924876391293915e-05,
"loss": 4.2947,
"step": 204
},
{
"epoch": 0.22777777777777777,
"grad_norm": 2.9084315299987793,
"learning_rate": 4.386773950556931e-05,
"loss": 4.4741,
"step": 205
},
{
"epoch": 0.2288888888888889,
"grad_norm": 2.937263011932373,
"learning_rate": 4.381037272239311e-05,
"loss": 4.1499,
"step": 206
},
{
"epoch": 0.23,
"grad_norm": 2.508908987045288,
"learning_rate": 4.375277674076149e-05,
"loss": 4.856,
"step": 207
},
{
"epoch": 0.2311111111111111,
"grad_norm": 3.1218552589416504,
"learning_rate": 4.36949522624633e-05,
"loss": 4.7484,
"step": 208
},
{
"epoch": 0.23222222222222222,
"grad_norm": 3.868100881576538,
"learning_rate": 4.363689999207156e-05,
"loss": 4.4354,
"step": 209
},
{
"epoch": 0.23333333333333334,
"grad_norm": 2.4734623432159424,
"learning_rate": 4.357862063693486e-05,
"loss": 4.978,
"step": 210
},
{
"epoch": 0.23444444444444446,
"grad_norm": 3.2189548015594482,
"learning_rate": 4.352011490716875e-05,
"loss": 4.9206,
"step": 211
},
{
"epoch": 0.23555555555555555,
"grad_norm": 3.757636308670044,
"learning_rate": 4.3461383515647106e-05,
"loss": 4.5211,
"step": 212
},
{
"epoch": 0.23666666666666666,
"grad_norm": 4.024762153625488,
"learning_rate": 4.3402427177993366e-05,
"loss": 4.5448,
"step": 213
},
{
"epoch": 0.23777777777777778,
"grad_norm": 3.536659002304077,
"learning_rate": 4.334324661257191e-05,
"loss": 5.0555,
"step": 214
},
{
"epoch": 0.2388888888888889,
"grad_norm": 2.2506678104400635,
"learning_rate": 4.3283842540479264e-05,
"loss": 4.8103,
"step": 215
},
{
"epoch": 0.24,
"grad_norm": 2.8261499404907227,
"learning_rate": 4.3224215685535294e-05,
"loss": 4.449,
"step": 216
},
{
"epoch": 0.2411111111111111,
"grad_norm": 3.2074854373931885,
"learning_rate": 4.31643667742744e-05,
"loss": 4.3977,
"step": 217
},
{
"epoch": 0.24222222222222223,
"grad_norm": 2.743082284927368,
"learning_rate": 4.3104296535936695e-05,
"loss": 4.2452,
"step": 218
},
{
"epoch": 0.24333333333333335,
"grad_norm": 2.7638344764709473,
"learning_rate": 4.304400570245906e-05,
"loss": 4.6636,
"step": 219
},
{
"epoch": 0.24444444444444444,
"grad_norm": 3.1931586265563965,
"learning_rate": 4.2983495008466276e-05,
"loss": 4.3444,
"step": 220
},
{
"epoch": 0.24555555555555555,
"grad_norm": 3.946772575378418,
"learning_rate": 4.292276519126207e-05,
"loss": 4.4841,
"step": 221
},
{
"epoch": 0.24666666666666667,
"grad_norm": 2.5195651054382324,
"learning_rate": 4.2861816990820084e-05,
"loss": 4.7453,
"step": 222
},
{
"epoch": 0.2477777777777778,
"grad_norm": 2.1805219650268555,
"learning_rate": 4.280065114977492e-05,
"loss": 4.7288,
"step": 223
},
{
"epoch": 0.24888888888888888,
"grad_norm": 2.361443519592285,
"learning_rate": 4.273926841341302e-05,
"loss": 4.9004,
"step": 224
},
{
"epoch": 0.25,
"grad_norm": 3.45947265625,
"learning_rate": 4.267766952966369e-05,
"loss": 4.3567,
"step": 225
},
{
"epoch": 0.2511111111111111,
"grad_norm": 3.4783213138580322,
"learning_rate": 4.261585524908987e-05,
"loss": 4.9095,
"step": 226
},
{
"epoch": 0.25222222222222224,
"grad_norm": 2.565812110900879,
"learning_rate": 4.2553826324879064e-05,
"loss": 4.5359,
"step": 227
},
{
"epoch": 0.25333333333333335,
"grad_norm": 3.349132776260376,
"learning_rate": 4.249158351283414e-05,
"loss": 4.3791,
"step": 228
},
{
"epoch": 0.2544444444444444,
"grad_norm": 2.278238534927368,
"learning_rate": 4.242912757136412e-05,
"loss": 4.5528,
"step": 229
},
{
"epoch": 0.25555555555555554,
"grad_norm": 2.851348400115967,
"learning_rate": 4.2366459261474933e-05,
"loss": 4.1264,
"step": 230
},
{
"epoch": 0.25666666666666665,
"grad_norm": 2.4230828285217285,
"learning_rate": 4.230357934676017e-05,
"loss": 4.8431,
"step": 231
},
{
"epoch": 0.2577777777777778,
"grad_norm": 3.563849687576294,
"learning_rate": 4.224048859339175e-05,
"loss": 4.2379,
"step": 232
},
{
"epoch": 0.2588888888888889,
"grad_norm": 3.419377088546753,
"learning_rate": 4.2177187770110576e-05,
"loss": 4.5906,
"step": 233
},
{
"epoch": 0.26,
"grad_norm": 3.992064952850342,
"learning_rate": 4.211367764821722e-05,
"loss": 4.1902,
"step": 234
},
{
"epoch": 0.2611111111111111,
"grad_norm": 2.2617876529693604,
"learning_rate": 4.2049959001562464e-05,
"loss": 4.6505,
"step": 235
},
{
"epoch": 0.26222222222222225,
"grad_norm": 2.8081510066986084,
"learning_rate": 4.198603260653792e-05,
"loss": 4.9376,
"step": 236
},
{
"epoch": 0.2633333333333333,
"grad_norm": 3.759847402572632,
"learning_rate": 4.192189924206652e-05,
"loss": 4.4958,
"step": 237
},
{
"epoch": 0.2644444444444444,
"grad_norm": 3.2556324005126953,
"learning_rate": 4.185755968959308e-05,
"loss": 4.3312,
"step": 238
},
{
"epoch": 0.26555555555555554,
"grad_norm": 2.438190221786499,
"learning_rate": 4.179301473307476e-05,
"loss": 4.8255,
"step": 239
},
{
"epoch": 0.26666666666666666,
"grad_norm": 3.00315523147583,
"learning_rate": 4.172826515897146e-05,
"loss": 4.788,
"step": 240
},
{
"epoch": 0.2677777777777778,
"grad_norm": 3.5244944095611572,
"learning_rate": 4.166331175623631e-05,
"loss": 4.5552,
"step": 241
},
{
"epoch": 0.2688888888888889,
"grad_norm": 2.436034917831421,
"learning_rate": 4.1598155316306044e-05,
"loss": 4.3463,
"step": 242
},
{
"epoch": 0.27,
"grad_norm": 3.473583698272705,
"learning_rate": 4.1532796633091296e-05,
"loss": 4.6629,
"step": 243
},
{
"epoch": 0.27111111111111114,
"grad_norm": 2.3865156173706055,
"learning_rate": 4.146723650296701e-05,
"loss": 4.5326,
"step": 244
},
{
"epoch": 0.2722222222222222,
"grad_norm": 3.0051960945129395,
"learning_rate": 4.140147572476268e-05,
"loss": 4.6408,
"step": 245
},
{
"epoch": 0.2733333333333333,
"grad_norm": 2.631802797317505,
"learning_rate": 4.133551509975264e-05,
"loss": 4.6096,
"step": 246
},
{
"epoch": 0.27444444444444444,
"grad_norm": 3.8499889373779297,
"learning_rate": 4.1269355431646274e-05,
"loss": 4.3807,
"step": 247
},
{
"epoch": 0.27555555555555555,
"grad_norm": 4.838550090789795,
"learning_rate": 4.1202997526578276e-05,
"loss": 4.1733,
"step": 248
},
{
"epoch": 0.27666666666666667,
"grad_norm": 3.210563898086548,
"learning_rate": 4.113644219309877e-05,
"loss": 4.633,
"step": 249
},
{
"epoch": 0.2777777777777778,
"grad_norm": 3.254894256591797,
"learning_rate": 4.1069690242163484e-05,
"loss": 4.3214,
"step": 250
},
{
"epoch": 0.2788888888888889,
"grad_norm": 2.7834694385528564,
"learning_rate": 4.100274248712389e-05,
"loss": 4.3556,
"step": 251
},
{
"epoch": 0.28,
"grad_norm": 2.8591508865356445,
"learning_rate": 4.093559974371725e-05,
"loss": 4.4838,
"step": 252
},
{
"epoch": 0.2811111111111111,
"grad_norm": 3.7769737243652344,
"learning_rate": 4.086826283005669e-05,
"loss": 5.1812,
"step": 253
},
{
"epoch": 0.2822222222222222,
"grad_norm": 4.0656914710998535,
"learning_rate": 4.080073256662127e-05,
"loss": 4.4083,
"step": 254
},
{
"epoch": 0.2833333333333333,
"grad_norm": 3.192784547805786,
"learning_rate": 4.073300977624594e-05,
"loss": 4.7642,
"step": 255
},
{
"epoch": 0.28444444444444444,
"grad_norm": 3.2855887413024902,
"learning_rate": 4.066509528411152e-05,
"loss": 4.2253,
"step": 256
},
{
"epoch": 0.28555555555555556,
"grad_norm": 4.624244213104248,
"learning_rate": 4.059698991773466e-05,
"loss": 4.4538,
"step": 257
},
{
"epoch": 0.2866666666666667,
"grad_norm": 3.160623073577881,
"learning_rate": 4.052869450695776e-05,
"loss": 4.1262,
"step": 258
},
{
"epoch": 0.2877777777777778,
"grad_norm": 3.2087790966033936,
"learning_rate": 4.046020988393885e-05,
"loss": 4.3326,
"step": 259
},
{
"epoch": 0.28888888888888886,
"grad_norm": 3.1688692569732666,
"learning_rate": 4.039153688314145e-05,
"loss": 4.7679,
"step": 260
},
{
"epoch": 0.29,
"grad_norm": 2.6120312213897705,
"learning_rate": 4.0322676341324415e-05,
"loss": 4.9559,
"step": 261
},
{
"epoch": 0.2911111111111111,
"grad_norm": 3.8062994480133057,
"learning_rate": 4.02536290975317e-05,
"loss": 4.3511,
"step": 262
},
{
"epoch": 0.2922222222222222,
"grad_norm": 3.093778610229492,
"learning_rate": 4.018439599308217e-05,
"loss": 4.3003,
"step": 263
},
{
"epoch": 0.29333333333333333,
"grad_norm": 2.905430316925049,
"learning_rate": 4.011497787155938e-05,
"loss": 4.4313,
"step": 264
},
{
"epoch": 0.29444444444444445,
"grad_norm": 3.0712385177612305,
"learning_rate": 4.0045375578801214e-05,
"loss": 4.4432,
"step": 265
},
{
"epoch": 0.29555555555555557,
"grad_norm": 3.544624090194702,
"learning_rate": 3.997558996288965e-05,
"loss": 4.5969,
"step": 266
},
{
"epoch": 0.2966666666666667,
"grad_norm": 3.2956557273864746,
"learning_rate": 3.99056218741404e-05,
"loss": 4.5313,
"step": 267
},
{
"epoch": 0.29777777777777775,
"grad_norm": 2.8312222957611084,
"learning_rate": 3.983547216509254e-05,
"loss": 4.9315,
"step": 268
},
{
"epoch": 0.29888888888888887,
"grad_norm": 2.3425047397613525,
"learning_rate": 3.976514169049814e-05,
"loss": 4.3562,
"step": 269
},
{
"epoch": 0.3,
"grad_norm": 3.2110278606414795,
"learning_rate": 3.969463130731183e-05,
"loss": 4.3582,
"step": 270
},
{
"epoch": 0.3011111111111111,
"grad_norm": 2.364408493041992,
"learning_rate": 3.962394187468039e-05,
"loss": 4.8083,
"step": 271
},
{
"epoch": 0.3022222222222222,
"grad_norm": 3.1101303100585938,
"learning_rate": 3.955307425393224e-05,
"loss": 4.5352,
"step": 272
},
{
"epoch": 0.30333333333333334,
"grad_norm": 2.396379232406616,
"learning_rate": 3.948202930856697e-05,
"loss": 4.3274,
"step": 273
},
{
"epoch": 0.30444444444444446,
"grad_norm": 2.183039426803589,
"learning_rate": 3.941080790424484e-05,
"loss": 4.6847,
"step": 274
},
{
"epoch": 0.3055555555555556,
"grad_norm": 2.731586456298828,
"learning_rate": 3.933941090877615e-05,
"loss": 4.4002,
"step": 275
},
{
"epoch": 0.30666666666666664,
"grad_norm": 2.9183268547058105,
"learning_rate": 3.92678391921108e-05,
"loss": 4.7244,
"step": 276
},
{
"epoch": 0.30777777777777776,
"grad_norm": 2.450711965560913,
"learning_rate": 3.919609362632753e-05,
"loss": 4.5988,
"step": 277
},
{
"epoch": 0.3088888888888889,
"grad_norm": 3.4320664405822754,
"learning_rate": 3.912417508562345e-05,
"loss": 4.4192,
"step": 278
},
{
"epoch": 0.31,
"grad_norm": 3.2206807136535645,
"learning_rate": 3.905208444630327e-05,
"loss": 4.3554,
"step": 279
},
{
"epoch": 0.3111111111111111,
"grad_norm": 3.739584445953369,
"learning_rate": 3.897982258676867e-05,
"loss": 4.4398,
"step": 280
},
{
"epoch": 0.31222222222222223,
"grad_norm": 3.239889144897461,
"learning_rate": 3.8907390387507625e-05,
"loss": 4.3802,
"step": 281
},
{
"epoch": 0.31333333333333335,
"grad_norm": 2.5824975967407227,
"learning_rate": 3.883478873108361e-05,
"loss": 4.7278,
"step": 282
},
{
"epoch": 0.31444444444444447,
"grad_norm": 2.3867881298065186,
"learning_rate": 3.8762018502124894e-05,
"loss": 4.5456,
"step": 283
},
{
"epoch": 0.31555555555555553,
"grad_norm": 3.0963456630706787,
"learning_rate": 3.868908058731376e-05,
"loss": 3.8366,
"step": 284
},
{
"epoch": 0.31666666666666665,
"grad_norm": 2.538454532623291,
"learning_rate": 3.861597587537568e-05,
"loss": 4.2254,
"step": 285
},
{
"epoch": 0.31777777777777777,
"grad_norm": 3.2098913192749023,
"learning_rate": 3.85427052570685e-05,
"loss": 5.1547,
"step": 286
},
{
"epoch": 0.3188888888888889,
"grad_norm": 2.2710459232330322,
"learning_rate": 3.8469269625171576e-05,
"loss": 4.5172,
"step": 287
},
{
"epoch": 0.32,
"grad_norm": 3.4076178073883057,
"learning_rate": 3.8395669874474915e-05,
"loss": 4.2652,
"step": 288
},
{
"epoch": 0.3211111111111111,
"grad_norm": 2.152460813522339,
"learning_rate": 3.832190690176825e-05,
"loss": 4.4623,
"step": 289
},
{
"epoch": 0.32222222222222224,
"grad_norm": 2.5687735080718994,
"learning_rate": 3.824798160583012e-05,
"loss": 4.7519,
"step": 290
},
{
"epoch": 0.3233333333333333,
"grad_norm": 2.60406494140625,
"learning_rate": 3.8173894887416945e-05,
"loss": 4.7309,
"step": 291
},
{
"epoch": 0.3244444444444444,
"grad_norm": 3.08526349067688,
"learning_rate": 3.8099647649251986e-05,
"loss": 4.5816,
"step": 292
},
{
"epoch": 0.32555555555555554,
"grad_norm": 3.4984188079833984,
"learning_rate": 3.802524079601442e-05,
"loss": 4.5818,
"step": 293
},
{
"epoch": 0.32666666666666666,
"grad_norm": 4.528430938720703,
"learning_rate": 3.795067523432826e-05,
"loss": 4.1268,
"step": 294
},
{
"epoch": 0.3277777777777778,
"grad_norm": 3.826263904571533,
"learning_rate": 3.787595187275136e-05,
"loss": 4.4605,
"step": 295
},
{
"epoch": 0.3288888888888889,
"grad_norm": 2.9818341732025146,
"learning_rate": 3.780107162176429e-05,
"loss": 4.789,
"step": 296
},
{
"epoch": 0.33,
"grad_norm": 3.642854928970337,
"learning_rate": 3.7726035393759285e-05,
"loss": 4.7423,
"step": 297
},
{
"epoch": 0.33111111111111113,
"grad_norm": 2.6310813426971436,
"learning_rate": 3.765084410302909e-05,
"loss": 4.1751,
"step": 298
},
{
"epoch": 0.3322222222222222,
"grad_norm": 4.018439769744873,
"learning_rate": 3.757549866575588e-05,
"loss": 4.5056,
"step": 299
},
{
"epoch": 0.3333333333333333,
"grad_norm": 3.364558696746826,
"learning_rate": 3.7500000000000003e-05,
"loss": 4.3372,
"step": 300
},
{
"epoch": 0.33444444444444443,
"grad_norm": 2.8973915576934814,
"learning_rate": 3.742434902568889e-05,
"loss": 4.3383,
"step": 301
},
{
"epoch": 0.33555555555555555,
"grad_norm": 3.0985381603240967,
"learning_rate": 3.7348546664605777e-05,
"loss": 4.3256,
"step": 302
},
{
"epoch": 0.33666666666666667,
"grad_norm": 2.420278310775757,
"learning_rate": 3.727259384037852e-05,
"loss": 4.1622,
"step": 303
},
{
"epoch": 0.3377777777777778,
"grad_norm": 2.669018030166626,
"learning_rate": 3.719649147846832e-05,
"loss": 4.6532,
"step": 304
},
{
"epoch": 0.3388888888888889,
"grad_norm": 2.5606846809387207,
"learning_rate": 3.712024050615843e-05,
"loss": 4.8205,
"step": 305
},
{
"epoch": 0.34,
"grad_norm": 2.7075791358947754,
"learning_rate": 3.704384185254288e-05,
"loss": 4.7873,
"step": 306
},
{
"epoch": 0.3411111111111111,
"grad_norm": 2.576284170150757,
"learning_rate": 3.696729644851518e-05,
"loss": 4.2416,
"step": 307
},
{
"epoch": 0.3422222222222222,
"grad_norm": 2.5355241298675537,
"learning_rate": 3.689060522675689e-05,
"loss": 4.6843,
"step": 308
},
{
"epoch": 0.3433333333333333,
"grad_norm": 3.172502040863037,
"learning_rate": 3.681376912172636e-05,
"loss": 4.8512,
"step": 309
},
{
"epoch": 0.34444444444444444,
"grad_norm": 3.2482614517211914,
"learning_rate": 3.673678906964727e-05,
"loss": 4.6384,
"step": 310
},
{
"epoch": 0.34555555555555556,
"grad_norm": 3.223466634750366,
"learning_rate": 3.665966600849728e-05,
"loss": 4.593,
"step": 311
},
{
"epoch": 0.3466666666666667,
"grad_norm": 3.437298536300659,
"learning_rate": 3.6582400877996546e-05,
"loss": 4.376,
"step": 312
},
{
"epoch": 0.3477777777777778,
"grad_norm": 3.4591798782348633,
"learning_rate": 3.6504994619596294e-05,
"loss": 4.5369,
"step": 313
},
{
"epoch": 0.3488888888888889,
"grad_norm": 2.1622931957244873,
"learning_rate": 3.642744817646736e-05,
"loss": 4.4165,
"step": 314
},
{
"epoch": 0.35,
"grad_norm": 2.5694704055786133,
"learning_rate": 3.634976249348867e-05,
"loss": 4.4281,
"step": 315
},
{
"epoch": 0.3511111111111111,
"grad_norm": 3.8053269386291504,
"learning_rate": 3.627193851723577e-05,
"loss": 4.4582,
"step": 316
},
{
"epoch": 0.3522222222222222,
"grad_norm": 2.7068371772766113,
"learning_rate": 3.619397719596924e-05,
"loss": 4.141,
"step": 317
},
{
"epoch": 0.35333333333333333,
"grad_norm": 3.1135122776031494,
"learning_rate": 3.611587947962319e-05,
"loss": 4.4257,
"step": 318
},
{
"epoch": 0.35444444444444445,
"grad_norm": 2.5378129482269287,
"learning_rate": 3.603764631979363e-05,
"loss": 4.495,
"step": 319
},
{
"epoch": 0.35555555555555557,
"grad_norm": 3.563612937927246,
"learning_rate": 3.5959278669726935e-05,
"loss": 4.5916,
"step": 320
},
{
"epoch": 0.3566666666666667,
"grad_norm": 3.765002727508545,
"learning_rate": 3.588077748430819e-05,
"loss": 4.7184,
"step": 321
},
{
"epoch": 0.35777777777777775,
"grad_norm": 2.8586552143096924,
"learning_rate": 3.580214372004956e-05,
"loss": 4.6151,
"step": 322
},
{
"epoch": 0.35888888888888887,
"grad_norm": 3.664820432662964,
"learning_rate": 3.572337833507865e-05,
"loss": 4.0914,
"step": 323
},
{
"epoch": 0.36,
"grad_norm": 2.808751344680786,
"learning_rate": 3.564448228912682e-05,
"loss": 4.4035,
"step": 324
},
{
"epoch": 0.3611111111111111,
"grad_norm": 3.5564141273498535,
"learning_rate": 3.556545654351749e-05,
"loss": 4.4184,
"step": 325
},
{
"epoch": 0.3622222222222222,
"grad_norm": 3.4853861331939697,
"learning_rate": 3.548630206115443e-05,
"loss": 4.5303,
"step": 326
},
{
"epoch": 0.36333333333333334,
"grad_norm": 3.2625653743743896,
"learning_rate": 3.540701980651003e-05,
"loss": 4.5295,
"step": 327
},
{
"epoch": 0.36444444444444446,
"grad_norm": 2.611847162246704,
"learning_rate": 3.532761074561355e-05,
"loss": 4.8323,
"step": 328
},
{
"epoch": 0.3655555555555556,
"grad_norm": 2.4942116737365723,
"learning_rate": 3.524807584603932e-05,
"loss": 4.245,
"step": 329
},
{
"epoch": 0.36666666666666664,
"grad_norm": 2.132793664932251,
"learning_rate": 3.516841607689501e-05,
"loss": 4.6669,
"step": 330
},
{
"epoch": 0.36777777777777776,
"grad_norm": 2.5251405239105225,
"learning_rate": 3.5088632408809755e-05,
"loss": 5.0771,
"step": 331
},
{
"epoch": 0.3688888888888889,
"grad_norm": 3.042750358581543,
"learning_rate": 3.5008725813922386e-05,
"loss": 4.562,
"step": 332
},
{
"epoch": 0.37,
"grad_norm": 3.445188283920288,
"learning_rate": 3.4928697265869515e-05,
"loss": 4.6474,
"step": 333
},
{
"epoch": 0.3711111111111111,
"grad_norm": 2.4179251194000244,
"learning_rate": 3.484854773977378e-05,
"loss": 4.4356,
"step": 334
},
{
"epoch": 0.37222222222222223,
"grad_norm": 3.5280330181121826,
"learning_rate": 3.476827821223184e-05,
"loss": 4.1546,
"step": 335
},
{
"epoch": 0.37333333333333335,
"grad_norm": 3.564363718032837,
"learning_rate": 3.4687889661302576e-05,
"loss": 4.7374,
"step": 336
},
{
"epoch": 0.37444444444444447,
"grad_norm": 2.1853668689727783,
"learning_rate": 3.460738306649509e-05,
"loss": 4.4073,
"step": 337
},
{
"epoch": 0.37555555555555553,
"grad_norm": 4.435403823852539,
"learning_rate": 3.452675940875686e-05,
"loss": 4.4693,
"step": 338
},
{
"epoch": 0.37666666666666665,
"grad_norm": 2.2928574085235596,
"learning_rate": 3.444601967046168e-05,
"loss": 4.1636,
"step": 339
},
{
"epoch": 0.37777777777777777,
"grad_norm": 2.858842611312866,
"learning_rate": 3.436516483539781e-05,
"loss": 4.7424,
"step": 340
},
{
"epoch": 0.3788888888888889,
"grad_norm": 2.4767720699310303,
"learning_rate": 3.428419588875588e-05,
"loss": 4.6306,
"step": 341
},
{
"epoch": 0.38,
"grad_norm": 4.104574680328369,
"learning_rate": 3.4203113817116957e-05,
"loss": 4.2592,
"step": 342
},
{
"epoch": 0.3811111111111111,
"grad_norm": 3.349961757659912,
"learning_rate": 3.412191960844049e-05,
"loss": 4.2351,
"step": 343
},
{
"epoch": 0.38222222222222224,
"grad_norm": 2.902287244796753,
"learning_rate": 3.4040614252052305e-05,
"loss": 4.1556,
"step": 344
},
{
"epoch": 0.38333333333333336,
"grad_norm": 2.7805283069610596,
"learning_rate": 3.39591987386325e-05,
"loss": 4.8241,
"step": 345
},
{
"epoch": 0.3844444444444444,
"grad_norm": 3.494743585586548,
"learning_rate": 3.387767406020343e-05,
"loss": 4.4414,
"step": 346
},
{
"epoch": 0.38555555555555554,
"grad_norm": 3.4807887077331543,
"learning_rate": 3.3796041210117546e-05,
"loss": 4.4356,
"step": 347
},
{
"epoch": 0.38666666666666666,
"grad_norm": 2.875729560852051,
"learning_rate": 3.3714301183045385e-05,
"loss": 4.6104,
"step": 348
},
{
"epoch": 0.3877777777777778,
"grad_norm": 2.3670778274536133,
"learning_rate": 3.363245497496337e-05,
"loss": 4.6181,
"step": 349
},
{
"epoch": 0.3888888888888889,
"grad_norm": 3.511547088623047,
"learning_rate": 3.355050358314172e-05,
"loss": 4.5967,
"step": 350
},
{
"epoch": 0.39,
"grad_norm": 3.4935336112976074,
"learning_rate": 3.346844800613229e-05,
"loss": 4.4003,
"step": 351
},
{
"epoch": 0.39111111111111113,
"grad_norm": 3.7835776805877686,
"learning_rate": 3.338628924375638e-05,
"loss": 4.3227,
"step": 352
},
{
"epoch": 0.39222222222222225,
"grad_norm": 3.659778594970703,
"learning_rate": 3.330402829709258e-05,
"loss": 4.735,
"step": 353
},
{
"epoch": 0.3933333333333333,
"grad_norm": 2.280862331390381,
"learning_rate": 3.322166616846458e-05,
"loss": 4.3854,
"step": 354
},
{
"epoch": 0.39444444444444443,
"grad_norm": 3.5552432537078857,
"learning_rate": 3.313920386142892e-05,
"loss": 4.494,
"step": 355
},
{
"epoch": 0.39555555555555555,
"grad_norm": 3.2624123096466064,
"learning_rate": 3.305664238076278e-05,
"loss": 4.441,
"step": 356
},
{
"epoch": 0.39666666666666667,
"grad_norm": 2.5981621742248535,
"learning_rate": 3.2973982732451755e-05,
"loss": 4.1962,
"step": 357
},
{
"epoch": 0.3977777777777778,
"grad_norm": 3.8529744148254395,
"learning_rate": 3.289122592367757e-05,
"loss": 4.1754,
"step": 358
},
{
"epoch": 0.3988888888888889,
"grad_norm": 3.049600839614868,
"learning_rate": 3.2808372962805816e-05,
"loss": 4.638,
"step": 359
},
{
"epoch": 0.4,
"grad_norm": 3.134927988052368,
"learning_rate": 3.272542485937369e-05,
"loss": 4.4763,
"step": 360
},
{
"epoch": 0.4011111111111111,
"grad_norm": 2.298032522201538,
"learning_rate": 3.264238262407764e-05,
"loss": 4.8264,
"step": 361
},
{
"epoch": 0.4022222222222222,
"grad_norm": 3.3620493412017822,
"learning_rate": 3.2559247268761115e-05,
"loss": 4.3332,
"step": 362
},
{
"epoch": 0.4033333333333333,
"grad_norm": 3.7335774898529053,
"learning_rate": 3.247601980640217e-05,
"loss": 4.351,
"step": 363
},
{
"epoch": 0.40444444444444444,
"grad_norm": 2.7008893489837646,
"learning_rate": 3.239270125110117e-05,
"loss": 4.3373,
"step": 364
},
{
"epoch": 0.40555555555555556,
"grad_norm": 2.3377201557159424,
"learning_rate": 3.230929261806842e-05,
"loss": 4.6638,
"step": 365
},
{
"epoch": 0.4066666666666667,
"grad_norm": 2.796996831893921,
"learning_rate": 3.222579492361179e-05,
"loss": 4.4803,
"step": 366
},
{
"epoch": 0.4077777777777778,
"grad_norm": 3.004497766494751,
"learning_rate": 3.214220918512434e-05,
"loss": 4.8025,
"step": 367
},
{
"epoch": 0.4088888888888889,
"grad_norm": 2.762565851211548,
"learning_rate": 3.205853642107192e-05,
"loss": 4.3787,
"step": 368
},
{
"epoch": 0.41,
"grad_norm": 3.7421979904174805,
"learning_rate": 3.1974777650980735e-05,
"loss": 4.509,
"step": 369
},
{
"epoch": 0.4111111111111111,
"grad_norm": 2.6516170501708984,
"learning_rate": 3.1890933895424976e-05,
"loss": 4.8942,
"step": 370
},
{
"epoch": 0.4122222222222222,
"grad_norm": 3.2065646648406982,
"learning_rate": 3.180700617601436e-05,
"loss": 4.4659,
"step": 371
},
{
"epoch": 0.41333333333333333,
"grad_norm": 3.3278090953826904,
"learning_rate": 3.172299551538164e-05,
"loss": 4.3103,
"step": 372
},
{
"epoch": 0.41444444444444445,
"grad_norm": 3.3703420162200928,
"learning_rate": 3.163890293717022e-05,
"loss": 4.4312,
"step": 373
},
{
"epoch": 0.41555555555555557,
"grad_norm": 3.629591464996338,
"learning_rate": 3.155472946602162e-05,
"loss": 4.8446,
"step": 374
},
{
"epoch": 0.4166666666666667,
"grad_norm": 3.0027589797973633,
"learning_rate": 3.147047612756302e-05,
"loss": 4.5204,
"step": 375
},
{
"epoch": 0.4177777777777778,
"grad_norm": 3.2268624305725098,
"learning_rate": 3.138614394839476e-05,
"loss": 4.1844,
"step": 376
},
{
"epoch": 0.41888888888888887,
"grad_norm": 3.958721160888672,
"learning_rate": 3.130173395607785e-05,
"loss": 4.3071,
"step": 377
},
{
"epoch": 0.42,
"grad_norm": 2.1055030822753906,
"learning_rate": 3.121724717912138e-05,
"loss": 4.2903,
"step": 378
},
{
"epoch": 0.4211111111111111,
"grad_norm": 3.0535993576049805,
"learning_rate": 3.1132684646970064e-05,
"loss": 4.5096,
"step": 379
},
{
"epoch": 0.4222222222222222,
"grad_norm": 2.5504136085510254,
"learning_rate": 3.104804738999169e-05,
"loss": 4.4653,
"step": 380
},
{
"epoch": 0.42333333333333334,
"grad_norm": 3.1297409534454346,
"learning_rate": 3.0963336439464526e-05,
"loss": 4.3652,
"step": 381
},
{
"epoch": 0.42444444444444446,
"grad_norm": 2.2639200687408447,
"learning_rate": 3.087855282756475e-05,
"loss": 4.8119,
"step": 382
},
{
"epoch": 0.4255555555555556,
"grad_norm": 2.581587314605713,
"learning_rate": 3.079369758735393e-05,
"loss": 4.0573,
"step": 383
},
{
"epoch": 0.4266666666666667,
"grad_norm": 2.61521577835083,
"learning_rate": 3.0708771752766394e-05,
"loss": 4.3563,
"step": 384
},
{
"epoch": 0.42777777777777776,
"grad_norm": 3.1898045539855957,
"learning_rate": 3.062377635859663e-05,
"loss": 5.091,
"step": 385
},
{
"epoch": 0.4288888888888889,
"grad_norm": 3.8822860717773438,
"learning_rate": 3.053871244048669e-05,
"loss": 4.1214,
"step": 386
},
{
"epoch": 0.43,
"grad_norm": 3.2095978260040283,
"learning_rate": 3.045358103491357e-05,
"loss": 4.6263,
"step": 387
},
{
"epoch": 0.4311111111111111,
"grad_norm": 2.3496782779693604,
"learning_rate": 3.0368383179176585e-05,
"loss": 4.2816,
"step": 388
},
{
"epoch": 0.43222222222222223,
"grad_norm": 3.5458877086639404,
"learning_rate": 3.028311991138472e-05,
"loss": 4.1913,
"step": 389
},
{
"epoch": 0.43333333333333335,
"grad_norm": 2.9822726249694824,
"learning_rate": 3.0197792270443982e-05,
"loss": 4.5714,
"step": 390
},
{
"epoch": 0.43444444444444447,
"grad_norm": 2.740432024002075,
"learning_rate": 3.0112401296044757e-05,
"loss": 4.565,
"step": 391
},
{
"epoch": 0.43555555555555553,
"grad_norm": 3.301563024520874,
"learning_rate": 3.002694802864912e-05,
"loss": 4.4515,
"step": 392
},
{
"epoch": 0.43666666666666665,
"grad_norm": 3.0773849487304688,
"learning_rate": 2.9941433509478156e-05,
"loss": 4.4196,
"step": 393
},
{
"epoch": 0.43777777777777777,
"grad_norm": 2.6573879718780518,
"learning_rate": 2.98558587804993e-05,
"loss": 4.2847,
"step": 394
},
{
"epoch": 0.4388888888888889,
"grad_norm": 3.498271942138672,
"learning_rate": 2.9770224884413623e-05,
"loss": 4.1811,
"step": 395
},
{
"epoch": 0.44,
"grad_norm": 3.6309518814086914,
"learning_rate": 2.9684532864643122e-05,
"loss": 4.223,
"step": 396
},
{
"epoch": 0.4411111111111111,
"grad_norm": 3.3358840942382812,
"learning_rate": 2.9598783765318007e-05,
"loss": 4.1402,
"step": 397
},
{
"epoch": 0.44222222222222224,
"grad_norm": 2.367056369781494,
"learning_rate": 2.9512978631264006e-05,
"loss": 4.3926,
"step": 398
},
{
"epoch": 0.44333333333333336,
"grad_norm": 3.7848873138427734,
"learning_rate": 2.9427118507989586e-05,
"loss": 4.1587,
"step": 399
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.969773769378662,
"learning_rate": 2.9341204441673266e-05,
"loss": 4.1827,
"step": 400
},
{
"epoch": 0.44555555555555554,
"grad_norm": 3.8604671955108643,
"learning_rate": 2.9255237479150816e-05,
"loss": 4.7396,
"step": 401
},
{
"epoch": 0.44666666666666666,
"grad_norm": 2.6314451694488525,
"learning_rate": 2.916921866790256e-05,
"loss": 4.7571,
"step": 402
},
{
"epoch": 0.4477777777777778,
"grad_norm": 3.2930240631103516,
"learning_rate": 2.908314905604056e-05,
"loss": 4.6127,
"step": 403
},
{
"epoch": 0.4488888888888889,
"grad_norm": 2.809821367263794,
"learning_rate": 2.8997029692295874e-05,
"loss": 4.9782,
"step": 404
},
{
"epoch": 0.45,
"grad_norm": 3.108168363571167,
"learning_rate": 2.8910861626005776e-05,
"loss": 4.8604,
"step": 405
},
{
"epoch": 0.45111111111111113,
"grad_norm": 3.2190604209899902,
"learning_rate": 2.8824645907100954e-05,
"loss": 4.4294,
"step": 406
},
{
"epoch": 0.45222222222222225,
"grad_norm": 3.8671491146087646,
"learning_rate": 2.8738383586092745e-05,
"loss": 4.5494,
"step": 407
},
{
"epoch": 0.4533333333333333,
"grad_norm": 3.078843355178833,
"learning_rate": 2.8652075714060295e-05,
"loss": 4.4064,
"step": 408
},
{
"epoch": 0.45444444444444443,
"grad_norm": 2.9444501399993896,
"learning_rate": 2.8565723342637796e-05,
"loss": 4.6974,
"step": 409
},
{
"epoch": 0.45555555555555555,
"grad_norm": 3.3965888023376465,
"learning_rate": 2.8479327524001636e-05,
"loss": 4.6253,
"step": 410
},
{
"epoch": 0.45666666666666667,
"grad_norm": 2.8222734928131104,
"learning_rate": 2.8392889310857612e-05,
"loss": 4.4421,
"step": 411
},
{
"epoch": 0.4577777777777778,
"grad_norm": 3.996683359146118,
"learning_rate": 2.8306409756428064e-05,
"loss": 3.9591,
"step": 412
},
{
"epoch": 0.4588888888888889,
"grad_norm": 3.7605364322662354,
"learning_rate": 2.8219889914439074e-05,
"loss": 4.0585,
"step": 413
},
{
"epoch": 0.46,
"grad_norm": 3.199702739715576,
"learning_rate": 2.8133330839107608e-05,
"loss": 4.5694,
"step": 414
},
{
"epoch": 0.46111111111111114,
"grad_norm": 2.3009800910949707,
"learning_rate": 2.8046733585128687e-05,
"loss": 4.5112,
"step": 415
},
{
"epoch": 0.4622222222222222,
"grad_norm": 3.8310000896453857,
"learning_rate": 2.7960099207662532e-05,
"loss": 4.1967,
"step": 416
},
{
"epoch": 0.4633333333333333,
"grad_norm": 4.325408458709717,
"learning_rate": 2.787342876232167e-05,
"loss": 4.1846,
"step": 417
},
{
"epoch": 0.46444444444444444,
"grad_norm": 3.2290215492248535,
"learning_rate": 2.7786723305158136e-05,
"loss": 4.3148,
"step": 418
},
{
"epoch": 0.46555555555555556,
"grad_norm": 3.246396541595459,
"learning_rate": 2.7699983892650573e-05,
"loss": 4.2778,
"step": 419
},
{
"epoch": 0.4666666666666667,
"grad_norm": 3.091440200805664,
"learning_rate": 2.761321158169134e-05,
"loss": 4.6429,
"step": 420
},
{
"epoch": 0.4677777777777778,
"grad_norm": 2.5828170776367188,
"learning_rate": 2.7526407429573657e-05,
"loss": 4.6686,
"step": 421
},
{
"epoch": 0.4688888888888889,
"grad_norm": 2.3290863037109375,
"learning_rate": 2.7439572493978736e-05,
"loss": 4.1659,
"step": 422
},
{
"epoch": 0.47,
"grad_norm": 4.536559104919434,
"learning_rate": 2.7352707832962865e-05,
"loss": 3.9368,
"step": 423
},
{
"epoch": 0.4711111111111111,
"grad_norm": 2.5557074546813965,
"learning_rate": 2.726581450494451e-05,
"loss": 4.7282,
"step": 424
},
{
"epoch": 0.4722222222222222,
"grad_norm": 2.8363335132598877,
"learning_rate": 2.717889356869146e-05,
"loss": 4.4236,
"step": 425
},
{
"epoch": 0.47333333333333333,
"grad_norm": 4.480076313018799,
"learning_rate": 2.7091946083307896e-05,
"loss": 4.3532,
"step": 426
},
{
"epoch": 0.47444444444444445,
"grad_norm": 2.9685823917388916,
"learning_rate": 2.7004973108221472e-05,
"loss": 3.8564,
"step": 427
},
{
"epoch": 0.47555555555555556,
"grad_norm": 2.388016939163208,
"learning_rate": 2.6917975703170466e-05,
"loss": 4.6472,
"step": 428
},
{
"epoch": 0.4766666666666667,
"grad_norm": 3.0806846618652344,
"learning_rate": 2.6830954928190794e-05,
"loss": 4.4021,
"step": 429
},
{
"epoch": 0.4777777777777778,
"grad_norm": 2.4739530086517334,
"learning_rate": 2.674391184360313e-05,
"loss": 4.3841,
"step": 430
},
{
"epoch": 0.47888888888888886,
"grad_norm": 2.37337327003479,
"learning_rate": 2.6656847510000012e-05,
"loss": 4.5123,
"step": 431
},
{
"epoch": 0.48,
"grad_norm": 2.335357904434204,
"learning_rate": 2.656976298823284e-05,
"loss": 4.5225,
"step": 432
},
{
"epoch": 0.4811111111111111,
"grad_norm": 2.888369083404541,
"learning_rate": 2.6482659339399045e-05,
"loss": 4.0533,
"step": 433
},
{
"epoch": 0.4822222222222222,
"grad_norm": 3.412767171859741,
"learning_rate": 2.6395537624829096e-05,
"loss": 4.6112,
"step": 434
},
{
"epoch": 0.48333333333333334,
"grad_norm": 3.0738167762756348,
"learning_rate": 2.63083989060736e-05,
"loss": 4.0807,
"step": 435
},
{
"epoch": 0.48444444444444446,
"grad_norm": 2.038522720336914,
"learning_rate": 2.6221244244890336e-05,
"loss": 4.3389,
"step": 436
},
{
"epoch": 0.4855555555555556,
"grad_norm": 2.4719579219818115,
"learning_rate": 2.6134074703231344e-05,
"loss": 4.1841,
"step": 437
},
{
"epoch": 0.4866666666666667,
"grad_norm": 2.653308153152466,
"learning_rate": 2.604689134322999e-05,
"loss": 4.6983,
"step": 438
},
{
"epoch": 0.48777777777777775,
"grad_norm": 3.113577127456665,
"learning_rate": 2.5959695227188004e-05,
"loss": 4.46,
"step": 439
},
{
"epoch": 0.4888888888888889,
"grad_norm": 2.463456153869629,
"learning_rate": 2.587248741756253e-05,
"loss": 4.5327,
"step": 440
},
{
"epoch": 0.49,
"grad_norm": 3.733887195587158,
"learning_rate": 2.578526897695321e-05,
"loss": 4.2684,
"step": 441
},
{
"epoch": 0.4911111111111111,
"grad_norm": 3.1210319995880127,
"learning_rate": 2.5698040968089225e-05,
"loss": 4.2219,
"step": 442
},
{
"epoch": 0.4922222222222222,
"grad_norm": 2.173025369644165,
"learning_rate": 2.5610804453816333e-05,
"loss": 4.7141,
"step": 443
},
{
"epoch": 0.49333333333333335,
"grad_norm": 3.4339637756347656,
"learning_rate": 2.5523560497083926e-05,
"loss": 4.8913,
"step": 444
},
{
"epoch": 0.49444444444444446,
"grad_norm": 3.317446231842041,
"learning_rate": 2.5436310160932092e-05,
"loss": 4.6514,
"step": 445
},
{
"epoch": 0.4955555555555556,
"grad_norm": 3.317653179168701,
"learning_rate": 2.5349054508478637e-05,
"loss": 4.7476,
"step": 446
},
{
"epoch": 0.49666666666666665,
"grad_norm": 2.9672698974609375,
"learning_rate": 2.5261794602906145e-05,
"loss": 4.6061,
"step": 447
},
{
"epoch": 0.49777777777777776,
"grad_norm": 3.32000470161438,
"learning_rate": 2.517453150744904e-05,
"loss": 4.1218,
"step": 448
},
{
"epoch": 0.4988888888888889,
"grad_norm": 4.345942497253418,
"learning_rate": 2.5087266285380596e-05,
"loss": 4.4872,
"step": 449
},
{
"epoch": 0.5,
"grad_norm": 3.055283784866333,
"learning_rate": 2.5e-05,
"loss": 4.2396,
"step": 450
},
{
"epoch": 0.5011111111111111,
"grad_norm": 3.0910513401031494,
"learning_rate": 2.4912733714619417e-05,
"loss": 4.9336,
"step": 451
},
{
"epoch": 0.5022222222222222,
"grad_norm": 2.6645710468292236,
"learning_rate": 2.4825468492550964e-05,
"loss": 4.7848,
"step": 452
},
{
"epoch": 0.5033333333333333,
"grad_norm": 3.63999342918396,
"learning_rate": 2.4738205397093864e-05,
"loss": 4.357,
"step": 453
},
{
"epoch": 0.5044444444444445,
"grad_norm": 2.507779598236084,
"learning_rate": 2.4650945491521372e-05,
"loss": 4.5025,
"step": 454
},
{
"epoch": 0.5055555555555555,
"grad_norm": 2.3816704750061035,
"learning_rate": 2.4563689839067913e-05,
"loss": 4.5438,
"step": 455
},
{
"epoch": 0.5066666666666667,
"grad_norm": 3.367776870727539,
"learning_rate": 2.447643950291608e-05,
"loss": 3.8387,
"step": 456
},
{
"epoch": 0.5077777777777778,
"grad_norm": 3.0507357120513916,
"learning_rate": 2.4389195546183673e-05,
"loss": 4.3121,
"step": 457
},
{
"epoch": 0.5088888888888888,
"grad_norm": 4.110062122344971,
"learning_rate": 2.4301959031910784e-05,
"loss": 4.2072,
"step": 458
},
{
"epoch": 0.51,
"grad_norm": 3.110203504562378,
"learning_rate": 2.4214731023046793e-05,
"loss": 4.3653,
"step": 459
},
{
"epoch": 0.5111111111111111,
"grad_norm": 3.3167712688446045,
"learning_rate": 2.4127512582437485e-05,
"loss": 4.2322,
"step": 460
},
{
"epoch": 0.5122222222222222,
"grad_norm": 2.507969856262207,
"learning_rate": 2.4040304772812002e-05,
"loss": 4.382,
"step": 461
},
{
"epoch": 0.5133333333333333,
"grad_norm": 3.365709066390991,
"learning_rate": 2.3953108656770016e-05,
"loss": 4.0521,
"step": 462
},
{
"epoch": 0.5144444444444445,
"grad_norm": 2.8844547271728516,
"learning_rate": 2.386592529676866e-05,
"loss": 4.5033,
"step": 463
},
{
"epoch": 0.5155555555555555,
"grad_norm": 3.0520310401916504,
"learning_rate": 2.377875575510967e-05,
"loss": 4.8204,
"step": 464
},
{
"epoch": 0.5166666666666667,
"grad_norm": 4.429820537567139,
"learning_rate": 2.3691601093926404e-05,
"loss": 4.3879,
"step": 465
},
{
"epoch": 0.5177777777777778,
"grad_norm": 2.3193674087524414,
"learning_rate": 2.3604462375170906e-05,
"loss": 4.0077,
"step": 466
},
{
"epoch": 0.5188888888888888,
"grad_norm": 3.9970319271087646,
"learning_rate": 2.3517340660600964e-05,
"loss": 4.2043,
"step": 467
},
{
"epoch": 0.52,
"grad_norm": 2.8194077014923096,
"learning_rate": 2.3430237011767167e-05,
"loss": 4.5441,
"step": 468
},
{
"epoch": 0.5211111111111111,
"grad_norm": 2.7785353660583496,
"learning_rate": 2.3343152490000004e-05,
"loss": 5.0854,
"step": 469
},
{
"epoch": 0.5222222222222223,
"grad_norm": 3.0042474269866943,
"learning_rate": 2.3256088156396868e-05,
"loss": 4.7248,
"step": 470
},
{
"epoch": 0.5233333333333333,
"grad_norm": 3.4051711559295654,
"learning_rate": 2.3169045071809215e-05,
"loss": 4.1062,
"step": 471
},
{
"epoch": 0.5244444444444445,
"grad_norm": 3.4314067363739014,
"learning_rate": 2.3082024296829536e-05,
"loss": 4.1021,
"step": 472
},
{
"epoch": 0.5255555555555556,
"grad_norm": 3.356543779373169,
"learning_rate": 2.299502689177853e-05,
"loss": 4.1495,
"step": 473
},
{
"epoch": 0.5266666666666666,
"grad_norm": 4.4954633712768555,
"learning_rate": 2.2908053916692117e-05,
"loss": 4.1691,
"step": 474
},
{
"epoch": 0.5277777777777778,
"grad_norm": 3.4520392417907715,
"learning_rate": 2.2821106431308544e-05,
"loss": 4.5688,
"step": 475
},
{
"epoch": 0.5288888888888889,
"grad_norm": 2.547987699508667,
"learning_rate": 2.2734185495055503e-05,
"loss": 4.8845,
"step": 476
},
{
"epoch": 0.53,
"grad_norm": 2.991994619369507,
"learning_rate": 2.2647292167037144e-05,
"loss": 4.5536,
"step": 477
},
{
"epoch": 0.5311111111111111,
"grad_norm": 3.409557819366455,
"learning_rate": 2.2560427506021266e-05,
"loss": 4.674,
"step": 478
},
{
"epoch": 0.5322222222222223,
"grad_norm": 2.5158376693725586,
"learning_rate": 2.247359257042634e-05,
"loss": 3.8624,
"step": 479
},
{
"epoch": 0.5333333333333333,
"grad_norm": 5.460207939147949,
"learning_rate": 2.238678841830867e-05,
"loss": 3.9847,
"step": 480
},
{
"epoch": 0.5344444444444445,
"grad_norm": 3.18215274810791,
"learning_rate": 2.230001610734943e-05,
"loss": 4.4479,
"step": 481
},
{
"epoch": 0.5355555555555556,
"grad_norm": 3.3176145553588867,
"learning_rate": 2.2213276694841866e-05,
"loss": 4.5647,
"step": 482
},
{
"epoch": 0.5366666666666666,
"grad_norm": 2.4659605026245117,
"learning_rate": 2.212657123767834e-05,
"loss": 4.6482,
"step": 483
},
{
"epoch": 0.5377777777777778,
"grad_norm": 3.418905019760132,
"learning_rate": 2.2039900792337474e-05,
"loss": 4.0517,
"step": 484
},
{
"epoch": 0.5388888888888889,
"grad_norm": 2.7777280807495117,
"learning_rate": 2.195326641487132e-05,
"loss": 3.9006,
"step": 485
},
{
"epoch": 0.54,
"grad_norm": 2.1448440551757812,
"learning_rate": 2.186666916089239e-05,
"loss": 4.5849,
"step": 486
},
{
"epoch": 0.5411111111111111,
"grad_norm": 2.304466485977173,
"learning_rate": 2.1780110085560935e-05,
"loss": 4.6166,
"step": 487
},
{
"epoch": 0.5422222222222223,
"grad_norm": 4.101543426513672,
"learning_rate": 2.1693590243571938e-05,
"loss": 4.1855,
"step": 488
},
{
"epoch": 0.5433333333333333,
"grad_norm": 4.019572734832764,
"learning_rate": 2.1607110689142393e-05,
"loss": 4.511,
"step": 489
},
{
"epoch": 0.5444444444444444,
"grad_norm": 3.2479324340820312,
"learning_rate": 2.1520672475998373e-05,
"loss": 4.694,
"step": 490
},
{
"epoch": 0.5455555555555556,
"grad_norm": 2.5916500091552734,
"learning_rate": 2.1434276657362213e-05,
"loss": 4.4817,
"step": 491
},
{
"epoch": 0.5466666666666666,
"grad_norm": 2.4654204845428467,
"learning_rate": 2.1347924285939714e-05,
"loss": 4.241,
"step": 492
},
{
"epoch": 0.5477777777777778,
"grad_norm": 3.4962589740753174,
"learning_rate": 2.1261616413907265e-05,
"loss": 4.2751,
"step": 493
},
{
"epoch": 0.5488888888888889,
"grad_norm": 2.41613507270813,
"learning_rate": 2.117535409289905e-05,
"loss": 4.6923,
"step": 494
},
{
"epoch": 0.55,
"grad_norm": 2.8937885761260986,
"learning_rate": 2.1089138373994223e-05,
"loss": 4.5164,
"step": 495
},
{
"epoch": 0.5511111111111111,
"grad_norm": 3.6244256496429443,
"learning_rate": 2.1002970307704132e-05,
"loss": 4.0702,
"step": 496
},
{
"epoch": 0.5522222222222222,
"grad_norm": 2.670847177505493,
"learning_rate": 2.0916850943959452e-05,
"loss": 4.1953,
"step": 497
},
{
"epoch": 0.5533333333333333,
"grad_norm": 3.030318021774292,
"learning_rate": 2.0830781332097446e-05,
"loss": 4.2784,
"step": 498
},
{
"epoch": 0.5544444444444444,
"grad_norm": 3.4792563915252686,
"learning_rate": 2.0744762520849193e-05,
"loss": 4.7421,
"step": 499
},
{
"epoch": 0.5555555555555556,
"grad_norm": 4.749427318572998,
"learning_rate": 2.0658795558326743e-05,
"loss": 4.0809,
"step": 500
},
{
"epoch": 0.5566666666666666,
"grad_norm": 2.382559061050415,
"learning_rate": 2.057288149201042e-05,
"loss": 4.2373,
"step": 501
},
{
"epoch": 0.5577777777777778,
"grad_norm": 4.044615745544434,
"learning_rate": 2.0487021368736003e-05,
"loss": 4.3278,
"step": 502
},
{
"epoch": 0.5588888888888889,
"grad_norm": 2.4403457641601562,
"learning_rate": 2.0401216234681995e-05,
"loss": 4.6996,
"step": 503
},
{
"epoch": 0.56,
"grad_norm": 2.6123414039611816,
"learning_rate": 2.031546713535688e-05,
"loss": 3.7175,
"step": 504
},
{
"epoch": 0.5611111111111111,
"grad_norm": 2.930072784423828,
"learning_rate": 2.022977511558638e-05,
"loss": 3.8051,
"step": 505
},
{
"epoch": 0.5622222222222222,
"grad_norm": 2.092438220977783,
"learning_rate": 2.0144141219500705e-05,
"loss": 4.6131,
"step": 506
},
{
"epoch": 0.5633333333333334,
"grad_norm": 3.791438579559326,
"learning_rate": 2.0058566490521847e-05,
"loss": 4.5328,
"step": 507
},
{
"epoch": 0.5644444444444444,
"grad_norm": 2.1941120624542236,
"learning_rate": 1.9973051971350888e-05,
"loss": 4.0611,
"step": 508
},
{
"epoch": 0.5655555555555556,
"grad_norm": 2.723223924636841,
"learning_rate": 1.9887598703955242e-05,
"loss": 4.2184,
"step": 509
},
{
"epoch": 0.5666666666666667,
"grad_norm": 2.9518237113952637,
"learning_rate": 1.980220772955602e-05,
"loss": 4.3172,
"step": 510
},
{
"epoch": 0.5677777777777778,
"grad_norm": 3.06872296333313,
"learning_rate": 1.9716880088615285e-05,
"loss": 4.2687,
"step": 511
},
{
"epoch": 0.5688888888888889,
"grad_norm": 2.8538174629211426,
"learning_rate": 1.963161682082342e-05,
"loss": 4.265,
"step": 512
},
{
"epoch": 0.57,
"grad_norm": 3.3108673095703125,
"learning_rate": 1.9546418965086442e-05,
"loss": 4.5094,
"step": 513
},
{
"epoch": 0.5711111111111111,
"grad_norm": 3.3742525577545166,
"learning_rate": 1.946128755951332e-05,
"loss": 4.4563,
"step": 514
},
{
"epoch": 0.5722222222222222,
"grad_norm": 2.6200695037841797,
"learning_rate": 1.937622364140338e-05,
"loss": 4.3567,
"step": 515
},
{
"epoch": 0.5733333333333334,
"grad_norm": 2.5701615810394287,
"learning_rate": 1.9291228247233605e-05,
"loss": 4.2645,
"step": 516
},
{
"epoch": 0.5744444444444444,
"grad_norm": 4.248501777648926,
"learning_rate": 1.920630241264607e-05,
"loss": 3.8413,
"step": 517
},
{
"epoch": 0.5755555555555556,
"grad_norm": 3.4751811027526855,
"learning_rate": 1.912144717243525e-05,
"loss": 4.258,
"step": 518
},
{
"epoch": 0.5766666666666667,
"grad_norm": 2.8151302337646484,
"learning_rate": 1.9036663560535483e-05,
"loss": 4.4939,
"step": 519
},
{
"epoch": 0.5777777777777777,
"grad_norm": 3.2205138206481934,
"learning_rate": 1.895195261000831e-05,
"loss": 4.7516,
"step": 520
},
{
"epoch": 0.5788888888888889,
"grad_norm": 2.6713924407958984,
"learning_rate": 1.8867315353029935e-05,
"loss": 4.3591,
"step": 521
},
{
"epoch": 0.58,
"grad_norm": 3.1901915073394775,
"learning_rate": 1.8782752820878634e-05,
"loss": 4.2607,
"step": 522
},
{
"epoch": 0.5811111111111111,
"grad_norm": 3.473564863204956,
"learning_rate": 1.869826604392216e-05,
"loss": 4.125,
"step": 523
},
{
"epoch": 0.5822222222222222,
"grad_norm": 2.8697259426116943,
"learning_rate": 1.8613856051605243e-05,
"loss": 4.2696,
"step": 524
},
{
"epoch": 0.5833333333333334,
"grad_norm": 2.74684739112854,
"learning_rate": 1.852952387243698e-05,
"loss": 4.2943,
"step": 525
},
{
"epoch": 0.5844444444444444,
"grad_norm": 2.807659387588501,
"learning_rate": 1.8445270533978388e-05,
"loss": 4.6892,
"step": 526
},
{
"epoch": 0.5855555555555556,
"grad_norm": 2.5258119106292725,
"learning_rate": 1.8361097062829778e-05,
"loss": 4.4269,
"step": 527
},
{
"epoch": 0.5866666666666667,
"grad_norm": 4.046256065368652,
"learning_rate": 1.827700448461836e-05,
"loss": 4.4027,
"step": 528
},
{
"epoch": 0.5877777777777777,
"grad_norm": 2.256350517272949,
"learning_rate": 1.8192993823985643e-05,
"loss": 4.1628,
"step": 529
},
{
"epoch": 0.5888888888888889,
"grad_norm": 2.6388349533081055,
"learning_rate": 1.8109066104575023e-05,
"loss": 5.2738,
"step": 530
},
{
"epoch": 0.59,
"grad_norm": 3.1763365268707275,
"learning_rate": 1.802522234901927e-05,
"loss": 4.4906,
"step": 531
},
{
"epoch": 0.5911111111111111,
"grad_norm": 2.969287157058716,
"learning_rate": 1.7941463578928086e-05,
"loss": 5.1068,
"step": 532
},
{
"epoch": 0.5922222222222222,
"grad_norm": 4.471690654754639,
"learning_rate": 1.7857790814875663e-05,
"loss": 4.1047,
"step": 533
},
{
"epoch": 0.5933333333333334,
"grad_norm": 3.2363221645355225,
"learning_rate": 1.7774205076388206e-05,
"loss": 4.587,
"step": 534
},
{
"epoch": 0.5944444444444444,
"grad_norm": 2.6446151733398438,
"learning_rate": 1.7690707381931583e-05,
"loss": 4.4606,
"step": 535
},
{
"epoch": 0.5955555555555555,
"grad_norm": 3.1010208129882812,
"learning_rate": 1.7607298748898842e-05,
"loss": 4.3764,
"step": 536
},
{
"epoch": 0.5966666666666667,
"grad_norm": 2.4426517486572266,
"learning_rate": 1.7523980193597836e-05,
"loss": 4.4518,
"step": 537
},
{
"epoch": 0.5977777777777777,
"grad_norm": 1.913076400756836,
"learning_rate": 1.744075273123889e-05,
"loss": 4.2522,
"step": 538
},
{
"epoch": 0.5988888888888889,
"grad_norm": 2.492178440093994,
"learning_rate": 1.735761737592236e-05,
"loss": 4.5726,
"step": 539
},
{
"epoch": 0.6,
"grad_norm": 2.457730531692505,
"learning_rate": 1.7274575140626318e-05,
"loss": 4.662,
"step": 540
},
{
"epoch": 0.6011111111111112,
"grad_norm": 2.3263602256774902,
"learning_rate": 1.7191627037194186e-05,
"loss": 4.4692,
"step": 541
},
{
"epoch": 0.6022222222222222,
"grad_norm": 3.4461264610290527,
"learning_rate": 1.7108774076322443e-05,
"loss": 4.287,
"step": 542
},
{
"epoch": 0.6033333333333334,
"grad_norm": 3.4049248695373535,
"learning_rate": 1.702601726754825e-05,
"loss": 3.8536,
"step": 543
},
{
"epoch": 0.6044444444444445,
"grad_norm": 3.2425801753997803,
"learning_rate": 1.6943357619237226e-05,
"loss": 3.8095,
"step": 544
},
{
"epoch": 0.6055555555555555,
"grad_norm": 3.209322452545166,
"learning_rate": 1.686079613857109e-05,
"loss": 4.0113,
"step": 545
},
{
"epoch": 0.6066666666666667,
"grad_norm": 2.507138729095459,
"learning_rate": 1.677833383153542e-05,
"loss": 4.3496,
"step": 546
},
{
"epoch": 0.6077777777777778,
"grad_norm": 3.377285957336426,
"learning_rate": 1.6695971702907426e-05,
"loss": 4.2639,
"step": 547
},
{
"epoch": 0.6088888888888889,
"grad_norm": 3.625976324081421,
"learning_rate": 1.6613710756243626e-05,
"loss": 4.1149,
"step": 548
},
{
"epoch": 0.61,
"grad_norm": 2.757136821746826,
"learning_rate": 1.6531551993867717e-05,
"loss": 4.0329,
"step": 549
},
{
"epoch": 0.6111111111111112,
"grad_norm": 3.1707332134246826,
"learning_rate": 1.6449496416858284e-05,
"loss": 4.3594,
"step": 550
},
{
"epoch": 0.6122222222222222,
"grad_norm": 3.1145691871643066,
"learning_rate": 1.6367545025036636e-05,
"loss": 4.6404,
"step": 551
},
{
"epoch": 0.6133333333333333,
"grad_norm": 3.2511072158813477,
"learning_rate": 1.6285698816954624e-05,
"loss": 4.0102,
"step": 552
},
{
"epoch": 0.6144444444444445,
"grad_norm": 3.0803847312927246,
"learning_rate": 1.6203958789882456e-05,
"loss": 4.2037,
"step": 553
},
{
"epoch": 0.6155555555555555,
"grad_norm": 2.6308162212371826,
"learning_rate": 1.612232593979658e-05,
"loss": 4.1265,
"step": 554
},
{
"epoch": 0.6166666666666667,
"grad_norm": 2.719158172607422,
"learning_rate": 1.6040801261367493e-05,
"loss": 4.4969,
"step": 555
},
{
"epoch": 0.6177777777777778,
"grad_norm": 4.231455326080322,
"learning_rate": 1.5959385747947698e-05,
"loss": 3.8209,
"step": 556
},
{
"epoch": 0.6188888888888889,
"grad_norm": 3.452610731124878,
"learning_rate": 1.5878080391559508e-05,
"loss": 4.8977,
"step": 557
},
{
"epoch": 0.62,
"grad_norm": 2.3357810974121094,
"learning_rate": 1.5796886182883053e-05,
"loss": 4.2065,
"step": 558
},
{
"epoch": 0.6211111111111111,
"grad_norm": 3.3810150623321533,
"learning_rate": 1.5715804111244137e-05,
"loss": 4.1377,
"step": 559
},
{
"epoch": 0.6222222222222222,
"grad_norm": 2.561292886734009,
"learning_rate": 1.56348351646022e-05,
"loss": 4.304,
"step": 560
},
{
"epoch": 0.6233333333333333,
"grad_norm": 4.378098011016846,
"learning_rate": 1.5553980329538326e-05,
"loss": 4.2043,
"step": 561
},
{
"epoch": 0.6244444444444445,
"grad_norm": 2.817155599594116,
"learning_rate": 1.547324059124315e-05,
"loss": 4.8234,
"step": 562
},
{
"epoch": 0.6255555555555555,
"grad_norm": 2.7013378143310547,
"learning_rate": 1.539261693350491e-05,
"loss": 4.5842,
"step": 563
},
{
"epoch": 0.6266666666666667,
"grad_norm": 3.0469796657562256,
"learning_rate": 1.5312110338697426e-05,
"loss": 4.475,
"step": 564
},
{
"epoch": 0.6277777777777778,
"grad_norm": 2.944330930709839,
"learning_rate": 1.523172178776816e-05,
"loss": 4.7153,
"step": 565
},
{
"epoch": 0.6288888888888889,
"grad_norm": 3.1219630241394043,
"learning_rate": 1.5151452260226224e-05,
"loss": 4.2081,
"step": 566
},
{
"epoch": 0.63,
"grad_norm": 3.091395139694214,
"learning_rate": 1.5071302734130489e-05,
"loss": 3.8313,
"step": 567
},
{
"epoch": 0.6311111111111111,
"grad_norm": 3.610748767852783,
"learning_rate": 1.4991274186077632e-05,
"loss": 4.3207,
"step": 568
},
{
"epoch": 0.6322222222222222,
"grad_norm": 2.4512412548065186,
"learning_rate": 1.4911367591190248e-05,
"loss": 4.6405,
"step": 569
},
{
"epoch": 0.6333333333333333,
"grad_norm": 2.7447104454040527,
"learning_rate": 1.4831583923104999e-05,
"loss": 4.3182,
"step": 570
},
{
"epoch": 0.6344444444444445,
"grad_norm": 2.834606409072876,
"learning_rate": 1.475192415396068e-05,
"loss": 4.0676,
"step": 571
},
{
"epoch": 0.6355555555555555,
"grad_norm": 2.1336636543273926,
"learning_rate": 1.467238925438646e-05,
"loss": 4.4181,
"step": 572
},
{
"epoch": 0.6366666666666667,
"grad_norm": 2.7370517253875732,
"learning_rate": 1.4592980193489975e-05,
"loss": 4.2872,
"step": 573
},
{
"epoch": 0.6377777777777778,
"grad_norm": 2.991546392440796,
"learning_rate": 1.4513697938845572e-05,
"loss": 3.8864,
"step": 574
},
{
"epoch": 0.6388888888888888,
"grad_norm": 2.664534330368042,
"learning_rate": 1.443454345648252e-05,
"loss": 4.5979,
"step": 575
},
{
"epoch": 0.64,
"grad_norm": 2.904681921005249,
"learning_rate": 1.4355517710873184e-05,
"loss": 4.2368,
"step": 576
},
{
"epoch": 0.6411111111111111,
"grad_norm": 3.317148447036743,
"learning_rate": 1.4276621664921357e-05,
"loss": 3.8986,
"step": 577
},
{
"epoch": 0.6422222222222222,
"grad_norm": 3.1722943782806396,
"learning_rate": 1.4197856279950438e-05,
"loss": 4.3489,
"step": 578
},
{
"epoch": 0.6433333333333333,
"grad_norm": 2.8325436115264893,
"learning_rate": 1.4119222515691816e-05,
"loss": 4.7594,
"step": 579
},
{
"epoch": 0.6444444444444445,
"grad_norm": 3.2218034267425537,
"learning_rate": 1.4040721330273062e-05,
"loss": 4.2976,
"step": 580
},
{
"epoch": 0.6455555555555555,
"grad_norm": 3.36842679977417,
"learning_rate": 1.3962353680206373e-05,
"loss": 4.2204,
"step": 581
},
{
"epoch": 0.6466666666666666,
"grad_norm": 3.8246467113494873,
"learning_rate": 1.388412052037682e-05,
"loss": 4.1247,
"step": 582
},
{
"epoch": 0.6477777777777778,
"grad_norm": 3.131218910217285,
"learning_rate": 1.380602280403076e-05,
"loss": 4.3663,
"step": 583
},
{
"epoch": 0.6488888888888888,
"grad_norm": 3.3939664363861084,
"learning_rate": 1.3728061482764238e-05,
"loss": 4.3556,
"step": 584
},
{
"epoch": 0.65,
"grad_norm": 2.252523183822632,
"learning_rate": 1.3650237506511331e-05,
"loss": 4.0815,
"step": 585
},
{
"epoch": 0.6511111111111111,
"grad_norm": 4.023004055023193,
"learning_rate": 1.3572551823532654e-05,
"loss": 4.6529,
"step": 586
},
{
"epoch": 0.6522222222222223,
"grad_norm": 2.3044891357421875,
"learning_rate": 1.349500538040371e-05,
"loss": 4.7574,
"step": 587
},
{
"epoch": 0.6533333333333333,
"grad_norm": 2.99569034576416,
"learning_rate": 1.3417599122003464e-05,
"loss": 4.1623,
"step": 588
},
{
"epoch": 0.6544444444444445,
"grad_norm": 3.384570360183716,
"learning_rate": 1.3340333991502724e-05,
"loss": 4.3638,
"step": 589
},
{
"epoch": 0.6555555555555556,
"grad_norm": 3.1633384227752686,
"learning_rate": 1.3263210930352737e-05,
"loss": 4.2265,
"step": 590
},
{
"epoch": 0.6566666666666666,
"grad_norm": 2.922513484954834,
"learning_rate": 1.3186230878273653e-05,
"loss": 4.2723,
"step": 591
},
{
"epoch": 0.6577777777777778,
"grad_norm": 2.408703327178955,
"learning_rate": 1.3109394773243117e-05,
"loss": 4.3487,
"step": 592
},
{
"epoch": 0.6588888888888889,
"grad_norm": 2.839890718460083,
"learning_rate": 1.3032703551484832e-05,
"loss": 4.3941,
"step": 593
},
{
"epoch": 0.66,
"grad_norm": 2.723208427429199,
"learning_rate": 1.2956158147457115e-05,
"loss": 4.5581,
"step": 594
},
{
"epoch": 0.6611111111111111,
"grad_norm": 3.162594795227051,
"learning_rate": 1.2879759493841575e-05,
"loss": 4.6902,
"step": 595
},
{
"epoch": 0.6622222222222223,
"grad_norm": 2.862002372741699,
"learning_rate": 1.280350852153168e-05,
"loss": 4.6792,
"step": 596
},
{
"epoch": 0.6633333333333333,
"grad_norm": 3.029798746109009,
"learning_rate": 1.272740615962148e-05,
"loss": 4.2943,
"step": 597
},
{
"epoch": 0.6644444444444444,
"grad_norm": 2.505032539367676,
"learning_rate": 1.2651453335394231e-05,
"loss": 4.373,
"step": 598
},
{
"epoch": 0.6655555555555556,
"grad_norm": 3.039720058441162,
"learning_rate": 1.2575650974311119e-05,
"loss": 4.5615,
"step": 599
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.9847793579101562,
"learning_rate": 1.2500000000000006e-05,
"loss": 4.7977,
"step": 600
},
{
"epoch": 0.6677777777777778,
"grad_norm": 2.6300618648529053,
"learning_rate": 1.2424501334244123e-05,
"loss": 4.3867,
"step": 601
},
{
"epoch": 0.6688888888888889,
"grad_norm": 3.1111793518066406,
"learning_rate": 1.234915589697091e-05,
"loss": 4.3681,
"step": 602
},
{
"epoch": 0.67,
"grad_norm": 3.274426221847534,
"learning_rate": 1.2273964606240718e-05,
"loss": 4.3242,
"step": 603
},
{
"epoch": 0.6711111111111111,
"grad_norm": 2.1483371257781982,
"learning_rate": 1.2198928378235716e-05,
"loss": 4.468,
"step": 604
},
{
"epoch": 0.6722222222222223,
"grad_norm": 3.4644694328308105,
"learning_rate": 1.2124048127248644e-05,
"loss": 4.5052,
"step": 605
},
{
"epoch": 0.6733333333333333,
"grad_norm": 2.960430860519409,
"learning_rate": 1.2049324765671749e-05,
"loss": 4.3371,
"step": 606
},
{
"epoch": 0.6744444444444444,
"grad_norm": 3.3487277030944824,
"learning_rate": 1.19747592039856e-05,
"loss": 4.8553,
"step": 607
},
{
"epoch": 0.6755555555555556,
"grad_norm": 3.0505776405334473,
"learning_rate": 1.1900352350748026e-05,
"loss": 4.2182,
"step": 608
},
{
"epoch": 0.6766666666666666,
"grad_norm": 3.3368704319000244,
"learning_rate": 1.1826105112583061e-05,
"loss": 3.9389,
"step": 609
},
{
"epoch": 0.6777777777777778,
"grad_norm": 3.4237194061279297,
"learning_rate": 1.175201839416988e-05,
"loss": 4.4959,
"step": 610
},
{
"epoch": 0.6788888888888889,
"grad_norm": 2.389981269836426,
"learning_rate": 1.167809309823175e-05,
"loss": 4.1658,
"step": 611
},
{
"epoch": 0.68,
"grad_norm": 4.309886455535889,
"learning_rate": 1.1604330125525079e-05,
"loss": 3.8112,
"step": 612
},
{
"epoch": 0.6811111111111111,
"grad_norm": 2.3204903602600098,
"learning_rate": 1.1530730374828422e-05,
"loss": 4.2661,
"step": 613
},
{
"epoch": 0.6822222222222222,
"grad_norm": 3.4007372856140137,
"learning_rate": 1.1457294742931507e-05,
"loss": 4.3726,
"step": 614
},
{
"epoch": 0.6833333333333333,
"grad_norm": 3.3282923698425293,
"learning_rate": 1.1384024124624324e-05,
"loss": 4.3936,
"step": 615
},
{
"epoch": 0.6844444444444444,
"grad_norm": 3.03949236869812,
"learning_rate": 1.1310919412686247e-05,
"loss": 3.7927,
"step": 616
},
{
"epoch": 0.6855555555555556,
"grad_norm": 2.703687906265259,
"learning_rate": 1.123798149787511e-05,
"loss": 4.5,
"step": 617
},
{
"epoch": 0.6866666666666666,
"grad_norm": 3.9763340950012207,
"learning_rate": 1.11652112689164e-05,
"loss": 3.7004,
"step": 618
},
{
"epoch": 0.6877777777777778,
"grad_norm": 4.126737594604492,
"learning_rate": 1.109260961249238e-05,
"loss": 4.308,
"step": 619
},
{
"epoch": 0.6888888888888889,
"grad_norm": 2.2291946411132812,
"learning_rate": 1.1020177413231334e-05,
"loss": 4.4126,
"step": 620
},
{
"epoch": 0.69,
"grad_norm": 3.0533227920532227,
"learning_rate": 1.0947915553696742e-05,
"loss": 3.8855,
"step": 621
},
{
"epoch": 0.6911111111111111,
"grad_norm": 3.887996196746826,
"learning_rate": 1.0875824914376553e-05,
"loss": 4.3823,
"step": 622
},
{
"epoch": 0.6922222222222222,
"grad_norm": 2.5938265323638916,
"learning_rate": 1.0803906373672476e-05,
"loss": 4.2245,
"step": 623
},
{
"epoch": 0.6933333333333334,
"grad_norm": 3.3216285705566406,
"learning_rate": 1.0732160807889211e-05,
"loss": 4.1916,
"step": 624
},
{
"epoch": 0.6944444444444444,
"grad_norm": 3.3373751640319824,
"learning_rate": 1.0660589091223855e-05,
"loss": 4.3503,
"step": 625
},
{
"epoch": 0.6955555555555556,
"grad_norm": 2.2890090942382812,
"learning_rate": 1.058919209575517e-05,
"loss": 4.0717,
"step": 626
},
{
"epoch": 0.6966666666666667,
"grad_norm": 4.276199817657471,
"learning_rate": 1.0517970691433035e-05,
"loss": 4.7337,
"step": 627
},
{
"epoch": 0.6977777777777778,
"grad_norm": 2.4809815883636475,
"learning_rate": 1.0446925746067768e-05,
"loss": 4.5435,
"step": 628
},
{
"epoch": 0.6988888888888889,
"grad_norm": 4.138044357299805,
"learning_rate": 1.0376058125319613e-05,
"loss": 4.2157,
"step": 629
},
{
"epoch": 0.7,
"grad_norm": 2.8675029277801514,
"learning_rate": 1.0305368692688174e-05,
"loss": 4.7462,
"step": 630
},
{
"epoch": 0.7011111111111111,
"grad_norm": 2.406512498855591,
"learning_rate": 1.0234858309501862e-05,
"loss": 3.879,
"step": 631
},
{
"epoch": 0.7022222222222222,
"grad_norm": 2.695511817932129,
"learning_rate": 1.0164527834907467e-05,
"loss": 4.4111,
"step": 632
},
{
"epoch": 0.7033333333333334,
"grad_norm": 2.458010673522949,
"learning_rate": 1.0094378125859602e-05,
"loss": 4.5713,
"step": 633
},
{
"epoch": 0.7044444444444444,
"grad_norm": 3.0153543949127197,
"learning_rate": 1.0024410037110357e-05,
"loss": 4.3096,
"step": 634
},
{
"epoch": 0.7055555555555556,
"grad_norm": 2.6395087242126465,
"learning_rate": 9.954624421198792e-06,
"loss": 4.4095,
"step": 635
},
{
"epoch": 0.7066666666666667,
"grad_norm": 2.174259662628174,
"learning_rate": 9.88502212844063e-06,
"loss": 4.396,
"step": 636
},
{
"epoch": 0.7077777777777777,
"grad_norm": 2.7941372394561768,
"learning_rate": 9.815604006917839e-06,
"loss": 4.1945,
"step": 637
},
{
"epoch": 0.7088888888888889,
"grad_norm": 3.5960729122161865,
"learning_rate": 9.746370902468311e-06,
"loss": 4.4889,
"step": 638
},
{
"epoch": 0.71,
"grad_norm": 3.816847562789917,
"learning_rate": 9.677323658675594e-06,
"loss": 4.6575,
"step": 639
},
{
"epoch": 0.7111111111111111,
"grad_norm": 2.2747933864593506,
"learning_rate": 9.608463116858542e-06,
"loss": 4.5163,
"step": 640
},
{
"epoch": 0.7122222222222222,
"grad_norm": 3.0500824451446533,
"learning_rate": 9.539790116061151e-06,
"loss": 4.5815,
"step": 641
},
{
"epoch": 0.7133333333333334,
"grad_norm": 3.7249062061309814,
"learning_rate": 9.471305493042243e-06,
"loss": 4.206,
"step": 642
},
{
"epoch": 0.7144444444444444,
"grad_norm": 3.3056654930114746,
"learning_rate": 9.403010082265351e-06,
"loss": 4.3168,
"step": 643
},
{
"epoch": 0.7155555555555555,
"grad_norm": 3.2421038150787354,
"learning_rate": 9.334904715888495e-06,
"loss": 4.815,
"step": 644
},
{
"epoch": 0.7166666666666667,
"grad_norm": 3.044701337814331,
"learning_rate": 9.266990223754069e-06,
"loss": 4.4632,
"step": 645
},
{
"epoch": 0.7177777777777777,
"grad_norm": 3.229823589324951,
"learning_rate": 9.199267433378727e-06,
"loss": 4.5471,
"step": 646
},
{
"epoch": 0.7188888888888889,
"grad_norm": 2.7537453174591064,
"learning_rate": 9.131737169943314e-06,
"loss": 4.0644,
"step": 647
},
{
"epoch": 0.72,
"grad_norm": 2.645606517791748,
"learning_rate": 9.064400256282757e-06,
"loss": 4.3359,
"step": 648
},
{
"epoch": 0.7211111111111111,
"grad_norm": 2.989220380783081,
"learning_rate": 8.997257512876108e-06,
"loss": 4.7121,
"step": 649
},
{
"epoch": 0.7222222222222222,
"grad_norm": 2.0871849060058594,
"learning_rate": 8.930309757836517e-06,
"loss": 4.7126,
"step": 650
},
{
"epoch": 0.7233333333333334,
"grad_norm": 2.9518845081329346,
"learning_rate": 8.863557806901233e-06,
"loss": 3.8763,
"step": 651
},
{
"epoch": 0.7244444444444444,
"grad_norm": 3.165712356567383,
"learning_rate": 8.797002473421728e-06,
"loss": 4.0458,
"step": 652
},
{
"epoch": 0.7255555555555555,
"grad_norm": 2.202949285507202,
"learning_rate": 8.73064456835373e-06,
"loss": 4.5542,
"step": 653
},
{
"epoch": 0.7266666666666667,
"grad_norm": 3.328310489654541,
"learning_rate": 8.664484900247363e-06,
"loss": 4.3315,
"step": 654
},
{
"epoch": 0.7277777777777777,
"grad_norm": 3.02006459236145,
"learning_rate": 8.598524275237322e-06,
"loss": 4.4394,
"step": 655
},
{
"epoch": 0.7288888888888889,
"grad_norm": 2.5074918270111084,
"learning_rate": 8.532763497032987e-06,
"loss": 4.2377,
"step": 656
},
{
"epoch": 0.73,
"grad_norm": 2.847383975982666,
"learning_rate": 8.467203366908707e-06,
"loss": 4.1128,
"step": 657
},
{
"epoch": 0.7311111111111112,
"grad_norm": 2.9188661575317383,
"learning_rate": 8.40184468369396e-06,
"loss": 4.2968,
"step": 658
},
{
"epoch": 0.7322222222222222,
"grad_norm": 3.0603976249694824,
"learning_rate": 8.33668824376369e-06,
"loss": 4.175,
"step": 659
},
{
"epoch": 0.7333333333333333,
"grad_norm": 3.114797592163086,
"learning_rate": 8.271734841028553e-06,
"loss": 3.9943,
"step": 660
},
{
"epoch": 0.7344444444444445,
"grad_norm": 3.7101423740386963,
"learning_rate": 8.206985266925249e-06,
"loss": 4.4357,
"step": 661
},
{
"epoch": 0.7355555555555555,
"grad_norm": 4.916779041290283,
"learning_rate": 8.142440310406924e-06,
"loss": 4.9196,
"step": 662
},
{
"epoch": 0.7366666666666667,
"grad_norm": 3.456704616546631,
"learning_rate": 8.078100757933485e-06,
"loss": 4.7176,
"step": 663
},
{
"epoch": 0.7377777777777778,
"grad_norm": 3.1686041355133057,
"learning_rate": 8.013967393462094e-06,
"loss": 4.3609,
"step": 664
},
{
"epoch": 0.7388888888888889,
"grad_norm": 2.5040740966796875,
"learning_rate": 7.950040998437542e-06,
"loss": 4.0855,
"step": 665
},
{
"epoch": 0.74,
"grad_norm": 3.923576831817627,
"learning_rate": 7.886322351782783e-06,
"loss": 3.7909,
"step": 666
},
{
"epoch": 0.7411111111111112,
"grad_norm": 3.781975269317627,
"learning_rate": 7.822812229889428e-06,
"loss": 4.4285,
"step": 667
},
{
"epoch": 0.7422222222222222,
"grad_norm": 2.2183735370635986,
"learning_rate": 7.759511406608255e-06,
"loss": 4.2021,
"step": 668
},
{
"epoch": 0.7433333333333333,
"grad_norm": 2.5517868995666504,
"learning_rate": 7.696420653239833e-06,
"loss": 4.0788,
"step": 669
},
{
"epoch": 0.7444444444444445,
"grad_norm": 3.1512372493743896,
"learning_rate": 7.633540738525066e-06,
"loss": 4.128,
"step": 670
},
{
"epoch": 0.7455555555555555,
"grad_norm": 2.474193811416626,
"learning_rate": 7.570872428635889e-06,
"loss": 4.6547,
"step": 671
},
{
"epoch": 0.7466666666666667,
"grad_norm": 3.1348423957824707,
"learning_rate": 7.508416487165862e-06,
"loss": 4.6837,
"step": 672
},
{
"epoch": 0.7477777777777778,
"grad_norm": 3.7456905841827393,
"learning_rate": 7.4461736751209405e-06,
"loss": 4.5965,
"step": 673
},
{
"epoch": 0.7488888888888889,
"grad_norm": 2.9826486110687256,
"learning_rate": 7.384144750910133e-06,
"loss": 4.1727,
"step": 674
},
{
"epoch": 0.75,
"grad_norm": 3.4859273433685303,
"learning_rate": 7.3223304703363135e-06,
"loss": 4.188,
"step": 675
},
{
"epoch": 0.7511111111111111,
"grad_norm": 3.679555892944336,
"learning_rate": 7.260731586586983e-06,
"loss": 4.3418,
"step": 676
},
{
"epoch": 0.7522222222222222,
"grad_norm": 4.997726917266846,
"learning_rate": 7.19934885022509e-06,
"loss": 4.0094,
"step": 677
},
{
"epoch": 0.7533333333333333,
"grad_norm": 3.3679285049438477,
"learning_rate": 7.138183009179922e-06,
"loss": 4.3927,
"step": 678
},
{
"epoch": 0.7544444444444445,
"grad_norm": 3.4834442138671875,
"learning_rate": 7.0772348087379315e-06,
"loss": 4.0955,
"step": 679
},
{
"epoch": 0.7555555555555555,
"grad_norm": 3.499994993209839,
"learning_rate": 7.016504991533726e-06,
"loss": 4.2323,
"step": 680
},
{
"epoch": 0.7566666666666667,
"grad_norm": 2.678922176361084,
"learning_rate": 6.9559942975409465e-06,
"loss": 4.549,
"step": 681
},
{
"epoch": 0.7577777777777778,
"grad_norm": 2.43112850189209,
"learning_rate": 6.895703464063319e-06,
"loss": 4.4337,
"step": 682
},
{
"epoch": 0.7588888888888888,
"grad_norm": 2.440561294555664,
"learning_rate": 6.835633225725605e-06,
"loss": 4.0068,
"step": 683
},
{
"epoch": 0.76,
"grad_norm": 3.2796149253845215,
"learning_rate": 6.775784314464717e-06,
"loss": 4.2503,
"step": 684
},
{
"epoch": 0.7611111111111111,
"grad_norm": 3.672053098678589,
"learning_rate": 6.716157459520739e-06,
"loss": 3.8174,
"step": 685
},
{
"epoch": 0.7622222222222222,
"grad_norm": 4.125499248504639,
"learning_rate": 6.656753387428089e-06,
"loss": 4.1682,
"step": 686
},
{
"epoch": 0.7633333333333333,
"grad_norm": 2.379180669784546,
"learning_rate": 6.5975728220066425e-06,
"loss": 3.9803,
"step": 687
},
{
"epoch": 0.7644444444444445,
"grad_norm": 2.5495798587799072,
"learning_rate": 6.538616484352902e-06,
"loss": 4.4606,
"step": 688
},
{
"epoch": 0.7655555555555555,
"grad_norm": 3.079115629196167,
"learning_rate": 6.47988509283125e-06,
"loss": 4.4226,
"step": 689
},
{
"epoch": 0.7666666666666667,
"grad_norm": 3.088437795639038,
"learning_rate": 6.421379363065142e-06,
"loss": 4.5023,
"step": 690
},
{
"epoch": 0.7677777777777778,
"grad_norm": 6.411847114562988,
"learning_rate": 6.363100007928446e-06,
"loss": 4.488,
"step": 691
},
{
"epoch": 0.7688888888888888,
"grad_norm": 2.622467517852783,
"learning_rate": 6.305047737536707e-06,
"loss": 3.7526,
"step": 692
},
{
"epoch": 0.77,
"grad_norm": 3.189143657684326,
"learning_rate": 6.247223259238511e-06,
"loss": 4.0969,
"step": 693
},
{
"epoch": 0.7711111111111111,
"grad_norm": 3.9353489875793457,
"learning_rate": 6.189627277606894e-06,
"loss": 4.0019,
"step": 694
},
{
"epoch": 0.7722222222222223,
"grad_norm": 2.4755685329437256,
"learning_rate": 6.1322604944307e-06,
"loss": 4.0007,
"step": 695
},
{
"epoch": 0.7733333333333333,
"grad_norm": 3.4721150398254395,
"learning_rate": 6.075123608706093e-06,
"loss": 4.1602,
"step": 696
},
{
"epoch": 0.7744444444444445,
"grad_norm": 2.571910858154297,
"learning_rate": 6.01821731662798e-06,
"loss": 4.5001,
"step": 697
},
{
"epoch": 0.7755555555555556,
"grad_norm": 2.0862197875976562,
"learning_rate": 5.961542311581586e-06,
"loss": 4.2366,
"step": 698
},
{
"epoch": 0.7766666666666666,
"grad_norm": 2.0852468013763428,
"learning_rate": 5.905099284133952e-06,
"loss": 4.1254,
"step": 699
},
{
"epoch": 0.7777777777777778,
"grad_norm": 3.239201784133911,
"learning_rate": 5.848888922025553e-06,
"loss": 4.0255,
"step": 700
},
{
"epoch": 0.7788888888888889,
"grad_norm": 3.3855128288269043,
"learning_rate": 5.792911910161922e-06,
"loss": 4.4192,
"step": 701
},
{
"epoch": 0.78,
"grad_norm": 2.8477554321289062,
"learning_rate": 5.737168930605272e-06,
"loss": 4.6303,
"step": 702
},
{
"epoch": 0.7811111111111111,
"grad_norm": 2.2901785373687744,
"learning_rate": 5.681660662566224e-06,
"loss": 4.3732,
"step": 703
},
{
"epoch": 0.7822222222222223,
"grad_norm": 3.0778727531433105,
"learning_rate": 5.626387782395512e-06,
"loss": 3.9875,
"step": 704
},
{
"epoch": 0.7833333333333333,
"grad_norm": 2.725858449935913,
"learning_rate": 5.571350963575728e-06,
"loss": 4.274,
"step": 705
},
{
"epoch": 0.7844444444444445,
"grad_norm": 2.9397945404052734,
"learning_rate": 5.5165508767131415e-06,
"loss": 3.8244,
"step": 706
},
{
"epoch": 0.7855555555555556,
"grad_norm": 3.8967740535736084,
"learning_rate": 5.461988189529529e-06,
"loss": 4.3882,
"step": 707
},
{
"epoch": 0.7866666666666666,
"grad_norm": 3.6964597702026367,
"learning_rate": 5.4076635668540075e-06,
"loss": 4.6633,
"step": 708
},
{
"epoch": 0.7877777777777778,
"grad_norm": 3.322463035583496,
"learning_rate": 5.3535776706149505e-06,
"loss": 4.0363,
"step": 709
},
{
"epoch": 0.7888888888888889,
"grad_norm": 3.0608179569244385,
"learning_rate": 5.299731159831953e-06,
"loss": 3.9402,
"step": 710
},
{
"epoch": 0.79,
"grad_norm": 3.0244638919830322,
"learning_rate": 5.24612469060774e-06,
"loss": 4.7072,
"step": 711
},
{
"epoch": 0.7911111111111111,
"grad_norm": 3.1370954513549805,
"learning_rate": 5.192758916120236e-06,
"loss": 3.6739,
"step": 712
},
{
"epoch": 0.7922222222222223,
"grad_norm": 2.96083402633667,
"learning_rate": 5.139634486614544e-06,
"loss": 4.561,
"step": 713
},
{
"epoch": 0.7933333333333333,
"grad_norm": 3.8822271823883057,
"learning_rate": 5.086752049395094e-06,
"loss": 4.6279,
"step": 714
},
{
"epoch": 0.7944444444444444,
"grad_norm": 3.556574583053589,
"learning_rate": 5.034112248817685e-06,
"loss": 4.029,
"step": 715
},
{
"epoch": 0.7955555555555556,
"grad_norm": 2.4491796493530273,
"learning_rate": 4.981715726281666e-06,
"loss": 4.0463,
"step": 716
},
{
"epoch": 0.7966666666666666,
"grad_norm": 3.314884901046753,
"learning_rate": 4.929563120222141e-06,
"loss": 4.2907,
"step": 717
},
{
"epoch": 0.7977777777777778,
"grad_norm": 2.998528480529785,
"learning_rate": 4.877655066102149e-06,
"loss": 4.1238,
"step": 718
},
{
"epoch": 0.7988888888888889,
"grad_norm": 2.5107343196868896,
"learning_rate": 4.825992196404957e-06,
"loss": 4.8033,
"step": 719
},
{
"epoch": 0.8,
"grad_norm": 3.2697060108184814,
"learning_rate": 4.7745751406263165e-06,
"loss": 3.598,
"step": 720
},
{
"epoch": 0.8011111111111111,
"grad_norm": 2.1032586097717285,
"learning_rate": 4.723404525266839e-06,
"loss": 4.0662,
"step": 721
},
{
"epoch": 0.8022222222222222,
"grad_norm": 2.2804932594299316,
"learning_rate": 4.672480973824311e-06,
"loss": 4.6152,
"step": 722
},
{
"epoch": 0.8033333333333333,
"grad_norm": 3.123626470565796,
"learning_rate": 4.621805106786142e-06,
"loss": 4.8937,
"step": 723
},
{
"epoch": 0.8044444444444444,
"grad_norm": 3.6036055088043213,
"learning_rate": 4.571377541621788e-06,
"loss": 4.5689,
"step": 724
},
{
"epoch": 0.8055555555555556,
"grad_norm": 3.6055924892425537,
"learning_rate": 4.521198892775203e-06,
"loss": 3.8795,
"step": 725
},
{
"epoch": 0.8066666666666666,
"grad_norm": 2.7153923511505127,
"learning_rate": 4.4712697716574e-06,
"loss": 4.3875,
"step": 726
},
{
"epoch": 0.8077777777777778,
"grad_norm": 3.3169379234313965,
"learning_rate": 4.421590786638951e-06,
"loss": 3.9778,
"step": 727
},
{
"epoch": 0.8088888888888889,
"grad_norm": 3.1773722171783447,
"learning_rate": 4.372162543042624e-06,
"loss": 4.7571,
"step": 728
},
{
"epoch": 0.81,
"grad_norm": 2.2997097969055176,
"learning_rate": 4.322985643135952e-06,
"loss": 4.4563,
"step": 729
},
{
"epoch": 0.8111111111111111,
"grad_norm": 3.0270705223083496,
"learning_rate": 4.274060686123959e-06,
"loss": 4.1449,
"step": 730
},
{
"epoch": 0.8122222222222222,
"grad_norm": 3.159769296646118,
"learning_rate": 4.225388268141797e-06,
"loss": 4.2249,
"step": 731
},
{
"epoch": 0.8133333333333334,
"grad_norm": 3.2132275104522705,
"learning_rate": 4.176968982247514e-06,
"loss": 3.7882,
"step": 732
},
{
"epoch": 0.8144444444444444,
"grad_norm": 2.145144462585449,
"learning_rate": 4.128803418414839e-06,
"loss": 3.8867,
"step": 733
},
{
"epoch": 0.8155555555555556,
"grad_norm": 3.366910696029663,
"learning_rate": 4.08089216352596e-06,
"loss": 4.5035,
"step": 734
},
{
"epoch": 0.8166666666666667,
"grad_norm": 3.334970235824585,
"learning_rate": 4.0332358013644016e-06,
"loss": 3.9257,
"step": 735
},
{
"epoch": 0.8177777777777778,
"grad_norm": 3.1020681858062744,
"learning_rate": 3.985834912607894e-06,
"loss": 4.4492,
"step": 736
},
{
"epoch": 0.8188888888888889,
"grad_norm": 2.6478145122528076,
"learning_rate": 3.938690074821313e-06,
"loss": 3.7261,
"step": 737
},
{
"epoch": 0.82,
"grad_norm": 4.352097988128662,
"learning_rate": 3.891801862449629e-06,
"loss": 3.9445,
"step": 738
},
{
"epoch": 0.8211111111111111,
"grad_norm": 2.18900728225708,
"learning_rate": 3.845170846810902e-06,
"loss": 4.1073,
"step": 739
},
{
"epoch": 0.8222222222222222,
"grad_norm": 3.0373637676239014,
"learning_rate": 3.798797596089351e-06,
"loss": 4.4156,
"step": 740
},
{
"epoch": 0.8233333333333334,
"grad_norm": 2.5862083435058594,
"learning_rate": 3.752682675328406e-06,
"loss": 4.5457,
"step": 741
},
{
"epoch": 0.8244444444444444,
"grad_norm": 2.2058353424072266,
"learning_rate": 3.7068266464238084e-06,
"loss": 3.9846,
"step": 742
},
{
"epoch": 0.8255555555555556,
"grad_norm": 2.6216089725494385,
"learning_rate": 3.661230068116811e-06,
"loss": 4.0467,
"step": 743
},
{
"epoch": 0.8266666666666667,
"grad_norm": 2.514681816101074,
"learning_rate": 3.6158934959873353e-06,
"loss": 4.6638,
"step": 744
},
{
"epoch": 0.8277777777777777,
"grad_norm": 3.9455208778381348,
"learning_rate": 3.5708174824471947e-06,
"loss": 3.9174,
"step": 745
},
{
"epoch": 0.8288888888888889,
"grad_norm": 3.5356788635253906,
"learning_rate": 3.5260025767333893e-06,
"loss": 4.3817,
"step": 746
},
{
"epoch": 0.83,
"grad_norm": 2.5251104831695557,
"learning_rate": 3.4814493249014116e-06,
"loss": 4.3528,
"step": 747
},
{
"epoch": 0.8311111111111111,
"grad_norm": 3.5254111289978027,
"learning_rate": 3.4371582698185633e-06,
"loss": 3.896,
"step": 748
},
{
"epoch": 0.8322222222222222,
"grad_norm": 3.865900754928589,
"learning_rate": 3.393129951157384e-06,
"loss": 4.0786,
"step": 749
},
{
"epoch": 0.8333333333333334,
"grad_norm": 3.2861948013305664,
"learning_rate": 3.3493649053890326e-06,
"loss": 4.2953,
"step": 750
},
{
"epoch": 0.8344444444444444,
"grad_norm": 2.263437509536743,
"learning_rate": 3.305863665776793e-06,
"loss": 3.9898,
"step": 751
},
{
"epoch": 0.8355555555555556,
"grad_norm": 3.1119070053100586,
"learning_rate": 3.262626762369525e-06,
"loss": 4.3465,
"step": 752
},
{
"epoch": 0.8366666666666667,
"grad_norm": 3.526019334793091,
"learning_rate": 3.219654721995266e-06,
"loss": 4.4787,
"step": 753
},
{
"epoch": 0.8377777777777777,
"grad_norm": 3.2192864418029785,
"learning_rate": 3.176948068254762e-06,
"loss": 4.0421,
"step": 754
},
{
"epoch": 0.8388888888888889,
"grad_norm": 3.149193525314331,
"learning_rate": 3.1345073215151066e-06,
"loss": 4.7629,
"step": 755
},
{
"epoch": 0.84,
"grad_norm": 4.5595197677612305,
"learning_rate": 3.092332998903416e-06,
"loss": 4.4459,
"step": 756
},
{
"epoch": 0.8411111111111111,
"grad_norm": 2.972254991531372,
"learning_rate": 3.0504256143004866e-06,
"loss": 4.1769,
"step": 757
},
{
"epoch": 0.8422222222222222,
"grad_norm": 2.9443609714508057,
"learning_rate": 3.0087856783345914e-06,
"loss": 3.9543,
"step": 758
},
{
"epoch": 0.8433333333333334,
"grad_norm": 2.5057802200317383,
"learning_rate": 2.967413698375196e-06,
"loss": 4.6031,
"step": 759
},
{
"epoch": 0.8444444444444444,
"grad_norm": 3.078894853591919,
"learning_rate": 2.9263101785268254e-06,
"loss": 3.9584,
"step": 760
},
{
"epoch": 0.8455555555555555,
"grad_norm": 2.3315460681915283,
"learning_rate": 2.8854756196229016e-06,
"loss": 4.0571,
"step": 761
},
{
"epoch": 0.8466666666666667,
"grad_norm": 3.303471326828003,
"learning_rate": 2.8449105192196316e-06,
"loss": 4.3249,
"step": 762
},
{
"epoch": 0.8477777777777777,
"grad_norm": 3.592991590499878,
"learning_rate": 2.8046153715899692e-06,
"loss": 3.8605,
"step": 763
},
{
"epoch": 0.8488888888888889,
"grad_norm": 2.9544084072113037,
"learning_rate": 2.764590667717562e-06,
"loss": 4.0353,
"step": 764
},
{
"epoch": 0.85,
"grad_norm": 3.3249425888061523,
"learning_rate": 2.7248368952908053e-06,
"loss": 4.319,
"step": 765
},
{
"epoch": 0.8511111111111112,
"grad_norm": 4.874053001403809,
"learning_rate": 2.6853545386968606e-06,
"loss": 4.458,
"step": 766
},
{
"epoch": 0.8522222222222222,
"grad_norm": 2.6893086433410645,
"learning_rate": 2.646144079015797e-06,
"loss": 4.0627,
"step": 767
},
{
"epoch": 0.8533333333333334,
"grad_norm": 2.3588078022003174,
"learning_rate": 2.6072059940146775e-06,
"loss": 4.5383,
"step": 768
},
{
"epoch": 0.8544444444444445,
"grad_norm": 2.887497663497925,
"learning_rate": 2.5685407581417907e-06,
"loss": 4.0523,
"step": 769
},
{
"epoch": 0.8555555555555555,
"grad_norm": 2.968337059020996,
"learning_rate": 2.5301488425208296e-06,
"loss": 3.6994,
"step": 770
},
{
"epoch": 0.8566666666666667,
"grad_norm": 2.897677183151245,
"learning_rate": 2.492030714945162e-06,
"loss": 4.2895,
"step": 771
},
{
"epoch": 0.8577777777777778,
"grad_norm": 4.195917129516602,
"learning_rate": 2.454186839872158e-06,
"loss": 3.8239,
"step": 772
},
{
"epoch": 0.8588888888888889,
"grad_norm": 3.7905352115631104,
"learning_rate": 2.4166176784174795e-06,
"loss": 4.0074,
"step": 773
},
{
"epoch": 0.86,
"grad_norm": 3.2009356021881104,
"learning_rate": 2.379323688349516e-06,
"loss": 4.156,
"step": 774
},
{
"epoch": 0.8611111111111112,
"grad_norm": 3.1370296478271484,
"learning_rate": 2.3423053240837515e-06,
"loss": 4.7336,
"step": 775
},
{
"epoch": 0.8622222222222222,
"grad_norm": 2.4123268127441406,
"learning_rate": 2.3055630366772856e-06,
"loss": 4.3871,
"step": 776
},
{
"epoch": 0.8633333333333333,
"grad_norm": 2.304750919342041,
"learning_rate": 2.269097273823287e-06,
"loss": 4.492,
"step": 777
},
{
"epoch": 0.8644444444444445,
"grad_norm": 3.0814766883850098,
"learning_rate": 2.2329084798455746e-06,
"loss": 4.9373,
"step": 778
},
{
"epoch": 0.8655555555555555,
"grad_norm": 3.1735880374908447,
"learning_rate": 2.1969970956931762e-06,
"loss": 4.2959,
"step": 779
},
{
"epoch": 0.8666666666666667,
"grad_norm": 2.108222007751465,
"learning_rate": 2.1613635589349756e-06,
"loss": 4.3394,
"step": 780
},
{
"epoch": 0.8677777777777778,
"grad_norm": 3.263927459716797,
"learning_rate": 2.1260083037543817e-06,
"loss": 4.5798,
"step": 781
},
{
"epoch": 0.8688888888888889,
"grad_norm": 2.2533228397369385,
"learning_rate": 2.0909317609440095e-06,
"loss": 4.4232,
"step": 782
},
{
"epoch": 0.87,
"grad_norm": 3.333630323410034,
"learning_rate": 2.0561343579004715e-06,
"loss": 4.1046,
"step": 783
},
{
"epoch": 0.8711111111111111,
"grad_norm": 3.1644370555877686,
"learning_rate": 2.0216165186191407e-06,
"loss": 4.2146,
"step": 784
},
{
"epoch": 0.8722222222222222,
"grad_norm": 3.5359654426574707,
"learning_rate": 1.9873786636889906e-06,
"loss": 3.8819,
"step": 785
},
{
"epoch": 0.8733333333333333,
"grad_norm": 4.151428699493408,
"learning_rate": 1.95342121028749e-06,
"loss": 3.9319,
"step": 786
},
{
"epoch": 0.8744444444444445,
"grad_norm": 3.5409927368164062,
"learning_rate": 1.9197445721754776e-06,
"loss": 4.463,
"step": 787
},
{
"epoch": 0.8755555555555555,
"grad_norm": 2.9611496925354004,
"learning_rate": 1.8863491596921745e-06,
"loss": 3.84,
"step": 788
},
{
"epoch": 0.8766666666666667,
"grad_norm": 2.815295934677124,
"learning_rate": 1.8532353797501318e-06,
"loss": 4.3042,
"step": 789
},
{
"epoch": 0.8777777777777778,
"grad_norm": 3.664135456085205,
"learning_rate": 1.8204036358303173e-06,
"loss": 3.7069,
"step": 790
},
{
"epoch": 0.8788888888888889,
"grad_norm": 2.666962146759033,
"learning_rate": 1.787854327977162e-06,
"loss": 3.9498,
"step": 791
},
{
"epoch": 0.88,
"grad_norm": 3.5214672088623047,
"learning_rate": 1.7555878527937164e-06,
"loss": 4.15,
"step": 792
},
{
"epoch": 0.8811111111111111,
"grad_norm": 3.4584808349609375,
"learning_rate": 1.7236046034367958e-06,
"loss": 4.2487,
"step": 793
},
{
"epoch": 0.8822222222222222,
"grad_norm": 3.4059367179870605,
"learning_rate": 1.6919049696121958e-06,
"loss": 4.1058,
"step": 794
},
{
"epoch": 0.8833333333333333,
"grad_norm": 2.7756285667419434,
"learning_rate": 1.6604893375699594e-06,
"loss": 4.0675,
"step": 795
},
{
"epoch": 0.8844444444444445,
"grad_norm": 2.357132911682129,
"learning_rate": 1.629358090099639e-06,
"loss": 4.3934,
"step": 796
},
{
"epoch": 0.8855555555555555,
"grad_norm": 3.2852492332458496,
"learning_rate": 1.5985116065256684e-06,
"loss": 4.4737,
"step": 797
},
{
"epoch": 0.8866666666666667,
"grad_norm": 2.238274335861206,
"learning_rate": 1.5679502627027136e-06,
"loss": 4.4498,
"step": 798
},
{
"epoch": 0.8877777777777778,
"grad_norm": 2.4234659671783447,
"learning_rate": 1.5376744310111019e-06,
"loss": 4.5667,
"step": 799
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.9129526615142822,
"learning_rate": 1.5076844803522922e-06,
"loss": 4.4,
"step": 800
},
{
"epoch": 0.89,
"grad_norm": 2.525909900665283,
"learning_rate": 1.4779807761443636e-06,
"loss": 3.7143,
"step": 801
},
{
"epoch": 0.8911111111111111,
"grad_norm": 3.369392156600952,
"learning_rate": 1.4485636803175829e-06,
"loss": 3.9016,
"step": 802
},
{
"epoch": 0.8922222222222222,
"grad_norm": 3.97363543510437,
"learning_rate": 1.4194335513099761e-06,
"loss": 4.5144,
"step": 803
},
{
"epoch": 0.8933333333333333,
"grad_norm": 3.287062883377075,
"learning_rate": 1.3905907440629752e-06,
"loss": 3.7106,
"step": 804
},
{
"epoch": 0.8944444444444445,
"grad_norm": 3.274184465408325,
"learning_rate": 1.362035610017079e-06,
"loss": 4.6111,
"step": 805
},
{
"epoch": 0.8955555555555555,
"grad_norm": 2.367525815963745,
"learning_rate": 1.333768497107593e-06,
"loss": 4.1843,
"step": 806
},
{
"epoch": 0.8966666666666666,
"grad_norm": 3.559544324874878,
"learning_rate": 1.305789749760361e-06,
"loss": 4.0371,
"step": 807
},
{
"epoch": 0.8977777777777778,
"grad_norm": 2.5813517570495605,
"learning_rate": 1.2780997088875869e-06,
"loss": 4.2183,
"step": 808
},
{
"epoch": 0.8988888888888888,
"grad_norm": 3.533015012741089,
"learning_rate": 1.250698711883691e-06,
"loss": 4.4354,
"step": 809
},
{
"epoch": 0.9,
"grad_norm": 3.268846273422241,
"learning_rate": 1.2235870926211619e-06,
"loss": 4.167,
"step": 810
},
{
"epoch": 0.9011111111111111,
"grad_norm": 2.674999237060547,
"learning_rate": 1.1967651814465354e-06,
"loss": 4.5461,
"step": 811
},
{
"epoch": 0.9022222222222223,
"grad_norm": 2.9661448001861572,
"learning_rate": 1.170233305176327e-06,
"loss": 4.4819,
"step": 812
},
{
"epoch": 0.9033333333333333,
"grad_norm": 3.1719062328338623,
"learning_rate": 1.1439917870930793e-06,
"loss": 4.7351,
"step": 813
},
{
"epoch": 0.9044444444444445,
"grad_norm": 2.9525768756866455,
"learning_rate": 1.1180409469414094e-06,
"loss": 4.0522,
"step": 814
},
{
"epoch": 0.9055555555555556,
"grad_norm": 3.972069025039673,
"learning_rate": 1.0923811009241142e-06,
"loss": 3.8434,
"step": 815
},
{
"epoch": 0.9066666666666666,
"grad_norm": 2.4678852558135986,
"learning_rate": 1.067012561698319e-06,
"loss": 4.2084,
"step": 816
},
{
"epoch": 0.9077777777777778,
"grad_norm": 2.2604753971099854,
"learning_rate": 1.0419356383716688e-06,
"loss": 4.3375,
"step": 817
},
{
"epoch": 0.9088888888888889,
"grad_norm": 3.653369426727295,
"learning_rate": 1.0171506364985622e-06,
"loss": 4.2001,
"step": 818
},
{
"epoch": 0.91,
"grad_norm": 2.612713575363159,
"learning_rate": 9.926578580764234e-07,
"loss": 3.8888,
"step": 819
},
{
"epoch": 0.9111111111111111,
"grad_norm": 3.2685186862945557,
"learning_rate": 9.684576015420278e-07,
"loss": 3.8959,
"step": 820
},
{
"epoch": 0.9122222222222223,
"grad_norm": 3.556123733520508,
"learning_rate": 9.445501617678654e-07,
"loss": 3.9459,
"step": 821
},
{
"epoch": 0.9133333333333333,
"grad_norm": 5.377536773681641,
"learning_rate": 9.209358300585474e-07,
"loss": 3.5557,
"step": 822
},
{
"epoch": 0.9144444444444444,
"grad_norm": 2.2200653553009033,
"learning_rate": 8.976148941472501e-07,
"loss": 4.1984,
"step": 823
},
{
"epoch": 0.9155555555555556,
"grad_norm": 2.290778875350952,
"learning_rate": 8.745876381922147e-07,
"loss": 3.9423,
"step": 824
},
{
"epoch": 0.9166666666666666,
"grad_norm": 3.241596221923828,
"learning_rate": 8.51854342773295e-07,
"loss": 4.2498,
"step": 825
},
{
"epoch": 0.9177777777777778,
"grad_norm": 4.267802715301514,
"learning_rate": 8.294152848885157e-07,
"loss": 3.9039,
"step": 826
},
{
"epoch": 0.9188888888888889,
"grad_norm": 2.7669568061828613,
"learning_rate": 8.072707379507216e-07,
"loss": 4.0191,
"step": 827
},
{
"epoch": 0.92,
"grad_norm": 3.132392168045044,
"learning_rate": 7.854209717842231e-07,
"loss": 4.8574,
"step": 828
},
{
"epoch": 0.9211111111111111,
"grad_norm": 2.444495916366577,
"learning_rate": 7.638662526215284e-07,
"loss": 4.0902,
"step": 829
},
{
"epoch": 0.9222222222222223,
"grad_norm": 3.362197160720825,
"learning_rate": 7.426068431000882e-07,
"loss": 4.5238,
"step": 830
},
{
"epoch": 0.9233333333333333,
"grad_norm": 4.276552200317383,
"learning_rate": 7.216430022591008e-07,
"loss": 3.5991,
"step": 831
},
{
"epoch": 0.9244444444444444,
"grad_norm": 2.867725372314453,
"learning_rate": 7.009749855363456e-07,
"loss": 4.3119,
"step": 832
},
{
"epoch": 0.9255555555555556,
"grad_norm": 3.021606922149658,
"learning_rate": 6.806030447650879e-07,
"loss": 3.9519,
"step": 833
},
{
"epoch": 0.9266666666666666,
"grad_norm": 3.299363851547241,
"learning_rate": 6.605274281709928e-07,
"loss": 3.9062,
"step": 834
},
{
"epoch": 0.9277777777777778,
"grad_norm": 3.487799644470215,
"learning_rate": 6.407483803691216e-07,
"loss": 4.1561,
"step": 835
},
{
"epoch": 0.9288888888888889,
"grad_norm": 3.5065345764160156,
"learning_rate": 6.212661423609184e-07,
"loss": 3.9922,
"step": 836
},
{
"epoch": 0.93,
"grad_norm": 2.3422510623931885,
"learning_rate": 6.020809515313142e-07,
"loss": 4.5369,
"step": 837
},
{
"epoch": 0.9311111111111111,
"grad_norm": 3.0932629108428955,
"learning_rate": 5.83193041645802e-07,
"loss": 3.8776,
"step": 838
},
{
"epoch": 0.9322222222222222,
"grad_norm": 2.2441000938415527,
"learning_rate": 5.646026428476031e-07,
"loss": 4.301,
"step": 839
},
{
"epoch": 0.9333333333333333,
"grad_norm": 3.151946783065796,
"learning_rate": 5.463099816548579e-07,
"loss": 4.5223,
"step": 840
},
{
"epoch": 0.9344444444444444,
"grad_norm": 2.9941930770874023,
"learning_rate": 5.283152809578751e-07,
"loss": 4.2136,
"step": 841
},
{
"epoch": 0.9355555555555556,
"grad_norm": 3.5801138877868652,
"learning_rate": 5.106187600163987e-07,
"loss": 4.0182,
"step": 842
},
{
"epoch": 0.9366666666666666,
"grad_norm": 2.327622413635254,
"learning_rate": 4.932206344569562e-07,
"loss": 4.3929,
"step": 843
},
{
"epoch": 0.9377777777777778,
"grad_norm": 2.793179512023926,
"learning_rate": 4.7612111627021175e-07,
"loss": 4.1198,
"step": 844
},
{
"epoch": 0.9388888888888889,
"grad_norm": 2.1341638565063477,
"learning_rate": 4.5932041380840065e-07,
"loss": 4.2686,
"step": 845
},
{
"epoch": 0.94,
"grad_norm": 2.8225035667419434,
"learning_rate": 4.4281873178278475e-07,
"loss": 4.3253,
"step": 846
},
{
"epoch": 0.9411111111111111,
"grad_norm": 2.065812349319458,
"learning_rate": 4.26616271261146e-07,
"loss": 4.5926,
"step": 847
},
{
"epoch": 0.9422222222222222,
"grad_norm": 3.443786859512329,
"learning_rate": 4.107132296653549e-07,
"loss": 4.6865,
"step": 848
},
{
"epoch": 0.9433333333333334,
"grad_norm": 3.73835825920105,
"learning_rate": 3.95109800768953e-07,
"loss": 4.357,
"step": 849
},
{
"epoch": 0.9444444444444444,
"grad_norm": 3.801854372024536,
"learning_rate": 3.7980617469479953e-07,
"loss": 4.4399,
"step": 850
},
{
"epoch": 0.9455555555555556,
"grad_norm": 2.6715407371520996,
"learning_rate": 3.6480253791274786e-07,
"loss": 4.0495,
"step": 851
},
{
"epoch": 0.9466666666666667,
"grad_norm": 2.8497438430786133,
"learning_rate": 3.5009907323737825e-07,
"loss": 4.6042,
"step": 852
},
{
"epoch": 0.9477777777777778,
"grad_norm": 2.0767159461975098,
"learning_rate": 3.3569595982576583e-07,
"loss": 4.3431,
"step": 853
},
{
"epoch": 0.9488888888888889,
"grad_norm": 2.8185853958129883,
"learning_rate": 3.215933731753024e-07,
"loss": 4.2526,
"step": 854
},
{
"epoch": 0.95,
"grad_norm": 2.341989040374756,
"learning_rate": 3.077914851215585e-07,
"loss": 4.798,
"step": 855
},
{
"epoch": 0.9511111111111111,
"grad_norm": 2.7204387187957764,
"learning_rate": 2.942904638361804e-07,
"loss": 4.4965,
"step": 856
},
{
"epoch": 0.9522222222222222,
"grad_norm": 3.139683485031128,
"learning_rate": 2.810904738248549e-07,
"loss": 4.7001,
"step": 857
},
{
"epoch": 0.9533333333333334,
"grad_norm": 3.4762489795684814,
"learning_rate": 2.681916759252917e-07,
"loss": 4.2364,
"step": 858
},
{
"epoch": 0.9544444444444444,
"grad_norm": 3.4384474754333496,
"learning_rate": 2.555942273052753e-07,
"loss": 4.2242,
"step": 859
},
{
"epoch": 0.9555555555555556,
"grad_norm": 7.5540571212768555,
"learning_rate": 2.4329828146074095e-07,
"loss": 4.4938,
"step": 860
},
{
"epoch": 0.9566666666666667,
"grad_norm": 3.0467262268066406,
"learning_rate": 2.3130398821391007e-07,
"loss": 3.6816,
"step": 861
},
{
"epoch": 0.9577777777777777,
"grad_norm": 2.601795196533203,
"learning_rate": 2.1961149371145795e-07,
"loss": 4.1958,
"step": 862
},
{
"epoch": 0.9588888888888889,
"grad_norm": 4.4815216064453125,
"learning_rate": 2.0822094042274032e-07,
"loss": 3.994,
"step": 863
},
{
"epoch": 0.96,
"grad_norm": 3.6530420780181885,
"learning_rate": 1.9713246713805588e-07,
"loss": 4.3858,
"step": 864
},
{
"epoch": 0.9611111111111111,
"grad_norm": 3.1802806854248047,
"learning_rate": 1.8634620896695043e-07,
"loss": 4.0963,
"step": 865
},
{
"epoch": 0.9622222222222222,
"grad_norm": 3.68103289604187,
"learning_rate": 1.7586229733657644e-07,
"loss": 4.4613,
"step": 866
},
{
"epoch": 0.9633333333333334,
"grad_norm": 2.8591842651367188,
"learning_rate": 1.6568085999008888e-07,
"loss": 4.3585,
"step": 867
},
{
"epoch": 0.9644444444444444,
"grad_norm": 2.928494691848755,
"learning_rate": 1.5580202098509077e-07,
"loss": 4.3471,
"step": 868
},
{
"epoch": 0.9655555555555555,
"grad_norm": 2.9359610080718994,
"learning_rate": 1.4622590069211516e-07,
"loss": 4.5431,
"step": 869
},
{
"epoch": 0.9666666666666667,
"grad_norm": 2.3987393379211426,
"learning_rate": 1.3695261579316777e-07,
"loss": 3.9504,
"step": 870
},
{
"epoch": 0.9677777777777777,
"grad_norm": 2.556596040725708,
"learning_rate": 1.2798227928029482e-07,
"loss": 3.7707,
"step": 871
},
{
"epoch": 0.9688888888888889,
"grad_norm": 2.1739792823791504,
"learning_rate": 1.193150004542204e-07,
"loss": 4.1831,
"step": 872
},
{
"epoch": 0.97,
"grad_norm": 2.5820603370666504,
"learning_rate": 1.109508849230001e-07,
"loss": 4.4525,
"step": 873
},
{
"epoch": 0.9711111111111111,
"grad_norm": 4.120064735412598,
"learning_rate": 1.0289003460074165e-07,
"loss": 3.9264,
"step": 874
},
{
"epoch": 0.9722222222222222,
"grad_norm": 3.187326669692993,
"learning_rate": 9.513254770636137e-08,
"loss": 4.4163,
"step": 875
},
{
"epoch": 0.9733333333333334,
"grad_norm": 2.66953706741333,
"learning_rate": 8.767851876239074e-08,
"loss": 4.3193,
"step": 876
},
{
"epoch": 0.9744444444444444,
"grad_norm": 3.3307957649230957,
"learning_rate": 8.052803859382174e-08,
"loss": 4.6534,
"step": 877
},
{
"epoch": 0.9755555555555555,
"grad_norm": 3.617739200592041,
"learning_rate": 7.368119432699383e-08,
"loss": 4.4387,
"step": 878
},
{
"epoch": 0.9766666666666667,
"grad_norm": 2.623586654663086,
"learning_rate": 6.71380693885476e-08,
"loss": 4.3379,
"step": 879
},
{
"epoch": 0.9777777777777777,
"grad_norm": 3.5560302734375,
"learning_rate": 6.089874350439506e-08,
"loss": 4.536,
"step": 880
},
{
"epoch": 0.9788888888888889,
"grad_norm": 3.401707887649536,
"learning_rate": 5.496329269875089e-08,
"loss": 4.1187,
"step": 881
},
{
"epoch": 0.98,
"grad_norm": 2.7142131328582764,
"learning_rate": 4.9331789293211026e-08,
"loss": 4.5272,
"step": 882
},
{
"epoch": 0.9811111111111112,
"grad_norm": 3.471452236175537,
"learning_rate": 4.400430190586724e-08,
"loss": 4.081,
"step": 883
},
{
"epoch": 0.9822222222222222,
"grad_norm": 3.4180078506469727,
"learning_rate": 3.8980895450474455e-08,
"loss": 4.3884,
"step": 884
},
{
"epoch": 0.9833333333333333,
"grad_norm": 3.2058937549591064,
"learning_rate": 3.426163113565417e-08,
"loss": 4.178,
"step": 885
},
{
"epoch": 0.9844444444444445,
"grad_norm": 2.5083703994750977,
"learning_rate": 2.9846566464150626e-08,
"loss": 4.1643,
"step": 886
},
{
"epoch": 0.9855555555555555,
"grad_norm": 2.149996042251587,
"learning_rate": 2.5735755232134118e-08,
"loss": 4.0516,
"step": 887
},
{
"epoch": 0.9866666666666667,
"grad_norm": 3.1104636192321777,
"learning_rate": 2.192924752854042e-08,
"loss": 4.1596,
"step": 888
},
{
"epoch": 0.9877777777777778,
"grad_norm": 3.0319011211395264,
"learning_rate": 1.842708973447127e-08,
"loss": 4.1681,
"step": 889
},
{
"epoch": 0.9888888888888889,
"grad_norm": 2.075939893722534,
"learning_rate": 1.522932452260595e-08,
"loss": 4.1082,
"step": 890
},
{
"epoch": 0.99,
"grad_norm": 2.8124101161956787,
"learning_rate": 1.233599085671e-08,
"loss": 4.0637,
"step": 891
},
{
"epoch": 0.9911111111111112,
"grad_norm": 3.1139042377471924,
"learning_rate": 9.747123991141194e-09,
"loss": 3.8563,
"step": 892
},
{
"epoch": 0.9922222222222222,
"grad_norm": 2.973275661468506,
"learning_rate": 7.462755470422078e-09,
"loss": 4.8157,
"step": 893
},
{
"epoch": 0.9933333333333333,
"grad_norm": 3.155707359313965,
"learning_rate": 5.48291312886251e-09,
"loss": 4.6411,
"step": 894
},
{
"epoch": 0.9944444444444445,
"grad_norm": 4.164730548858643,
"learning_rate": 3.807621090218261e-09,
"loss": 3.6138,
"step": 895
},
{
"epoch": 0.9955555555555555,
"grad_norm": 2.4967424869537354,
"learning_rate": 2.4368997673940297e-09,
"loss": 3.9694,
"step": 896
},
{
"epoch": 0.9966666666666667,
"grad_norm": 2.2673118114471436,
"learning_rate": 1.3707658621964215e-09,
"loss": 4.1773,
"step": 897
},
{
"epoch": 0.9977777777777778,
"grad_norm": 2.343024730682373,
"learning_rate": 6.092323651313292e-10,
"loss": 4.2347,
"step": 898
},
{
"epoch": 0.9988888888888889,
"grad_norm": 3.1533758640289307,
"learning_rate": 1.5230855524017708e-10,
"loss": 4.2303,
"step": 899
},
{
"epoch": 1.0,
"grad_norm": 3.3015313148498535,
"learning_rate": 0.0,
"loss": 4.4286,
"step": 900
}
],
"logging_steps": 1,
"max_steps": 900,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0254817236897792e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}