{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.251700680272109,
"eval_steps": 184,
"global_step": 920,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013605442176870747,
"grad_norm": 0.24701461672354935,
"learning_rate": 1.36986301369863e-07,
"loss": 1.6736,
"step": 1
},
{
"epoch": 0.0013605442176870747,
"eval_loss": 1.7904456853866577,
"eval_runtime": 75.582,
"eval_samples_per_second": 53.888,
"eval_steps_per_second": 6.748,
"step": 1
},
{
"epoch": 0.0027210884353741495,
"grad_norm": 0.21437113339785932,
"learning_rate": 2.73972602739726e-07,
"loss": 1.5884,
"step": 2
},
{
"epoch": 0.004081632653061225,
"grad_norm": 0.3228668200940542,
"learning_rate": 4.1095890410958903e-07,
"loss": 1.6821,
"step": 3
},
{
"epoch": 0.005442176870748299,
"grad_norm": 0.19408831616689562,
"learning_rate": 5.47945205479452e-07,
"loss": 1.8146,
"step": 4
},
{
"epoch": 0.006802721088435374,
"grad_norm": 0.18446566294319683,
"learning_rate": 6.849315068493151e-07,
"loss": 1.6316,
"step": 5
},
{
"epoch": 0.00816326530612245,
"grad_norm": 0.26237580245842185,
"learning_rate": 8.219178082191781e-07,
"loss": 1.7544,
"step": 6
},
{
"epoch": 0.009523809523809525,
"grad_norm": 0.1659195721310037,
"learning_rate": 9.589041095890411e-07,
"loss": 1.8325,
"step": 7
},
{
"epoch": 0.010884353741496598,
"grad_norm": 0.14112003912821341,
"learning_rate": 1.095890410958904e-06,
"loss": 1.8533,
"step": 8
},
{
"epoch": 0.012244897959183673,
"grad_norm": 0.22295406766041573,
"learning_rate": 1.2328767123287673e-06,
"loss": 1.7309,
"step": 9
},
{
"epoch": 0.013605442176870748,
"grad_norm": 0.20855919407710727,
"learning_rate": 1.3698630136986302e-06,
"loss": 1.4983,
"step": 10
},
{
"epoch": 0.014965986394557823,
"grad_norm": 0.39634451341504184,
"learning_rate": 1.5068493150684932e-06,
"loss": 1.71,
"step": 11
},
{
"epoch": 0.0163265306122449,
"grad_norm": 0.2918296142957545,
"learning_rate": 1.6438356164383561e-06,
"loss": 1.6983,
"step": 12
},
{
"epoch": 0.017687074829931974,
"grad_norm": 0.3333249210865954,
"learning_rate": 1.7808219178082193e-06,
"loss": 1.6435,
"step": 13
},
{
"epoch": 0.01904761904761905,
"grad_norm": 0.3288930419026758,
"learning_rate": 1.9178082191780823e-06,
"loss": 1.9445,
"step": 14
},
{
"epoch": 0.02040816326530612,
"grad_norm": 0.3311742875918285,
"learning_rate": 2.0547945205479454e-06,
"loss": 1.8007,
"step": 15
},
{
"epoch": 0.021768707482993196,
"grad_norm": 0.24222843258421317,
"learning_rate": 2.191780821917808e-06,
"loss": 1.8698,
"step": 16
},
{
"epoch": 0.02312925170068027,
"grad_norm": 0.2863215351075517,
"learning_rate": 2.3287671232876713e-06,
"loss": 1.8295,
"step": 17
},
{
"epoch": 0.024489795918367346,
"grad_norm": 0.37000991286313667,
"learning_rate": 2.4657534246575345e-06,
"loss": 1.7748,
"step": 18
},
{
"epoch": 0.02585034013605442,
"grad_norm": 0.305345665951125,
"learning_rate": 2.6027397260273973e-06,
"loss": 1.7799,
"step": 19
},
{
"epoch": 0.027210884353741496,
"grad_norm": 0.276577651886119,
"learning_rate": 2.7397260273972604e-06,
"loss": 1.5892,
"step": 20
},
{
"epoch": 0.02857142857142857,
"grad_norm": 0.40747672861545675,
"learning_rate": 2.876712328767123e-06,
"loss": 1.797,
"step": 21
},
{
"epoch": 0.029931972789115645,
"grad_norm": 0.1663214297242309,
"learning_rate": 3.0136986301369864e-06,
"loss": 1.8254,
"step": 22
},
{
"epoch": 0.031292517006802724,
"grad_norm": 0.34875514252556655,
"learning_rate": 3.1506849315068495e-06,
"loss": 1.5967,
"step": 23
},
{
"epoch": 0.0326530612244898,
"grad_norm": 0.31488445168418,
"learning_rate": 3.2876712328767123e-06,
"loss": 1.8033,
"step": 24
},
{
"epoch": 0.034013605442176874,
"grad_norm": 0.9585107293220959,
"learning_rate": 3.4246575342465754e-06,
"loss": 1.9985,
"step": 25
},
{
"epoch": 0.03537414965986395,
"grad_norm": 0.4719659909416967,
"learning_rate": 3.5616438356164386e-06,
"loss": 1.6673,
"step": 26
},
{
"epoch": 0.036734693877551024,
"grad_norm": 0.5206398105101208,
"learning_rate": 3.6986301369863014e-06,
"loss": 1.7832,
"step": 27
},
{
"epoch": 0.0380952380952381,
"grad_norm": 0.5525391513084628,
"learning_rate": 3.8356164383561645e-06,
"loss": 1.8033,
"step": 28
},
{
"epoch": 0.03945578231292517,
"grad_norm": 0.5864368554335787,
"learning_rate": 3.972602739726027e-06,
"loss": 1.637,
"step": 29
},
{
"epoch": 0.04081632653061224,
"grad_norm": 0.18211390682659326,
"learning_rate": 4.109589041095891e-06,
"loss": 1.6996,
"step": 30
},
{
"epoch": 0.04217687074829932,
"grad_norm": 0.26324481615027445,
"learning_rate": 4.246575342465754e-06,
"loss": 1.7077,
"step": 31
},
{
"epoch": 0.04353741496598639,
"grad_norm": 0.487665052197852,
"learning_rate": 4.383561643835616e-06,
"loss": 1.5757,
"step": 32
},
{
"epoch": 0.044897959183673466,
"grad_norm": 0.5110956602957011,
"learning_rate": 4.52054794520548e-06,
"loss": 1.6525,
"step": 33
},
{
"epoch": 0.04625850340136054,
"grad_norm": 0.41488349790070234,
"learning_rate": 4.657534246575343e-06,
"loss": 1.7469,
"step": 34
},
{
"epoch": 0.047619047619047616,
"grad_norm": 0.3205549447320179,
"learning_rate": 4.7945205479452054e-06,
"loss": 1.6621,
"step": 35
},
{
"epoch": 0.04897959183673469,
"grad_norm": 0.2759804237950767,
"learning_rate": 4.931506849315069e-06,
"loss": 1.8364,
"step": 36
},
{
"epoch": 0.050340136054421766,
"grad_norm": 0.4070079284746193,
"learning_rate": 5.068493150684932e-06,
"loss": 1.7928,
"step": 37
},
{
"epoch": 0.05170068027210884,
"grad_norm": 0.3162452736080499,
"learning_rate": 5.2054794520547945e-06,
"loss": 1.8174,
"step": 38
},
{
"epoch": 0.053061224489795916,
"grad_norm": 0.339190852848117,
"learning_rate": 5.342465753424658e-06,
"loss": 1.8372,
"step": 39
},
{
"epoch": 0.05442176870748299,
"grad_norm": 0.31599130496764827,
"learning_rate": 5.479452054794521e-06,
"loss": 1.7265,
"step": 40
},
{
"epoch": 0.055782312925170066,
"grad_norm": 0.18290357316608127,
"learning_rate": 5.6164383561643845e-06,
"loss": 1.7055,
"step": 41
},
{
"epoch": 0.05714285714285714,
"grad_norm": 0.26450493295787797,
"learning_rate": 5.753424657534246e-06,
"loss": 1.859,
"step": 42
},
{
"epoch": 0.058503401360544216,
"grad_norm": 0.2105468885683211,
"learning_rate": 5.89041095890411e-06,
"loss": 1.7903,
"step": 43
},
{
"epoch": 0.05986394557823129,
"grad_norm": 0.21904274744659627,
"learning_rate": 6.027397260273973e-06,
"loss": 1.7112,
"step": 44
},
{
"epoch": 0.061224489795918366,
"grad_norm": 0.2766631664495227,
"learning_rate": 6.164383561643836e-06,
"loss": 1.626,
"step": 45
},
{
"epoch": 0.06258503401360545,
"grad_norm": 0.27137304801321466,
"learning_rate": 6.301369863013699e-06,
"loss": 1.8546,
"step": 46
},
{
"epoch": 0.06394557823129252,
"grad_norm": 0.17562873404669305,
"learning_rate": 6.438356164383563e-06,
"loss": 1.8687,
"step": 47
},
{
"epoch": 0.0653061224489796,
"grad_norm": 0.23608638226381062,
"learning_rate": 6.5753424657534245e-06,
"loss": 1.5768,
"step": 48
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.12395160133391969,
"learning_rate": 6.712328767123288e-06,
"loss": 1.8217,
"step": 49
},
{
"epoch": 0.06802721088435375,
"grad_norm": 0.21069127406909471,
"learning_rate": 6.849315068493151e-06,
"loss": 1.7057,
"step": 50
},
{
"epoch": 0.06938775510204082,
"grad_norm": 0.17153884217244356,
"learning_rate": 6.9863013698630145e-06,
"loss": 1.9143,
"step": 51
},
{
"epoch": 0.0707482993197279,
"grad_norm": 0.3084343242877715,
"learning_rate": 7.123287671232877e-06,
"loss": 1.8398,
"step": 52
},
{
"epoch": 0.07210884353741497,
"grad_norm": 0.14644662918576262,
"learning_rate": 7.260273972602741e-06,
"loss": 1.6646,
"step": 53
},
{
"epoch": 0.07346938775510205,
"grad_norm": 0.3001793602079481,
"learning_rate": 7.397260273972603e-06,
"loss": 1.689,
"step": 54
},
{
"epoch": 0.07482993197278912,
"grad_norm": 0.301851334470962,
"learning_rate": 7.534246575342466e-06,
"loss": 1.5179,
"step": 55
},
{
"epoch": 0.0761904761904762,
"grad_norm": 0.33200247196496224,
"learning_rate": 7.671232876712329e-06,
"loss": 1.8986,
"step": 56
},
{
"epoch": 0.07755102040816327,
"grad_norm": 0.18181195505623798,
"learning_rate": 7.808219178082192e-06,
"loss": 1.6426,
"step": 57
},
{
"epoch": 0.07891156462585033,
"grad_norm": 0.12250708549849011,
"learning_rate": 7.945205479452055e-06,
"loss": 1.6214,
"step": 58
},
{
"epoch": 0.08027210884353742,
"grad_norm": 0.09796847494385076,
"learning_rate": 8.082191780821919e-06,
"loss": 1.6547,
"step": 59
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.12998919923759888,
"learning_rate": 8.219178082191782e-06,
"loss": 1.7818,
"step": 60
},
{
"epoch": 0.08299319727891157,
"grad_norm": 0.2260386111575877,
"learning_rate": 8.356164383561644e-06,
"loss": 1.7807,
"step": 61
},
{
"epoch": 0.08435374149659863,
"grad_norm": 0.33754760373428094,
"learning_rate": 8.493150684931507e-06,
"loss": 1.617,
"step": 62
},
{
"epoch": 0.08571428571428572,
"grad_norm": 0.35962963555168737,
"learning_rate": 8.63013698630137e-06,
"loss": 1.6799,
"step": 63
},
{
"epoch": 0.08707482993197278,
"grad_norm": 0.32506967541048193,
"learning_rate": 8.767123287671233e-06,
"loss": 1.6454,
"step": 64
},
{
"epoch": 0.08843537414965986,
"grad_norm": 0.21523079823600388,
"learning_rate": 8.904109589041097e-06,
"loss": 1.8856,
"step": 65
},
{
"epoch": 0.08979591836734693,
"grad_norm": 0.5363358811064897,
"learning_rate": 9.04109589041096e-06,
"loss": 1.6952,
"step": 66
},
{
"epoch": 0.09115646258503401,
"grad_norm": 0.14306066721600327,
"learning_rate": 9.178082191780823e-06,
"loss": 1.8208,
"step": 67
},
{
"epoch": 0.09251700680272108,
"grad_norm": 0.18646957264381078,
"learning_rate": 9.315068493150685e-06,
"loss": 1.7517,
"step": 68
},
{
"epoch": 0.09387755102040816,
"grad_norm": 0.19137982075531637,
"learning_rate": 9.452054794520548e-06,
"loss": 1.6456,
"step": 69
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.15987203027468555,
"learning_rate": 9.589041095890411e-06,
"loss": 1.7148,
"step": 70
},
{
"epoch": 0.09659863945578231,
"grad_norm": 0.16311504243422864,
"learning_rate": 9.726027397260275e-06,
"loss": 1.6627,
"step": 71
},
{
"epoch": 0.09795918367346938,
"grad_norm": 0.10186314299964105,
"learning_rate": 9.863013698630138e-06,
"loss": 1.5856,
"step": 72
},
{
"epoch": 0.09931972789115646,
"grad_norm": 0.13469761876363148,
"learning_rate": 1e-05,
"loss": 1.6557,
"step": 73
},
{
"epoch": 0.10068027210884353,
"grad_norm": 0.11568418682806415,
"learning_rate": 9.999987357098372e-06,
"loss": 1.7807,
"step": 74
},
{
"epoch": 0.10204081632653061,
"grad_norm": 0.11288388506482096,
"learning_rate": 9.999949428457423e-06,
"loss": 1.8232,
"step": 75
},
{
"epoch": 0.10340136054421768,
"grad_norm": 0.16329859637421754,
"learning_rate": 9.999886214268967e-06,
"loss": 1.7462,
"step": 76
},
{
"epoch": 0.10476190476190476,
"grad_norm": 0.20231664635671653,
"learning_rate": 9.999797714852686e-06,
"loss": 1.5938,
"step": 77
},
{
"epoch": 0.10612244897959183,
"grad_norm": 0.34538065180937266,
"learning_rate": 9.999683930656135e-06,
"loss": 1.8806,
"step": 78
},
{
"epoch": 0.10748299319727891,
"grad_norm": 0.13354157904043504,
"learning_rate": 9.999544862254743e-06,
"loss": 1.801,
"step": 79
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.13220305876865404,
"learning_rate": 9.999380510351796e-06,
"loss": 1.6805,
"step": 80
},
{
"epoch": 0.11020408163265306,
"grad_norm": 0.13768110879863274,
"learning_rate": 9.999190875778452e-06,
"loss": 1.7481,
"step": 81
},
{
"epoch": 0.11156462585034013,
"grad_norm": 0.11222690770456831,
"learning_rate": 9.998975959493722e-06,
"loss": 1.7894,
"step": 82
},
{
"epoch": 0.11292517006802721,
"grad_norm": 0.11775170157819592,
"learning_rate": 9.998735762584471e-06,
"loss": 1.8592,
"step": 83
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.20855277686570553,
"learning_rate": 9.998470286265415e-06,
"loss": 1.7145,
"step": 84
},
{
"epoch": 0.11564625850340136,
"grad_norm": 0.10682809945125131,
"learning_rate": 9.998179531879112e-06,
"loss": 1.7563,
"step": 85
},
{
"epoch": 0.11700680272108843,
"grad_norm": 0.1332681057101403,
"learning_rate": 9.99786350089595e-06,
"loss": 1.6698,
"step": 86
},
{
"epoch": 0.11836734693877551,
"grad_norm": 0.1442352006249483,
"learning_rate": 9.99752219491415e-06,
"loss": 1.542,
"step": 87
},
{
"epoch": 0.11972789115646258,
"grad_norm": 0.09723976872539679,
"learning_rate": 9.997155615659753e-06,
"loss": 1.5545,
"step": 88
},
{
"epoch": 0.12108843537414966,
"grad_norm": 0.15078850009122496,
"learning_rate": 9.996763764986606e-06,
"loss": 1.6872,
"step": 89
},
{
"epoch": 0.12244897959183673,
"grad_norm": 0.09880013032692718,
"learning_rate": 9.996346644876363e-06,
"loss": 1.5761,
"step": 90
},
{
"epoch": 0.12380952380952381,
"grad_norm": 0.1797981570168221,
"learning_rate": 9.995904257438467e-06,
"loss": 1.5885,
"step": 91
},
{
"epoch": 0.1251700680272109,
"grad_norm": 0.14066405347976094,
"learning_rate": 9.995436604910142e-06,
"loss": 1.7558,
"step": 92
},
{
"epoch": 0.12653061224489795,
"grad_norm": 0.2804984380485241,
"learning_rate": 9.994943689656381e-06,
"loss": 1.5653,
"step": 93
},
{
"epoch": 0.12789115646258503,
"grad_norm": 0.09802426112688165,
"learning_rate": 9.994425514169938e-06,
"loss": 1.8666,
"step": 94
},
{
"epoch": 0.1292517006802721,
"grad_norm": 0.2640163991220947,
"learning_rate": 9.993882081071307e-06,
"loss": 1.8331,
"step": 95
},
{
"epoch": 0.1306122448979592,
"grad_norm": 0.12584718580988416,
"learning_rate": 9.99331339310872e-06,
"loss": 1.7264,
"step": 96
},
{
"epoch": 0.13197278911564625,
"grad_norm": 0.11723300893007116,
"learning_rate": 9.99271945315812e-06,
"loss": 1.774,
"step": 97
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.11104245778454394,
"learning_rate": 9.992100264223156e-06,
"loss": 1.7154,
"step": 98
},
{
"epoch": 0.1346938775510204,
"grad_norm": 0.0915644970371204,
"learning_rate": 9.99145582943517e-06,
"loss": 1.6768,
"step": 99
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.11971918094721708,
"learning_rate": 9.990786152053169e-06,
"loss": 1.895,
"step": 100
},
{
"epoch": 0.13741496598639455,
"grad_norm": 0.13849974347702929,
"learning_rate": 9.99009123546382e-06,
"loss": 1.9232,
"step": 101
},
{
"epoch": 0.13877551020408163,
"grad_norm": 0.0832290902024341,
"learning_rate": 9.98937108318143e-06,
"loss": 1.419,
"step": 102
},
{
"epoch": 0.1401360544217687,
"grad_norm": 0.09490309244168035,
"learning_rate": 9.988625698847921e-06,
"loss": 1.6096,
"step": 103
},
{
"epoch": 0.1414965986394558,
"grad_norm": 0.08634281151584555,
"learning_rate": 9.987855086232824e-06,
"loss": 1.6766,
"step": 104
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.12657846070776754,
"learning_rate": 9.98705924923325e-06,
"loss": 1.7755,
"step": 105
},
{
"epoch": 0.14421768707482993,
"grad_norm": 0.1730231080244019,
"learning_rate": 9.986238191873874e-06,
"loss": 1.671,
"step": 106
},
{
"epoch": 0.145578231292517,
"grad_norm": 0.11653855558191023,
"learning_rate": 9.985391918306915e-06,
"loss": 1.6012,
"step": 107
},
{
"epoch": 0.1469387755102041,
"grad_norm": 0.09868922955378823,
"learning_rate": 9.984520432812117e-06,
"loss": 1.8218,
"step": 108
},
{
"epoch": 0.14829931972789115,
"grad_norm": 0.08718149041105193,
"learning_rate": 9.983623739796718e-06,
"loss": 1.6361,
"step": 109
},
{
"epoch": 0.14965986394557823,
"grad_norm": 0.08536190731319725,
"learning_rate": 9.982701843795441e-06,
"loss": 1.8356,
"step": 110
},
{
"epoch": 0.1510204081632653,
"grad_norm": 0.1778419657439268,
"learning_rate": 9.981754749470463e-06,
"loss": 1.6968,
"step": 111
},
{
"epoch": 0.1523809523809524,
"grad_norm": 0.12982223254146993,
"learning_rate": 9.980782461611391e-06,
"loss": 1.8005,
"step": 112
},
{
"epoch": 0.15374149659863945,
"grad_norm": 0.08982117932691205,
"learning_rate": 9.979784985135239e-06,
"loss": 1.7645,
"step": 113
},
{
"epoch": 0.15510204081632653,
"grad_norm": 0.12460716696891104,
"learning_rate": 9.978762325086408e-06,
"loss": 1.6455,
"step": 114
},
{
"epoch": 0.1564625850340136,
"grad_norm": 0.09362932823935477,
"learning_rate": 9.977714486636657e-06,
"loss": 1.8083,
"step": 115
},
{
"epoch": 0.15782312925170067,
"grad_norm": 0.09099536634076917,
"learning_rate": 9.976641475085067e-06,
"loss": 1.7776,
"step": 116
},
{
"epoch": 0.15918367346938775,
"grad_norm": 0.08568595730791906,
"learning_rate": 9.975543295858035e-06,
"loss": 1.8846,
"step": 117
},
{
"epoch": 0.16054421768707483,
"grad_norm": 0.1310404323604523,
"learning_rate": 9.974419954509225e-06,
"loss": 1.5725,
"step": 118
},
{
"epoch": 0.1619047619047619,
"grad_norm": 0.11863021260862251,
"learning_rate": 9.97327145671956e-06,
"loss": 1.6409,
"step": 119
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.11864941995819639,
"learning_rate": 9.972097808297174e-06,
"loss": 1.7081,
"step": 120
},
{
"epoch": 0.16462585034013605,
"grad_norm": 0.08013610894171046,
"learning_rate": 9.970899015177398e-06,
"loss": 1.7804,
"step": 121
},
{
"epoch": 0.16598639455782313,
"grad_norm": 0.12399055095582327,
"learning_rate": 9.969675083422719e-06,
"loss": 1.6848,
"step": 122
},
{
"epoch": 0.1673469387755102,
"grad_norm": 0.1433779964353759,
"learning_rate": 9.96842601922276e-06,
"loss": 1.6888,
"step": 123
},
{
"epoch": 0.16870748299319727,
"grad_norm": 0.09915990901687576,
"learning_rate": 9.967151828894234e-06,
"loss": 1.7802,
"step": 124
},
{
"epoch": 0.17006802721088435,
"grad_norm": 0.10206449162778881,
"learning_rate": 9.965852518880931e-06,
"loss": 1.806,
"step": 125
},
{
"epoch": 0.17142857142857143,
"grad_norm": 0.09282143748721522,
"learning_rate": 9.964528095753669e-06,
"loss": 1.5987,
"step": 126
},
{
"epoch": 0.1727891156462585,
"grad_norm": 0.16907020113729054,
"learning_rate": 9.963178566210268e-06,
"loss": 1.7569,
"step": 127
},
{
"epoch": 0.17414965986394557,
"grad_norm": 0.08207848199751772,
"learning_rate": 9.961803937075516e-06,
"loss": 1.6724,
"step": 128
},
{
"epoch": 0.17551020408163265,
"grad_norm": 0.07319670048822571,
"learning_rate": 9.960404215301133e-06,
"loss": 1.7498,
"step": 129
},
{
"epoch": 0.17687074829931973,
"grad_norm": 0.08159880339274488,
"learning_rate": 9.958979407965738e-06,
"loss": 1.65,
"step": 130
},
{
"epoch": 0.1782312925170068,
"grad_norm": 0.09730054828361595,
"learning_rate": 9.95752952227481e-06,
"loss": 1.7796,
"step": 131
},
{
"epoch": 0.17959183673469387,
"grad_norm": 0.2629732363287427,
"learning_rate": 9.956054565560653e-06,
"loss": 1.6904,
"step": 132
},
{
"epoch": 0.18095238095238095,
"grad_norm": 0.07651749890098045,
"learning_rate": 9.954554545282363e-06,
"loss": 1.7809,
"step": 133
},
{
"epoch": 0.18231292517006803,
"grad_norm": 0.09628395581138101,
"learning_rate": 9.953029469025777e-06,
"loss": 1.8135,
"step": 134
},
{
"epoch": 0.1836734693877551,
"grad_norm": 0.09612376832963275,
"learning_rate": 9.951479344503459e-06,
"loss": 1.6617,
"step": 135
},
{
"epoch": 0.18503401360544217,
"grad_norm": 0.08107993061371403,
"learning_rate": 9.949904179554632e-06,
"loss": 1.6634,
"step": 136
},
{
"epoch": 0.18639455782312925,
"grad_norm": 0.07754512965459885,
"learning_rate": 9.94830398214516e-06,
"loss": 1.7732,
"step": 137
},
{
"epoch": 0.18775510204081633,
"grad_norm": 0.07265030754659244,
"learning_rate": 9.946678760367498e-06,
"loss": 1.7905,
"step": 138
},
{
"epoch": 0.1891156462585034,
"grad_norm": 0.09088517967487394,
"learning_rate": 9.945028522440654e-06,
"loss": 1.49,
"step": 139
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.21999926224724559,
"learning_rate": 9.943353276710146e-06,
"loss": 2.0726,
"step": 140
},
{
"epoch": 0.19183673469387755,
"grad_norm": 0.07397509235485085,
"learning_rate": 9.941653031647963e-06,
"loss": 1.6069,
"step": 141
},
{
"epoch": 0.19319727891156463,
"grad_norm": 0.17678430730401373,
"learning_rate": 9.939927795852513e-06,
"loss": 1.8128,
"step": 142
},
{
"epoch": 0.1945578231292517,
"grad_norm": 0.09311447920236875,
"learning_rate": 9.938177578048593e-06,
"loss": 1.682,
"step": 143
},
{
"epoch": 0.19591836734693877,
"grad_norm": 0.08923483853542422,
"learning_rate": 9.936402387087339e-06,
"loss": 1.7808,
"step": 144
},
{
"epoch": 0.19727891156462585,
"grad_norm": 0.3457260004062318,
"learning_rate": 9.93460223194617e-06,
"loss": 1.921,
"step": 145
},
{
"epoch": 0.19863945578231293,
"grad_norm": 0.11272086420065035,
"learning_rate": 9.932777121728765e-06,
"loss": 1.627,
"step": 146
},
{
"epoch": 0.2,
"grad_norm": 0.0828154138118513,
"learning_rate": 9.930927065664997e-06,
"loss": 1.85,
"step": 147
},
{
"epoch": 0.20136054421768707,
"grad_norm": 0.10871781486388528,
"learning_rate": 9.929052073110897e-06,
"loss": 1.8526,
"step": 148
},
{
"epoch": 0.20272108843537415,
"grad_norm": 0.08372164326475892,
"learning_rate": 9.927152153548605e-06,
"loss": 1.6184,
"step": 149
},
{
"epoch": 0.20408163265306123,
"grad_norm": 0.2301749352348319,
"learning_rate": 9.925227316586316e-06,
"loss": 1.6416,
"step": 150
},
{
"epoch": 0.2054421768707483,
"grad_norm": 0.11534866479323268,
"learning_rate": 9.923277571958245e-06,
"loss": 1.6587,
"step": 151
},
{
"epoch": 0.20680272108843537,
"grad_norm": 0.1411655046905855,
"learning_rate": 9.921302929524561e-06,
"loss": 1.671,
"step": 152
},
{
"epoch": 0.20816326530612245,
"grad_norm": 0.07211757999248616,
"learning_rate": 9.919303399271348e-06,
"loss": 1.7163,
"step": 153
},
{
"epoch": 0.20952380952380953,
"grad_norm": 0.0873746156242924,
"learning_rate": 9.917278991310553e-06,
"loss": 1.6367,
"step": 154
},
{
"epoch": 0.2108843537414966,
"grad_norm": 0.0819591281688772,
"learning_rate": 9.915229715879928e-06,
"loss": 1.6989,
"step": 155
},
{
"epoch": 0.21224489795918366,
"grad_norm": 0.08552981032847369,
"learning_rate": 9.913155583342994e-06,
"loss": 1.5244,
"step": 156
},
{
"epoch": 0.21360544217687075,
"grad_norm": 0.13550974122069206,
"learning_rate": 9.91105660418897e-06,
"loss": 1.7495,
"step": 157
},
{
"epoch": 0.21496598639455783,
"grad_norm": 0.07091163304804983,
"learning_rate": 9.908932789032729e-06,
"loss": 1.7387,
"step": 158
},
{
"epoch": 0.2163265306122449,
"grad_norm": 0.0838103140533003,
"learning_rate": 9.906784148614745e-06,
"loss": 1.7076,
"step": 159
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.11349611508198672,
"learning_rate": 9.904610693801042e-06,
"loss": 1.6596,
"step": 160
},
{
"epoch": 0.21904761904761905,
"grad_norm": 0.07733122749252737,
"learning_rate": 9.902412435583127e-06,
"loss": 1.6503,
"step": 161
},
{
"epoch": 0.22040816326530613,
"grad_norm": 0.14625572340923682,
"learning_rate": 9.900189385077948e-06,
"loss": 1.564,
"step": 162
},
{
"epoch": 0.2217687074829932,
"grad_norm": 0.09712144690644532,
"learning_rate": 9.897941553527823e-06,
"loss": 1.7217,
"step": 163
},
{
"epoch": 0.22312925170068026,
"grad_norm": 0.0712274015908157,
"learning_rate": 9.895668952300403e-06,
"loss": 1.6412,
"step": 164
},
{
"epoch": 0.22448979591836735,
"grad_norm": 0.08811945291100708,
"learning_rate": 9.893371592888594e-06,
"loss": 1.6192,
"step": 165
},
{
"epoch": 0.22585034013605443,
"grad_norm": 0.07563751954927482,
"learning_rate": 9.891049486910513e-06,
"loss": 1.6283,
"step": 166
},
{
"epoch": 0.2272108843537415,
"grad_norm": 0.07473029887668768,
"learning_rate": 9.888702646109423e-06,
"loss": 1.6979,
"step": 167
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.07966661835478112,
"learning_rate": 9.886331082353673e-06,
"loss": 1.6951,
"step": 168
},
{
"epoch": 0.22993197278911565,
"grad_norm": 0.08625904148958655,
"learning_rate": 9.883934807636645e-06,
"loss": 1.6239,
"step": 169
},
{
"epoch": 0.23129251700680273,
"grad_norm": 0.06615144618602906,
"learning_rate": 9.881513834076683e-06,
"loss": 1.7456,
"step": 170
},
{
"epoch": 0.23265306122448978,
"grad_norm": 0.14491038831608893,
"learning_rate": 9.87906817391704e-06,
"loss": 1.7035,
"step": 171
},
{
"epoch": 0.23401360544217686,
"grad_norm": 0.0832300629302243,
"learning_rate": 9.876597839525814e-06,
"loss": 1.6672,
"step": 172
},
{
"epoch": 0.23537414965986395,
"grad_norm": 0.0917489076009908,
"learning_rate": 9.87410284339588e-06,
"loss": 1.6075,
"step": 173
},
{
"epoch": 0.23673469387755103,
"grad_norm": 0.06866742872418008,
"learning_rate": 9.871583198144836e-06,
"loss": 1.7646,
"step": 174
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.08465371920438951,
"learning_rate": 9.869038916514932e-06,
"loss": 1.6692,
"step": 175
},
{
"epoch": 0.23945578231292516,
"grad_norm": 0.09375415555940526,
"learning_rate": 9.866470011373009e-06,
"loss": 1.778,
"step": 176
},
{
"epoch": 0.24081632653061225,
"grad_norm": 0.07220432655814331,
"learning_rate": 9.863876495710433e-06,
"loss": 1.6857,
"step": 177
},
{
"epoch": 0.24217687074829933,
"grad_norm": 0.0797192184377915,
"learning_rate": 9.86125838264303e-06,
"loss": 1.7893,
"step": 178
},
{
"epoch": 0.24353741496598638,
"grad_norm": 0.07597718520214916,
"learning_rate": 9.858615685411018e-06,
"loss": 1.8848,
"step": 179
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.08003681020814803,
"learning_rate": 9.85594841737894e-06,
"loss": 1.8,
"step": 180
},
{
"epoch": 0.24625850340136055,
"grad_norm": 0.09696384289585193,
"learning_rate": 9.853256592035602e-06,
"loss": 1.7965,
"step": 181
},
{
"epoch": 0.24761904761904763,
"grad_norm": 0.12333580747104468,
"learning_rate": 9.850540222993994e-06,
"loss": 1.6365,
"step": 182
},
{
"epoch": 0.24897959183673468,
"grad_norm": 0.07310272321033273,
"learning_rate": 9.847799323991234e-06,
"loss": 1.5765,
"step": 183
},
{
"epoch": 0.2503401360544218,
"grad_norm": 0.12923131777808997,
"learning_rate": 9.845033908888485e-06,
"loss": 1.8017,
"step": 184
},
{
"epoch": 0.2503401360544218,
"eval_loss": 1.7241544723510742,
"eval_runtime": 76.6185,
"eval_samples_per_second": 53.159,
"eval_steps_per_second": 6.656,
"step": 184
},
{
"epoch": 0.25170068027210885,
"grad_norm": 0.06809715986288661,
"learning_rate": 9.842243991670899e-06,
"loss": 1.79,
"step": 185
},
{
"epoch": 0.2530612244897959,
"grad_norm": 0.08842474286261379,
"learning_rate": 9.839429586447534e-06,
"loss": 1.6168,
"step": 186
},
{
"epoch": 0.254421768707483,
"grad_norm": 0.10935159810036835,
"learning_rate": 9.836590707451287e-06,
"loss": 1.8505,
"step": 187
},
{
"epoch": 0.25578231292517006,
"grad_norm": 0.12270237138661655,
"learning_rate": 9.833727369038827e-06,
"loss": 1.635,
"step": 188
},
{
"epoch": 0.2571428571428571,
"grad_norm": 0.10719376109260877,
"learning_rate": 9.830839585690519e-06,
"loss": 1.8374,
"step": 189
},
{
"epoch": 0.2585034013605442,
"grad_norm": 0.07990417660478581,
"learning_rate": 9.827927372010343e-06,
"loss": 1.5681,
"step": 190
},
{
"epoch": 0.2598639455782313,
"grad_norm": 0.09668140137221073,
"learning_rate": 9.824990742725835e-06,
"loss": 1.6568,
"step": 191
},
{
"epoch": 0.2612244897959184,
"grad_norm": 0.07968515377548961,
"learning_rate": 9.822029712687999e-06,
"loss": 1.6007,
"step": 192
},
{
"epoch": 0.26258503401360545,
"grad_norm": 0.08837508173810749,
"learning_rate": 9.81904429687124e-06,
"loss": 1.6621,
"step": 193
},
{
"epoch": 0.2639455782312925,
"grad_norm": 0.08728025707185973,
"learning_rate": 9.816034510373287e-06,
"loss": 1.8335,
"step": 194
},
{
"epoch": 0.2653061224489796,
"grad_norm": 0.07961309839255727,
"learning_rate": 9.81300036841511e-06,
"loss": 1.7037,
"step": 195
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.09980119114286523,
"learning_rate": 9.809941886340854e-06,
"loss": 1.5664,
"step": 196
},
{
"epoch": 0.2680272108843537,
"grad_norm": 0.07147953268098678,
"learning_rate": 9.806859079617757e-06,
"loss": 1.7601,
"step": 197
},
{
"epoch": 0.2693877551020408,
"grad_norm": 0.08653388534305975,
"learning_rate": 9.803751963836065e-06,
"loss": 1.54,
"step": 198
},
{
"epoch": 0.2707482993197279,
"grad_norm": 0.0776685121413518,
"learning_rate": 9.800620554708962e-06,
"loss": 1.5557,
"step": 199
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.07711567735740478,
"learning_rate": 9.797464868072489e-06,
"loss": 1.7101,
"step": 200
},
{
"epoch": 0.27346938775510204,
"grad_norm": 0.09355853262847387,
"learning_rate": 9.794284919885456e-06,
"loss": 1.7454,
"step": 201
},
{
"epoch": 0.2748299319727891,
"grad_norm": 0.0975587232648776,
"learning_rate": 9.791080726229376e-06,
"loss": 1.7479,
"step": 202
},
{
"epoch": 0.2761904761904762,
"grad_norm": 0.07709180794261607,
"learning_rate": 9.78785230330837e-06,
"loss": 1.8086,
"step": 203
},
{
"epoch": 0.27755102040816326,
"grad_norm": 0.09748041740615765,
"learning_rate": 9.784599667449088e-06,
"loss": 1.683,
"step": 204
},
{
"epoch": 0.2789115646258503,
"grad_norm": 0.09608384188874226,
"learning_rate": 9.781322835100639e-06,
"loss": 1.7985,
"step": 205
},
{
"epoch": 0.2802721088435374,
"grad_norm": 0.24417626356607502,
"learning_rate": 9.778021822834484e-06,
"loss": 1.721,
"step": 206
},
{
"epoch": 0.2816326530612245,
"grad_norm": 0.08000490559142356,
"learning_rate": 9.774696647344376e-06,
"loss": 1.5646,
"step": 207
},
{
"epoch": 0.2829931972789116,
"grad_norm": 0.08448656773098678,
"learning_rate": 9.771347325446261e-06,
"loss": 1.7897,
"step": 208
},
{
"epoch": 0.28435374149659864,
"grad_norm": 0.09777185072102372,
"learning_rate": 9.767973874078196e-06,
"loss": 1.8829,
"step": 209
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.07490737419370372,
"learning_rate": 9.764576310300268e-06,
"loss": 1.8031,
"step": 210
},
{
"epoch": 0.2870748299319728,
"grad_norm": 0.07567900709987288,
"learning_rate": 9.761154651294505e-06,
"loss": 1.752,
"step": 211
},
{
"epoch": 0.28843537414965986,
"grad_norm": 0.08322325294858353,
"learning_rate": 9.757708914364784e-06,
"loss": 1.6328,
"step": 212
},
{
"epoch": 0.2897959183673469,
"grad_norm": 0.19949313188011264,
"learning_rate": 9.75423911693675e-06,
"loss": 1.8577,
"step": 213
},
{
"epoch": 0.291156462585034,
"grad_norm": 0.07497191387563905,
"learning_rate": 9.750745276557725e-06,
"loss": 1.4911,
"step": 214
},
{
"epoch": 0.2925170068027211,
"grad_norm": 0.09096003185892962,
"learning_rate": 9.747227410896624e-06,
"loss": 1.5857,
"step": 215
},
{
"epoch": 0.2938775510204082,
"grad_norm": 0.06790968657114778,
"learning_rate": 9.743685537743856e-06,
"loss": 1.6452,
"step": 216
},
{
"epoch": 0.29523809523809524,
"grad_norm": 0.08487437350333037,
"learning_rate": 9.740119675011246e-06,
"loss": 1.674,
"step": 217
},
{
"epoch": 0.2965986394557823,
"grad_norm": 0.07405858389177783,
"learning_rate": 9.73652984073193e-06,
"loss": 1.7461,
"step": 218
},
{
"epoch": 0.2979591836734694,
"grad_norm": 0.07067520018576251,
"learning_rate": 9.73291605306028e-06,
"loss": 1.7163,
"step": 219
},
{
"epoch": 0.29931972789115646,
"grad_norm": 0.0791420166635673,
"learning_rate": 9.7292783302718e-06,
"loss": 1.6668,
"step": 220
},
{
"epoch": 0.3006802721088435,
"grad_norm": 0.07323313348575836,
"learning_rate": 9.72561669076304e-06,
"loss": 1.6398,
"step": 221
},
{
"epoch": 0.3020408163265306,
"grad_norm": 0.07634813609350796,
"learning_rate": 9.721931153051497e-06,
"loss": 1.6447,
"step": 222
},
{
"epoch": 0.3034013605442177,
"grad_norm": 0.07950888254230533,
"learning_rate": 9.718221735775527e-06,
"loss": 1.7845,
"step": 223
},
{
"epoch": 0.3047619047619048,
"grad_norm": 0.1408011781580588,
"learning_rate": 9.714488457694252e-06,
"loss": 1.7427,
"step": 224
},
{
"epoch": 0.30612244897959184,
"grad_norm": 0.20352696620915875,
"learning_rate": 9.710731337687457e-06,
"loss": 1.7789,
"step": 225
},
{
"epoch": 0.3074829931972789,
"grad_norm": 0.3632349628769343,
"learning_rate": 9.7069503947555e-06,
"loss": 1.7108,
"step": 226
},
{
"epoch": 0.308843537414966,
"grad_norm": 0.08426345577870951,
"learning_rate": 9.70314564801922e-06,
"loss": 1.5991,
"step": 227
},
{
"epoch": 0.31020408163265306,
"grad_norm": 0.07487391878859809,
"learning_rate": 9.699317116719831e-06,
"loss": 1.6637,
"step": 228
},
{
"epoch": 0.3115646258503401,
"grad_norm": 0.11408922719163124,
"learning_rate": 9.695464820218829e-06,
"loss": 1.734,
"step": 229
},
{
"epoch": 0.3129251700680272,
"grad_norm": 0.08379918487601977,
"learning_rate": 9.6915887779979e-06,
"loss": 1.5813,
"step": 230
},
{
"epoch": 0.3142857142857143,
"grad_norm": 0.08052280305893883,
"learning_rate": 9.68768900965881e-06,
"loss": 1.7164,
"step": 231
},
{
"epoch": 0.31564625850340133,
"grad_norm": 0.09154693865406958,
"learning_rate": 9.683765534923315e-06,
"loss": 1.5906,
"step": 232
},
{
"epoch": 0.31700680272108844,
"grad_norm": 0.1156976576631727,
"learning_rate": 9.679818373633054e-06,
"loss": 1.5045,
"step": 233
},
{
"epoch": 0.3183673469387755,
"grad_norm": 0.08364003073959134,
"learning_rate": 9.67584754574946e-06,
"loss": 1.6152,
"step": 234
},
{
"epoch": 0.3197278911564626,
"grad_norm": 0.15089067639580192,
"learning_rate": 9.671853071353645e-06,
"loss": 1.7127,
"step": 235
},
{
"epoch": 0.32108843537414966,
"grad_norm": 0.08835383872875176,
"learning_rate": 9.667834970646309e-06,
"loss": 1.609,
"step": 236
},
{
"epoch": 0.3224489795918367,
"grad_norm": 0.08056944247122935,
"learning_rate": 9.663793263947631e-06,
"loss": 1.6126,
"step": 237
},
{
"epoch": 0.3238095238095238,
"grad_norm": 0.09989875040942321,
"learning_rate": 9.659727971697173e-06,
"loss": 1.8035,
"step": 238
},
{
"epoch": 0.3251700680272109,
"grad_norm": 0.07493814768096231,
"learning_rate": 9.655639114453771e-06,
"loss": 1.813,
"step": 239
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.0879835131172389,
"learning_rate": 9.651526712895431e-06,
"loss": 1.6926,
"step": 240
},
{
"epoch": 0.32789115646258504,
"grad_norm": 0.1315730713741491,
"learning_rate": 9.647390787819232e-06,
"loss": 1.6993,
"step": 241
},
{
"epoch": 0.3292517006802721,
"grad_norm": 0.08144482513159658,
"learning_rate": 9.643231360141205e-06,
"loss": 1.5821,
"step": 242
},
{
"epoch": 0.3306122448979592,
"grad_norm": 0.11213233561578728,
"learning_rate": 9.639048450896251e-06,
"loss": 1.6491,
"step": 243
},
{
"epoch": 0.33197278911564626,
"grad_norm": 0.10094097099195502,
"learning_rate": 9.63484208123801e-06,
"loss": 1.5492,
"step": 244
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.06549445299823263,
"learning_rate": 9.630612272438771e-06,
"loss": 1.6378,
"step": 245
},
{
"epoch": 0.3346938775510204,
"grad_norm": 0.09866738594204459,
"learning_rate": 9.626359045889356e-06,
"loss": 1.6712,
"step": 246
},
{
"epoch": 0.3360544217687075,
"grad_norm": 0.08348510716160104,
"learning_rate": 9.622082423099013e-06,
"loss": 1.6177,
"step": 247
},
{
"epoch": 0.33741496598639453,
"grad_norm": 0.0912979636888496,
"learning_rate": 9.617782425695314e-06,
"loss": 1.7233,
"step": 248
},
{
"epoch": 0.33877551020408164,
"grad_norm": 0.0730054447204009,
"learning_rate": 9.613459075424033e-06,
"loss": 1.8876,
"step": 249
},
{
"epoch": 0.3401360544217687,
"grad_norm": 0.07708564338244163,
"learning_rate": 9.609112394149052e-06,
"loss": 1.5562,
"step": 250
},
{
"epoch": 0.3414965986394558,
"grad_norm": 0.09288554194206548,
"learning_rate": 9.604742403852232e-06,
"loss": 1.7512,
"step": 251
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.09029592521774728,
"learning_rate": 9.600349126633317e-06,
"loss": 1.4964,
"step": 252
},
{
"epoch": 0.3442176870748299,
"grad_norm": 0.09268667220104135,
"learning_rate": 9.595932584709815e-06,
"loss": 1.5166,
"step": 253
},
{
"epoch": 0.345578231292517,
"grad_norm": 0.09895745636981777,
"learning_rate": 9.59149280041689e-06,
"loss": 1.5862,
"step": 254
},
{
"epoch": 0.3469387755102041,
"grad_norm": 0.0918028367563306,
"learning_rate": 9.587029796207246e-06,
"loss": 1.6704,
"step": 255
},
{
"epoch": 0.34829931972789113,
"grad_norm": 0.08184214411162406,
"learning_rate": 9.582543594651006e-06,
"loss": 1.5767,
"step": 256
},
{
"epoch": 0.34965986394557824,
"grad_norm": 0.10380891219881462,
"learning_rate": 9.578034218435618e-06,
"loss": 1.7974,
"step": 257
},
{
"epoch": 0.3510204081632653,
"grad_norm": 0.07763979596015386,
"learning_rate": 9.573501690365718e-06,
"loss": 1.6754,
"step": 258
},
{
"epoch": 0.3523809523809524,
"grad_norm": 0.0974676538104556,
"learning_rate": 9.568946033363032e-06,
"loss": 1.7312,
"step": 259
},
{
"epoch": 0.35374149659863946,
"grad_norm": 0.08571433404959614,
"learning_rate": 9.564367270466247e-06,
"loss": 1.5805,
"step": 260
},
{
"epoch": 0.3551020408163265,
"grad_norm": 0.0818876254257716,
"learning_rate": 9.559765424830903e-06,
"loss": 1.7883,
"step": 261
},
{
"epoch": 0.3564625850340136,
"grad_norm": 0.0803950448066755,
"learning_rate": 9.555140519729273e-06,
"loss": 1.7474,
"step": 262
},
{
"epoch": 0.3578231292517007,
"grad_norm": 0.0843722550488927,
"learning_rate": 9.550492578550246e-06,
"loss": 1.5564,
"step": 263
},
{
"epoch": 0.35918367346938773,
"grad_norm": 0.07803324012301414,
"learning_rate": 9.545821624799205e-06,
"loss": 1.4879,
"step": 264
},
{
"epoch": 0.36054421768707484,
"grad_norm": 0.08256837435665124,
"learning_rate": 9.541127682097916e-06,
"loss": 1.7395,
"step": 265
},
{
"epoch": 0.3619047619047619,
"grad_norm": 0.08789940175813588,
"learning_rate": 9.536410774184397e-06,
"loss": 1.6602,
"step": 266
},
{
"epoch": 0.363265306122449,
"grad_norm": 0.10680679462820718,
"learning_rate": 9.531670924912814e-06,
"loss": 1.4675,
"step": 267
},
{
"epoch": 0.36462585034013606,
"grad_norm": 0.07750163404143413,
"learning_rate": 9.526908158253345e-06,
"loss": 1.7119,
"step": 268
},
{
"epoch": 0.3659863945578231,
"grad_norm": 0.07081517755227287,
"learning_rate": 9.522122498292066e-06,
"loss": 1.6457,
"step": 269
},
{
"epoch": 0.3673469387755102,
"grad_norm": 0.10501830739522407,
"learning_rate": 9.517313969230826e-06,
"loss": 1.6398,
"step": 270
},
{
"epoch": 0.3687074829931973,
"grad_norm": 0.08143325631533527,
"learning_rate": 9.512482595387131e-06,
"loss": 1.6122,
"step": 271
},
{
"epoch": 0.37006802721088433,
"grad_norm": 0.08749817297460229,
"learning_rate": 9.507628401194015e-06,
"loss": 1.7328,
"step": 272
},
{
"epoch": 0.37142857142857144,
"grad_norm": 0.10516442293102286,
"learning_rate": 9.50275141119992e-06,
"loss": 1.5773,
"step": 273
},
{
"epoch": 0.3727891156462585,
"grad_norm": 0.07808789214492048,
"learning_rate": 9.497851650068561e-06,
"loss": 1.6635,
"step": 274
},
{
"epoch": 0.3741496598639456,
"grad_norm": 0.17789525917278,
"learning_rate": 9.492929142578823e-06,
"loss": 1.9121,
"step": 275
},
{
"epoch": 0.37551020408163266,
"grad_norm": 0.11269638220817121,
"learning_rate": 9.487983913624616e-06,
"loss": 1.7355,
"step": 276
},
{
"epoch": 0.3768707482993197,
"grad_norm": 0.08409991854502387,
"learning_rate": 9.483015988214757e-06,
"loss": 1.7628,
"step": 277
},
{
"epoch": 0.3782312925170068,
"grad_norm": 0.07365655913118094,
"learning_rate": 9.478025391472841e-06,
"loss": 1.8144,
"step": 278
},
{
"epoch": 0.3795918367346939,
"grad_norm": 0.12778690022382183,
"learning_rate": 9.473012148637121e-06,
"loss": 1.4851,
"step": 279
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.0809734685514322,
"learning_rate": 9.467976285060369e-06,
"loss": 1.7698,
"step": 280
},
{
"epoch": 0.38231292517006804,
"grad_norm": 0.08513218534341072,
"learning_rate": 9.462917826209757e-06,
"loss": 1.6411,
"step": 281
},
{
"epoch": 0.3836734693877551,
"grad_norm": 0.09008342709764051,
"learning_rate": 9.457836797666722e-06,
"loss": 1.694,
"step": 282
},
{
"epoch": 0.38503401360544215,
"grad_norm": 0.10502121760541401,
"learning_rate": 9.452733225126845e-06,
"loss": 1.6999,
"step": 283
},
{
"epoch": 0.38639455782312926,
"grad_norm": 0.07343666145039363,
"learning_rate": 9.44760713439971e-06,
"loss": 1.8164,
"step": 284
},
{
"epoch": 0.3877551020408163,
"grad_norm": 0.08974960620233877,
"learning_rate": 9.442458551408784e-06,
"loss": 1.8539,
"step": 285
},
{
"epoch": 0.3891156462585034,
"grad_norm": 0.07202421778040075,
"learning_rate": 9.437287502191275e-06,
"loss": 1.5453,
"step": 286
},
{
"epoch": 0.3904761904761905,
"grad_norm": 0.09076340883522513,
"learning_rate": 9.43209401289801e-06,
"loss": 1.7088,
"step": 287
},
{
"epoch": 0.39183673469387753,
"grad_norm": 0.08425026537963505,
"learning_rate": 9.426878109793301e-06,
"loss": 1.4451,
"step": 288
},
{
"epoch": 0.39319727891156464,
"grad_norm": 0.09510405982528822,
"learning_rate": 9.421639819254806e-06,
"loss": 1.7913,
"step": 289
},
{
"epoch": 0.3945578231292517,
"grad_norm": 0.09616833843974483,
"learning_rate": 9.416379167773403e-06,
"loss": 1.649,
"step": 290
},
{
"epoch": 0.39591836734693875,
"grad_norm": 0.08286660774561835,
"learning_rate": 9.41109618195305e-06,
"loss": 1.9277,
"step": 291
},
{
"epoch": 0.39727891156462586,
"grad_norm": 0.07769697723139755,
"learning_rate": 9.405790888510655e-06,
"loss": 1.7279,
"step": 292
},
{
"epoch": 0.3986394557823129,
"grad_norm": 0.09006259560100753,
"learning_rate": 9.400463314275942e-06,
"loss": 1.6039,
"step": 293
},
{
"epoch": 0.4,
"grad_norm": 0.09012228611455256,
"learning_rate": 9.39511348619131e-06,
"loss": 1.7865,
"step": 294
},
{
"epoch": 0.4013605442176871,
"grad_norm": 0.09086893554478499,
"learning_rate": 9.389741431311694e-06,
"loss": 1.6225,
"step": 295
},
{
"epoch": 0.40272108843537413,
"grad_norm": 0.1067223732758171,
"learning_rate": 9.384347176804441e-06,
"loss": 1.8657,
"step": 296
},
{
"epoch": 0.40408163265306124,
"grad_norm": 0.09585514004003831,
"learning_rate": 9.378930749949166e-06,
"loss": 1.6826,
"step": 297
},
{
"epoch": 0.4054421768707483,
"grad_norm": 0.08230460323355868,
"learning_rate": 9.373492178137606e-06,
"loss": 1.8107,
"step": 298
},
{
"epoch": 0.40680272108843535,
"grad_norm": 0.08574613920900533,
"learning_rate": 9.368031488873492e-06,
"loss": 1.5687,
"step": 299
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.07911826770354155,
"learning_rate": 9.36254870977241e-06,
"loss": 1.8716,
"step": 300
},
{
"epoch": 0.4095238095238095,
"grad_norm": 0.08540782320726534,
"learning_rate": 9.357043868561653e-06,
"loss": 1.7997,
"step": 301
},
{
"epoch": 0.4108843537414966,
"grad_norm": 0.08850925770828139,
"learning_rate": 9.351516993080088e-06,
"loss": 1.6299,
"step": 302
},
{
"epoch": 0.4122448979591837,
"grad_norm": 0.10611011164886108,
"learning_rate": 9.34596811127801e-06,
"loss": 1.5621,
"step": 303
},
{
"epoch": 0.41360544217687073,
"grad_norm": 0.08382183266579103,
"learning_rate": 9.340397251217009e-06,
"loss": 1.4407,
"step": 304
},
{
"epoch": 0.41496598639455784,
"grad_norm": 0.09119095993947843,
"learning_rate": 9.334804441069819e-06,
"loss": 1.7161,
"step": 305
},
{
"epoch": 0.4163265306122449,
"grad_norm": 0.11113995697266435,
"learning_rate": 9.329189709120175e-06,
"loss": 1.4126,
"step": 306
},
{
"epoch": 0.41768707482993195,
"grad_norm": 0.08080147586730375,
"learning_rate": 9.323553083762681e-06,
"loss": 1.7303,
"step": 307
},
{
"epoch": 0.41904761904761906,
"grad_norm": 0.10411476203370171,
"learning_rate": 9.31789459350266e-06,
"loss": 1.6408,
"step": 308
},
{
"epoch": 0.4204081632653061,
"grad_norm": 0.0859822277343485,
"learning_rate": 9.312214266956003e-06,
"loss": 1.6534,
"step": 309
},
{
"epoch": 0.4217687074829932,
"grad_norm": 0.08604098070322257,
"learning_rate": 9.306512132849035e-06,
"loss": 1.6252,
"step": 310
},
{
"epoch": 0.4231292517006803,
"grad_norm": 0.0876425921649466,
"learning_rate": 9.300788220018363e-06,
"loss": 1.5096,
"step": 311
},
{
"epoch": 0.42448979591836733,
"grad_norm": 0.08371274563563173,
"learning_rate": 9.295042557410736e-06,
"loss": 1.7352,
"step": 312
},
{
"epoch": 0.42585034013605444,
"grad_norm": 0.13500827487489858,
"learning_rate": 9.28927517408289e-06,
"loss": 1.7902,
"step": 313
},
{
"epoch": 0.4272108843537415,
"grad_norm": 0.08754620765852711,
"learning_rate": 9.28348609920141e-06,
"loss": 1.6862,
"step": 314
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.1028946128162606,
"learning_rate": 9.27767536204258e-06,
"loss": 1.6907,
"step": 315
},
{
"epoch": 0.42993197278911566,
"grad_norm": 0.08692259223714764,
"learning_rate": 9.271842991992231e-06,
"loss": 1.638,
"step": 316
},
{
"epoch": 0.4312925170068027,
"grad_norm": 0.09542502295403,
"learning_rate": 9.26598901854559e-06,
"loss": 1.7157,
"step": 317
},
{
"epoch": 0.4326530612244898,
"grad_norm": 0.09080935991935338,
"learning_rate": 9.260113471307148e-06,
"loss": 1.5851,
"step": 318
},
{
"epoch": 0.4340136054421769,
"grad_norm": 0.09865728458566181,
"learning_rate": 9.254216379990487e-06,
"loss": 1.8897,
"step": 319
},
{
"epoch": 0.43537414965986393,
"grad_norm": 0.10060290482375228,
"learning_rate": 9.248297774418147e-06,
"loss": 1.5605,
"step": 320
},
{
"epoch": 0.43673469387755104,
"grad_norm": 0.09013488977054508,
"learning_rate": 9.242357684521467e-06,
"loss": 1.5582,
"step": 321
},
{
"epoch": 0.4380952380952381,
"grad_norm": 0.12978917401977683,
"learning_rate": 9.236396140340435e-06,
"loss": 1.5953,
"step": 322
},
{
"epoch": 0.43945578231292515,
"grad_norm": 0.09012186897343348,
"learning_rate": 9.230413172023538e-06,
"loss": 1.6678,
"step": 323
},
{
"epoch": 0.44081632653061226,
"grad_norm": 0.07610784588472097,
"learning_rate": 9.224408809827609e-06,
"loss": 1.6697,
"step": 324
},
{
"epoch": 0.4421768707482993,
"grad_norm": 0.08134414493064245,
"learning_rate": 9.218383084117671e-06,
"loss": 1.6543,
"step": 325
},
{
"epoch": 0.4435374149659864,
"grad_norm": 0.07969199857220131,
"learning_rate": 9.212336025366789e-06,
"loss": 1.7372,
"step": 326
},
{
"epoch": 0.4448979591836735,
"grad_norm": 0.08833397568245774,
"learning_rate": 9.206267664155906e-06,
"loss": 1.5033,
"step": 327
},
{
"epoch": 0.44625850340136053,
"grad_norm": 0.09013321784578471,
"learning_rate": 9.200178031173706e-06,
"loss": 1.7467,
"step": 328
},
{
"epoch": 0.44761904761904764,
"grad_norm": 0.08492129873211993,
"learning_rate": 9.194067157216436e-06,
"loss": 1.6346,
"step": 329
},
{
"epoch": 0.4489795918367347,
"grad_norm": 0.08924441822794496,
"learning_rate": 9.187935073187768e-06,
"loss": 1.5647,
"step": 330
},
{
"epoch": 0.45034013605442175,
"grad_norm": 0.2238269401441882,
"learning_rate": 9.181781810098638e-06,
"loss": 1.9641,
"step": 331
},
{
"epoch": 0.45170068027210886,
"grad_norm": 0.08505567098719835,
"learning_rate": 9.175607399067086e-06,
"loss": 1.723,
"step": 332
},
{
"epoch": 0.4530612244897959,
"grad_norm": 0.09479499636885363,
"learning_rate": 9.1694118713181e-06,
"loss": 1.7358,
"step": 333
},
{
"epoch": 0.454421768707483,
"grad_norm": 0.09532298638421347,
"learning_rate": 9.163195258183457e-06,
"loss": 1.652,
"step": 334
},
{
"epoch": 0.4557823129251701,
"grad_norm": 0.1435078453546247,
"learning_rate": 9.156957591101573e-06,
"loss": 1.8876,
"step": 335
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.08623460392050768,
"learning_rate": 9.150698901617326e-06,
"loss": 1.6408,
"step": 336
},
{
"epoch": 0.45850340136054424,
"grad_norm": 0.08103596406817255,
"learning_rate": 9.144419221381919e-06,
"loss": 1.582,
"step": 337
},
{
"epoch": 0.4598639455782313,
"grad_norm": 0.09624169989634061,
"learning_rate": 9.138118582152704e-06,
"loss": 1.7272,
"step": 338
},
{
"epoch": 0.46122448979591835,
"grad_norm": 0.06914577882609961,
"learning_rate": 9.131797015793026e-06,
"loss": 1.6864,
"step": 339
},
{
"epoch": 0.46258503401360546,
"grad_norm": 0.09682023357095138,
"learning_rate": 9.125454554272057e-06,
"loss": 1.5849,
"step": 340
},
{
"epoch": 0.4639455782312925,
"grad_norm": 0.08865054891775723,
"learning_rate": 9.119091229664648e-06,
"loss": 1.4716,
"step": 341
},
{
"epoch": 0.46530612244897956,
"grad_norm": 0.09112991799939003,
"learning_rate": 9.112707074151152e-06,
"loss": 1.6393,
"step": 342
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.09319883560181061,
"learning_rate": 9.106302120017272e-06,
"loss": 1.7619,
"step": 343
},
{
"epoch": 0.46802721088435373,
"grad_norm": 0.101061665585339,
"learning_rate": 9.099876399653885e-06,
"loss": 1.6286,
"step": 344
},
{
"epoch": 0.46938775510204084,
"grad_norm": 0.09755047037445551,
"learning_rate": 9.093429945556895e-06,
"loss": 1.6591,
"step": 345
},
{
"epoch": 0.4707482993197279,
"grad_norm": 0.0831755062746902,
"learning_rate": 9.086962790327057e-06,
"loss": 1.7728,
"step": 346
},
{
"epoch": 0.47210884353741495,
"grad_norm": 0.0981445280966388,
"learning_rate": 9.08047496666981e-06,
"loss": 1.6489,
"step": 347
},
{
"epoch": 0.47346938775510206,
"grad_norm": 0.0965447984049988,
"learning_rate": 9.073966507395123e-06,
"loss": 1.7807,
"step": 348
},
{
"epoch": 0.4748299319727891,
"grad_norm": 0.08805192634428297,
"learning_rate": 9.06743744541732e-06,
"loss": 1.6932,
"step": 349
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.07486129688956443,
"learning_rate": 9.060887813754914e-06,
"loss": 1.6828,
"step": 350
},
{
"epoch": 0.4775510204081633,
"grad_norm": 0.08844318223023526,
"learning_rate": 9.054317645530449e-06,
"loss": 1.5791,
"step": 351
},
{
"epoch": 0.47891156462585033,
"grad_norm": 0.08590901507611927,
"learning_rate": 9.047726973970317e-06,
"loss": 1.8916,
"step": 352
},
{
"epoch": 0.48027210884353744,
"grad_norm": 0.08275711528371654,
"learning_rate": 9.041115832404605e-06,
"loss": 1.6376,
"step": 353
},
{
"epoch": 0.4816326530612245,
"grad_norm": 0.13882142863507996,
"learning_rate": 9.03448425426692e-06,
"loss": 1.5496,
"step": 354
},
{
"epoch": 0.48299319727891155,
"grad_norm": 0.09033930791413211,
"learning_rate": 9.027832273094213e-06,
"loss": 1.8207,
"step": 355
},
{
"epoch": 0.48435374149659866,
"grad_norm": 0.08776380923196873,
"learning_rate": 9.021159922526623e-06,
"loss": 1.734,
"step": 356
},
{
"epoch": 0.4857142857142857,
"grad_norm": 0.0871797042611384,
"learning_rate": 9.014467236307303e-06,
"loss": 1.7255,
"step": 357
},
{
"epoch": 0.48707482993197276,
"grad_norm": 0.0832915611192237,
"learning_rate": 9.007754248282236e-06,
"loss": 1.6354,
"step": 358
},
{
"epoch": 0.4884353741496599,
"grad_norm": 0.11961492483949776,
"learning_rate": 9.001020992400086e-06,
"loss": 1.4193,
"step": 359
},
{
"epoch": 0.4897959183673469,
"grad_norm": 0.0794404046382169,
"learning_rate": 8.994267502712007e-06,
"loss": 1.728,
"step": 360
},
{
"epoch": 0.49115646258503404,
"grad_norm": 0.13335980644840648,
"learning_rate": 8.987493813371481e-06,
"loss": 1.6729,
"step": 361
},
{
"epoch": 0.4925170068027211,
"grad_norm": 0.08449457187355984,
"learning_rate": 8.980699958634147e-06,
"loss": 1.6142,
"step": 362
},
{
"epoch": 0.49387755102040815,
"grad_norm": 0.09270436024065673,
"learning_rate": 8.973885972857616e-06,
"loss": 1.6753,
"step": 363
},
{
"epoch": 0.49523809523809526,
"grad_norm": 0.08033630058062084,
"learning_rate": 8.96705189050131e-06,
"loss": 1.6269,
"step": 364
},
{
"epoch": 0.4965986394557823,
"grad_norm": 0.4381615422407444,
"learning_rate": 8.96019774612628e-06,
"loss": 1.6131,
"step": 365
},
{
"epoch": 0.49795918367346936,
"grad_norm": 0.09462169653835782,
"learning_rate": 8.953323574395038e-06,
"loss": 1.5629,
"step": 366
},
{
"epoch": 0.4993197278911565,
"grad_norm": 0.13595506352030012,
"learning_rate": 8.946429410071373e-06,
"loss": 1.5593,
"step": 367
},
{
"epoch": 0.5006802721088436,
"grad_norm": 0.10253342646979322,
"learning_rate": 8.939515288020182e-06,
"loss": 1.6281,
"step": 368
},
{
"epoch": 0.5006802721088436,
"eval_loss": 1.7046507596969604,
"eval_runtime": 76.5686,
"eval_samples_per_second": 53.194,
"eval_steps_per_second": 6.661,
"step": 368
},
{
"epoch": 0.5020408163265306,
"grad_norm": 0.08764026597495381,
"learning_rate": 8.932581243207289e-06,
"loss": 1.5909,
"step": 369
},
{
"epoch": 0.5034013605442177,
"grad_norm": 0.07898445718193145,
"learning_rate": 8.925627310699275e-06,
"loss": 1.761,
"step": 370
},
{
"epoch": 0.5047619047619047,
"grad_norm": 0.10577594540101636,
"learning_rate": 8.918653525663295e-06,
"loss": 1.695,
"step": 371
},
{
"epoch": 0.5061224489795918,
"grad_norm": 0.110620128076388,
"learning_rate": 8.911659923366897e-06,
"loss": 1.7043,
"step": 372
},
{
"epoch": 0.507482993197279,
"grad_norm": 0.08992699882848652,
"learning_rate": 8.904646539177852e-06,
"loss": 1.674,
"step": 373
},
{
"epoch": 0.508843537414966,
"grad_norm": 0.0971954506200858,
"learning_rate": 8.897613408563972e-06,
"loss": 1.6565,
"step": 374
},
{
"epoch": 0.5102040816326531,
"grad_norm": 0.14199688300843188,
"learning_rate": 8.89056056709293e-06,
"loss": 1.4402,
"step": 375
},
{
"epoch": 0.5115646258503401,
"grad_norm": 0.1034458287611268,
"learning_rate": 8.883488050432073e-06,
"loss": 1.6606,
"step": 376
},
{
"epoch": 0.5129251700680272,
"grad_norm": 0.09270052668783225,
"learning_rate": 8.87639589434826e-06,
"loss": 1.7067,
"step": 377
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.08063860406008341,
"learning_rate": 8.869284134707659e-06,
"loss": 1.7683,
"step": 378
},
{
"epoch": 0.5156462585034014,
"grad_norm": 0.08700140267251555,
"learning_rate": 8.862152807475584e-06,
"loss": 1.7135,
"step": 379
},
{
"epoch": 0.5170068027210885,
"grad_norm": 0.09374652236807894,
"learning_rate": 8.8550019487163e-06,
"loss": 1.5927,
"step": 380
},
{
"epoch": 0.5183673469387755,
"grad_norm": 0.09628602949564133,
"learning_rate": 8.847831594592851e-06,
"loss": 1.6169,
"step": 381
},
{
"epoch": 0.5197278911564626,
"grad_norm": 0.09133344115171436,
"learning_rate": 8.840641781366867e-06,
"loss": 1.6077,
"step": 382
},
{
"epoch": 0.5210884353741496,
"grad_norm": 0.0898264889709745,
"learning_rate": 8.83343254539839e-06,
"loss": 1.7476,
"step": 383
},
{
"epoch": 0.5224489795918368,
"grad_norm": 0.09463509154870942,
"learning_rate": 8.826203923145687e-06,
"loss": 1.6178,
"step": 384
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.086933970419064,
"learning_rate": 8.818955951165059e-06,
"loss": 1.6544,
"step": 385
},
{
"epoch": 0.5251700680272109,
"grad_norm": 0.08686966519278672,
"learning_rate": 8.811688666110663e-06,
"loss": 1.7268,
"step": 386
},
{
"epoch": 0.5265306122448979,
"grad_norm": 0.10219001251324068,
"learning_rate": 8.80440210473433e-06,
"loss": 1.6538,
"step": 387
},
{
"epoch": 0.527891156462585,
"grad_norm": 0.08199431219832622,
"learning_rate": 8.797096303885374e-06,
"loss": 1.6524,
"step": 388
},
{
"epoch": 0.5292517006802722,
"grad_norm": 0.09053864014656055,
"learning_rate": 8.789771300510397e-06,
"loss": 1.5971,
"step": 389
},
{
"epoch": 0.5306122448979592,
"grad_norm": 0.08221661934582344,
"learning_rate": 8.782427131653121e-06,
"loss": 1.6643,
"step": 390
},
{
"epoch": 0.5319727891156463,
"grad_norm": 0.08888879198967926,
"learning_rate": 8.77506383445419e-06,
"loss": 1.7855,
"step": 391
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.08638631079426494,
"learning_rate": 8.767681446150977e-06,
"loss": 1.8028,
"step": 392
},
{
"epoch": 0.5346938775510204,
"grad_norm": 0.07669664542559732,
"learning_rate": 8.76028000407741e-06,
"loss": 1.5725,
"step": 393
},
{
"epoch": 0.5360544217687074,
"grad_norm": 0.098633841165777,
"learning_rate": 8.752859545663766e-06,
"loss": 1.6692,
"step": 394
},
{
"epoch": 0.5374149659863946,
"grad_norm": 0.09002401371176637,
"learning_rate": 8.745420108436498e-06,
"loss": 1.7636,
"step": 395
},
{
"epoch": 0.5387755102040817,
"grad_norm": 0.0952379865665774,
"learning_rate": 8.737961730018034e-06,
"loss": 1.5664,
"step": 396
},
{
"epoch": 0.5401360544217687,
"grad_norm": 0.09623958141543322,
"learning_rate": 8.730484448126594e-06,
"loss": 1.5345,
"step": 397
},
{
"epoch": 0.5414965986394558,
"grad_norm": 0.1502927654981511,
"learning_rate": 8.722988300575992e-06,
"loss": 1.7841,
"step": 398
},
{
"epoch": 0.5428571428571428,
"grad_norm": 0.09222431588896472,
"learning_rate": 8.71547332527545e-06,
"loss": 1.9559,
"step": 399
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.0937414921537921,
"learning_rate": 8.707939560229406e-06,
"loss": 1.7022,
"step": 400
},
{
"epoch": 0.545578231292517,
"grad_norm": 0.11025083403359445,
"learning_rate": 8.700387043537319e-06,
"loss": 1.4365,
"step": 401
},
{
"epoch": 0.5469387755102041,
"grad_norm": 0.08441282686563067,
"learning_rate": 8.692815813393483e-06,
"loss": 1.6488,
"step": 402
},
{
"epoch": 0.5482993197278911,
"grad_norm": 0.3399602286969607,
"learning_rate": 8.68522590808682e-06,
"loss": 1.6829,
"step": 403
},
{
"epoch": 0.5496598639455782,
"grad_norm": 0.0924431892667502,
"learning_rate": 8.677617366000705e-06,
"loss": 1.7404,
"step": 404
},
{
"epoch": 0.5510204081632653,
"grad_norm": 0.09778253267340173,
"learning_rate": 8.669990225612754e-06,
"loss": 1.7674,
"step": 405
},
{
"epoch": 0.5523809523809524,
"grad_norm": 0.09471837896307483,
"learning_rate": 8.662344525494643e-06,
"loss": 1.6406,
"step": 406
},
{
"epoch": 0.5537414965986395,
"grad_norm": 0.08997298756417596,
"learning_rate": 8.654680304311908e-06,
"loss": 1.7875,
"step": 407
},
{
"epoch": 0.5551020408163265,
"grad_norm": 0.08849454584462031,
"learning_rate": 8.646997600823743e-06,
"loss": 1.5942,
"step": 408
},
{
"epoch": 0.5564625850340136,
"grad_norm": 0.10974758415682516,
"learning_rate": 8.639296453882816e-06,
"loss": 1.5229,
"step": 409
},
{
"epoch": 0.5578231292517006,
"grad_norm": 0.09669599431293863,
"learning_rate": 8.631576902435063e-06,
"loss": 1.7031,
"step": 410
},
{
"epoch": 0.5591836734693878,
"grad_norm": 0.10337335565704198,
"learning_rate": 8.623838985519498e-06,
"loss": 1.6138,
"step": 411
},
{
"epoch": 0.5605442176870749,
"grad_norm": 0.08963980024729686,
"learning_rate": 8.616082742268005e-06,
"loss": 1.6527,
"step": 412
},
{
"epoch": 0.5619047619047619,
"grad_norm": 0.13730489095006465,
"learning_rate": 8.608308211905159e-06,
"loss": 1.5823,
"step": 413
},
{
"epoch": 0.563265306122449,
"grad_norm": 0.08745105418294691,
"learning_rate": 8.600515433748003e-06,
"loss": 1.6647,
"step": 414
},
{
"epoch": 0.564625850340136,
"grad_norm": 0.19321095274209943,
"learning_rate": 8.592704447205872e-06,
"loss": 1.5218,
"step": 415
},
{
"epoch": 0.5659863945578232,
"grad_norm": 0.10518406439497321,
"learning_rate": 8.584875291780178e-06,
"loss": 1.5199,
"step": 416
},
{
"epoch": 0.5673469387755102,
"grad_norm": 0.09495102640071197,
"learning_rate": 8.577028007064218e-06,
"loss": 1.6623,
"step": 417
},
{
"epoch": 0.5687074829931973,
"grad_norm": 0.13232264409241218,
"learning_rate": 8.569162632742973e-06,
"loss": 1.606,
"step": 418
},
{
"epoch": 0.5700680272108843,
"grad_norm": 0.09382708523657708,
"learning_rate": 8.561279208592902e-06,
"loss": 1.6563,
"step": 419
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.10154097299186025,
"learning_rate": 8.553377774481748e-06,
"loss": 1.5177,
"step": 420
},
{
"epoch": 0.5727891156462585,
"grad_norm": 0.09008372077918206,
"learning_rate": 8.545458370368336e-06,
"loss": 1.5358,
"step": 421
},
{
"epoch": 0.5741496598639456,
"grad_norm": 0.11141318964471479,
"learning_rate": 8.53752103630236e-06,
"loss": 1.5975,
"step": 422
},
{
"epoch": 0.5755102040816327,
"grad_norm": 0.11594947645844504,
"learning_rate": 8.529565812424195e-06,
"loss": 1.5417,
"step": 423
},
{
"epoch": 0.5768707482993197,
"grad_norm": 0.1115243557715104,
"learning_rate": 8.521592738964689e-06,
"loss": 1.6912,
"step": 424
},
{
"epoch": 0.5782312925170068,
"grad_norm": 0.10294673717555994,
"learning_rate": 8.513601856244951e-06,
"loss": 1.6883,
"step": 425
},
{
"epoch": 0.5795918367346938,
"grad_norm": 0.12234481799540493,
"learning_rate": 8.505593204676162e-06,
"loss": 1.6903,
"step": 426
},
{
"epoch": 0.580952380952381,
"grad_norm": 0.1164390253206002,
"learning_rate": 8.497566824759359e-06,
"loss": 1.6433,
"step": 427
},
{
"epoch": 0.582312925170068,
"grad_norm": 0.10041564805248225,
"learning_rate": 8.489522757085234e-06,
"loss": 1.5482,
"step": 428
},
{
"epoch": 0.5836734693877551,
"grad_norm": 0.10770970938141446,
"learning_rate": 8.481461042333929e-06,
"loss": 1.6092,
"step": 429
},
{
"epoch": 0.5850340136054422,
"grad_norm": 0.08821548122605594,
"learning_rate": 8.473381721274832e-06,
"loss": 1.5793,
"step": 430
},
{
"epoch": 0.5863945578231292,
"grad_norm": 0.089232378501521,
"learning_rate": 8.465284834766365e-06,
"loss": 1.6233,
"step": 431
},
{
"epoch": 0.5877551020408164,
"grad_norm": 0.10250742081774146,
"learning_rate": 8.457170423755786e-06,
"loss": 1.5625,
"step": 432
},
{
"epoch": 0.5891156462585034,
"grad_norm": 0.1202037568410257,
"learning_rate": 8.449038529278976e-06,
"loss": 1.5843,
"step": 433
},
{
"epoch": 0.5904761904761905,
"grad_norm": 0.1323622148062186,
"learning_rate": 8.440889192460232e-06,
"loss": 1.808,
"step": 434
},
{
"epoch": 0.5918367346938775,
"grad_norm": 0.12495144537173114,
"learning_rate": 8.432722454512057e-06,
"loss": 1.9389,
"step": 435
},
{
"epoch": 0.5931972789115646,
"grad_norm": 0.09699121967438322,
"learning_rate": 8.424538356734957e-06,
"loss": 1.7367,
"step": 436
},
{
"epoch": 0.5945578231292517,
"grad_norm": 0.09801443283628167,
"learning_rate": 8.416336940517229e-06,
"loss": 1.6276,
"step": 437
},
{
"epoch": 0.5959183673469388,
"grad_norm": 0.10487807929341668,
"learning_rate": 8.408118247334755e-06,
"loss": 1.5578,
"step": 438
},
{
"epoch": 0.5972789115646259,
"grad_norm": 0.09364043239294455,
"learning_rate": 8.399882318750785e-06,
"loss": 1.5889,
"step": 439
},
{
"epoch": 0.5986394557823129,
"grad_norm": 0.10020651855110954,
"learning_rate": 8.391629196415733e-06,
"loss": 1.607,
"step": 440
},
{
"epoch": 0.6,
"grad_norm": 0.0899206896336364,
"learning_rate": 8.383358922066965e-06,
"loss": 1.5508,
"step": 441
},
{
"epoch": 0.601360544217687,
"grad_norm": 0.09009526459055692,
"learning_rate": 8.375071537528587e-06,
"loss": 1.6629,
"step": 442
},
{
"epoch": 0.6027210884353742,
"grad_norm": 0.09413295136140958,
"learning_rate": 8.366767084711232e-06,
"loss": 1.6568,
"step": 443
},
{
"epoch": 0.6040816326530613,
"grad_norm": 0.22548893033360673,
"learning_rate": 8.358445605611856e-06,
"loss": 1.7594,
"step": 444
},
{
"epoch": 0.6054421768707483,
"grad_norm": 0.12757276275079013,
"learning_rate": 8.350107142313513e-06,
"loss": 1.4311,
"step": 445
},
{
"epoch": 0.6068027210884354,
"grad_norm": 0.09965051028904835,
"learning_rate": 8.34175173698515e-06,
"loss": 1.6618,
"step": 446
},
{
"epoch": 0.6081632653061224,
"grad_norm": 0.09666753861412604,
"learning_rate": 8.333379431881398e-06,
"loss": 1.6729,
"step": 447
},
{
"epoch": 0.6095238095238096,
"grad_norm": 0.09235423529974252,
"learning_rate": 8.324990269342345e-06,
"loss": 1.7872,
"step": 448
},
{
"epoch": 0.6108843537414966,
"grad_norm": 0.0969813250335363,
"learning_rate": 8.316584291793337e-06,
"loss": 1.5299,
"step": 449
},
{
"epoch": 0.6122448979591837,
"grad_norm": 0.08468781546097646,
"learning_rate": 8.30816154174475e-06,
"loss": 1.7958,
"step": 450
},
{
"epoch": 0.6136054421768707,
"grad_norm": 0.08812390382328499,
"learning_rate": 8.299722061791788e-06,
"loss": 1.7292,
"step": 451
},
{
"epoch": 0.6149659863945578,
"grad_norm": 0.08809745700239427,
"learning_rate": 8.291265894614253e-06,
"loss": 1.758,
"step": 452
},
{
"epoch": 0.6163265306122448,
"grad_norm": 0.09490957390276747,
"learning_rate": 8.282793082976343e-06,
"loss": 1.5475,
"step": 453
},
{
"epoch": 0.617687074829932,
"grad_norm": 0.11060404799058053,
"learning_rate": 8.274303669726427e-06,
"loss": 1.6792,
"step": 454
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.09540512150005957,
"learning_rate": 8.265797697796831e-06,
"loss": 1.6685,
"step": 455
},
{
"epoch": 0.6204081632653061,
"grad_norm": 0.12873788369707956,
"learning_rate": 8.257275210203621e-06,
"loss": 1.5043,
"step": 456
},
{
"epoch": 0.6217687074829932,
"grad_norm": 0.6499840705042578,
"learning_rate": 8.248736250046389e-06,
"loss": 1.7548,
"step": 457
},
{
"epoch": 0.6231292517006802,
"grad_norm": 0.08484061508587025,
"learning_rate": 8.240180860508027e-06,
"loss": 1.8159,
"step": 458
},
{
"epoch": 0.6244897959183674,
"grad_norm": 0.09906794908825851,
"learning_rate": 8.231609084854513e-06,
"loss": 1.7116,
"step": 459
},
{
"epoch": 0.6258503401360545,
"grad_norm": 0.0939998149984381,
"learning_rate": 8.223020966434695e-06,
"loss": 1.7448,
"step": 460
},
{
"epoch": 0.6272108843537415,
"grad_norm": 0.09983102806108725,
"learning_rate": 8.214416548680065e-06,
"loss": 1.7284,
"step": 461
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.1276929064149538,
"learning_rate": 8.205795875104549e-06,
"loss": 1.5541,
"step": 462
},
{
"epoch": 0.6299319727891156,
"grad_norm": 0.0971007724405045,
"learning_rate": 8.197158989304277e-06,
"loss": 1.749,
"step": 463
},
{
"epoch": 0.6312925170068027,
"grad_norm": 0.09931797410533018,
"learning_rate": 8.188505934957368e-06,
"loss": 1.7908,
"step": 464
},
{
"epoch": 0.6326530612244898,
"grad_norm": 0.10420469743883767,
"learning_rate": 8.179836755823707e-06,
"loss": 1.7156,
"step": 465
},
{
"epoch": 0.6340136054421769,
"grad_norm": 0.09471973183090822,
"learning_rate": 8.171151495744726e-06,
"loss": 1.6598,
"step": 466
},
{
"epoch": 0.6353741496598639,
"grad_norm": 0.08008463828970452,
"learning_rate": 8.162450198643184e-06,
"loss": 1.8476,
"step": 467
},
{
"epoch": 0.636734693877551,
"grad_norm": 0.09336869405905367,
"learning_rate": 8.153732908522933e-06,
"loss": 1.677,
"step": 468
},
{
"epoch": 0.638095238095238,
"grad_norm": 0.10567938348565759,
"learning_rate": 8.144999669468714e-06,
"loss": 1.6987,
"step": 469
},
{
"epoch": 0.6394557823129252,
"grad_norm": 0.08460548198303047,
"learning_rate": 8.136250525645916e-06,
"loss": 1.8206,
"step": 470
},
{
"epoch": 0.6408163265306123,
"grad_norm": 0.0993816872028852,
"learning_rate": 8.127485521300366e-06,
"loss": 1.6618,
"step": 471
},
{
"epoch": 0.6421768707482993,
"grad_norm": 0.1043492135258874,
"learning_rate": 8.118704700758103e-06,
"loss": 1.641,
"step": 472
},
{
"epoch": 0.6435374149659864,
"grad_norm": 0.0880711093010387,
"learning_rate": 8.109908108425142e-06,
"loss": 1.8376,
"step": 473
},
{
"epoch": 0.6448979591836734,
"grad_norm": 0.09271039929379037,
"learning_rate": 8.101095788787266e-06,
"loss": 1.6914,
"step": 474
},
{
"epoch": 0.6462585034013606,
"grad_norm": 0.10199949996500506,
"learning_rate": 8.092267786409788e-06,
"loss": 1.6264,
"step": 475
},
{
"epoch": 0.6476190476190476,
"grad_norm": 0.09927764715965239,
"learning_rate": 8.08342414593734e-06,
"loss": 1.7495,
"step": 476
},
{
"epoch": 0.6489795918367347,
"grad_norm": 0.11011814904081185,
"learning_rate": 8.07456491209363e-06,
"loss": 1.8044,
"step": 477
},
{
"epoch": 0.6503401360544218,
"grad_norm": 0.10592895331787452,
"learning_rate": 8.065690129681224e-06,
"loss": 1.5279,
"step": 478
},
{
"epoch": 0.6517006802721088,
"grad_norm": 0.09546781848020944,
"learning_rate": 8.056799843581326e-06,
"loss": 1.5599,
"step": 479
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.09305244725756194,
"learning_rate": 8.04789409875354e-06,
"loss": 1.7328,
"step": 480
},
{
"epoch": 0.654421768707483,
"grad_norm": 0.10356673246511783,
"learning_rate": 8.038972940235647e-06,
"loss": 1.6317,
"step": 481
},
{
"epoch": 0.6557823129251701,
"grad_norm": 0.08355158958046802,
"learning_rate": 8.030036413143382e-06,
"loss": 1.823,
"step": 482
},
{
"epoch": 0.6571428571428571,
"grad_norm": 0.13429081910577273,
"learning_rate": 8.021084562670193e-06,
"loss": 1.765,
"step": 483
},
{
"epoch": 0.6585034013605442,
"grad_norm": 0.10977848927929611,
"learning_rate": 8.012117434087032e-06,
"loss": 1.7983,
"step": 484
},
{
"epoch": 0.6598639455782312,
"grad_norm": 0.10127247770991282,
"learning_rate": 8.003135072742106e-06,
"loss": 1.7146,
"step": 485
},
{
"epoch": 0.6612244897959184,
"grad_norm": 0.08420649737520919,
"learning_rate": 7.994137524060656e-06,
"loss": 1.7273,
"step": 486
},
{
"epoch": 0.6625850340136055,
"grad_norm": 0.12998243453204658,
"learning_rate": 7.985124833544737e-06,
"loss": 1.7116,
"step": 487
},
{
"epoch": 0.6639455782312925,
"grad_norm": 0.12572177154996375,
"learning_rate": 7.976097046772971e-06,
"loss": 1.5875,
"step": 488
},
{
"epoch": 0.6653061224489796,
"grad_norm": 0.09257333856558611,
"learning_rate": 7.967054209400325e-06,
"loss": 1.6259,
"step": 489
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.15599333098996487,
"learning_rate": 7.95799636715788e-06,
"loss": 1.6602,
"step": 490
},
{
"epoch": 0.6680272108843538,
"grad_norm": 0.09770606447991517,
"learning_rate": 7.948923565852597e-06,
"loss": 1.6867,
"step": 491
},
{
"epoch": 0.6693877551020408,
"grad_norm": 0.1303849167483497,
"learning_rate": 7.939835851367097e-06,
"loss": 1.6583,
"step": 492
},
{
"epoch": 0.6707482993197279,
"grad_norm": 0.10864319676746689,
"learning_rate": 7.930733269659405e-06,
"loss": 1.6832,
"step": 493
},
{
"epoch": 0.672108843537415,
"grad_norm": 0.11083699474291044,
"learning_rate": 7.921615866762743e-06,
"loss": 1.7117,
"step": 494
},
{
"epoch": 0.673469387755102,
"grad_norm": 0.13148294366762545,
"learning_rate": 7.912483688785281e-06,
"loss": 1.5234,
"step": 495
},
{
"epoch": 0.6748299319727891,
"grad_norm": 0.11365843655270903,
"learning_rate": 7.903336781909911e-06,
"loss": 1.7783,
"step": 496
},
{
"epoch": 0.6761904761904762,
"grad_norm": 0.12277794049039219,
"learning_rate": 7.89417519239401e-06,
"loss": 1.6908,
"step": 497
},
{
"epoch": 0.6775510204081633,
"grad_norm": 0.11413949336950495,
"learning_rate": 7.884998966569206e-06,
"loss": 1.5654,
"step": 498
},
{
"epoch": 0.6789115646258503,
"grad_norm": 0.10494845825770326,
"learning_rate": 7.87580815084115e-06,
"loss": 1.5383,
"step": 499
},
{
"epoch": 0.6802721088435374,
"grad_norm": 0.09020097927859841,
"learning_rate": 7.866602791689272e-06,
"loss": 1.596,
"step": 500
},
{
"epoch": 0.6816326530612244,
"grad_norm": 0.08585696598298209,
"learning_rate": 7.857382935666554e-06,
"loss": 1.7307,
"step": 501
},
{
"epoch": 0.6829931972789116,
"grad_norm": 0.1010505486947337,
"learning_rate": 7.848148629399287e-06,
"loss": 1.6699,
"step": 502
},
{
"epoch": 0.6843537414965987,
"grad_norm": 0.11201229130009283,
"learning_rate": 7.838899919586841e-06,
"loss": 1.6521,
"step": 503
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.10009251853964633,
"learning_rate": 7.82963685300143e-06,
"loss": 1.6464,
"step": 504
},
{
"epoch": 0.6870748299319728,
"grad_norm": 0.10676770701676057,
"learning_rate": 7.820359476487866e-06,
"loss": 1.4472,
"step": 505
},
{
"epoch": 0.6884353741496598,
"grad_norm": 0.10987902455707375,
"learning_rate": 7.811067836963337e-06,
"loss": 1.6637,
"step": 506
},
{
"epoch": 0.689795918367347,
"grad_norm": 0.10593286251613945,
"learning_rate": 7.801761981417152e-06,
"loss": 1.714,
"step": 507
},
{
"epoch": 0.691156462585034,
"grad_norm": 0.11171983777047709,
"learning_rate": 7.792441956910523e-06,
"loss": 1.5948,
"step": 508
},
{
"epoch": 0.6925170068027211,
"grad_norm": 0.10208550757531555,
"learning_rate": 7.783107810576306e-06,
"loss": 1.7267,
"step": 509
},
{
"epoch": 0.6938775510204082,
"grad_norm": 0.11315560578865806,
"learning_rate": 7.773759589618782e-06,
"loss": 1.5995,
"step": 510
},
{
"epoch": 0.6952380952380952,
"grad_norm": 0.10501036944532048,
"learning_rate": 7.764397341313403e-06,
"loss": 1.4624,
"step": 511
},
{
"epoch": 0.6965986394557823,
"grad_norm": 0.08619282794989483,
"learning_rate": 7.755021113006567e-06,
"loss": 1.7983,
"step": 512
},
{
"epoch": 0.6979591836734694,
"grad_norm": 0.09109224074659802,
"learning_rate": 7.745630952115365e-06,
"loss": 1.6753,
"step": 513
},
{
"epoch": 0.6993197278911565,
"grad_norm": 0.11332860880884088,
"learning_rate": 7.736226906127344e-06,
"loss": 1.7472,
"step": 514
},
{
"epoch": 0.7006802721088435,
"grad_norm": 0.11046061973302813,
"learning_rate": 7.726809022600284e-06,
"loss": 1.6219,
"step": 515
},
{
"epoch": 0.7020408163265306,
"grad_norm": 0.09720494400811149,
"learning_rate": 7.71737734916193e-06,
"loss": 1.7941,
"step": 516
},
{
"epoch": 0.7034013605442176,
"grad_norm": 0.13391377607878455,
"learning_rate": 7.70793193350977e-06,
"loss": 1.7904,
"step": 517
},
{
"epoch": 0.7047619047619048,
"grad_norm": 0.12625091217599146,
"learning_rate": 7.69847282341079e-06,
"loss": 1.663,
"step": 518
},
{
"epoch": 0.7061224489795919,
"grad_norm": 0.12406607142823292,
"learning_rate": 7.68900006670123e-06,
"loss": 1.6766,
"step": 519
},
{
"epoch": 0.7074829931972789,
"grad_norm": 0.09475581974666519,
"learning_rate": 7.679513711286338e-06,
"loss": 1.7449,
"step": 520
},
{
"epoch": 0.708843537414966,
"grad_norm": 0.12489019591501085,
"learning_rate": 7.670013805140143e-06,
"loss": 1.7526,
"step": 521
},
{
"epoch": 0.710204081632653,
"grad_norm": 0.09091443476585943,
"learning_rate": 7.660500396305194e-06,
"loss": 1.66,
"step": 522
},
{
"epoch": 0.7115646258503401,
"grad_norm": 0.0843896278465122,
"learning_rate": 7.650973532892325e-06,
"loss": 1.5741,
"step": 523
},
{
"epoch": 0.7129251700680272,
"grad_norm": 0.12842839378581072,
"learning_rate": 7.641433263080418e-06,
"loss": 1.5639,
"step": 524
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.12573336181400996,
"learning_rate": 7.631879635116152e-06,
"loss": 1.5261,
"step": 525
},
{
"epoch": 0.7156462585034014,
"grad_norm": 0.08357817960907306,
"learning_rate": 7.622312697313754e-06,
"loss": 1.624,
"step": 526
},
{
"epoch": 0.7170068027210884,
"grad_norm": 0.08826301097458839,
"learning_rate": 7.612732498054769e-06,
"loss": 1.7131,
"step": 527
},
{
"epoch": 0.7183673469387755,
"grad_norm": 0.1030553177752236,
"learning_rate": 7.603139085787801e-06,
"loss": 1.76,
"step": 528
},
{
"epoch": 0.7197278911564626,
"grad_norm": 0.10248942600565085,
"learning_rate": 7.5935325090282785e-06,
"loss": 1.6537,
"step": 529
},
{
"epoch": 0.7210884353741497,
"grad_norm": 0.10935378440777248,
"learning_rate": 7.583912816358203e-06,
"loss": 1.7441,
"step": 530
},
{
"epoch": 0.7224489795918367,
"grad_norm": 0.10474278332332895,
"learning_rate": 7.574280056425907e-06,
"loss": 1.5672,
"step": 531
},
{
"epoch": 0.7238095238095238,
"grad_norm": 0.10337823669221702,
"learning_rate": 7.564634277945803e-06,
"loss": 1.7301,
"step": 532
},
{
"epoch": 0.7251700680272108,
"grad_norm": 0.1021232974428954,
"learning_rate": 7.554975529698143e-06,
"loss": 1.8401,
"step": 533
},
{
"epoch": 0.726530612244898,
"grad_norm": 0.10873875060914909,
"learning_rate": 7.54530386052877e-06,
"loss": 1.6832,
"step": 534
},
{
"epoch": 0.7278911564625851,
"grad_norm": 0.13142427274605614,
"learning_rate": 7.5356193193488655e-06,
"loss": 1.6824,
"step": 535
},
{
"epoch": 0.7292517006802721,
"grad_norm": 0.10578967924155344,
"learning_rate": 7.525921955134714e-06,
"loss": 1.6128,
"step": 536
},
{
"epoch": 0.7306122448979592,
"grad_norm": 0.1428684712147148,
"learning_rate": 7.5162118169274424e-06,
"loss": 1.6909,
"step": 537
},
{
"epoch": 0.7319727891156462,
"grad_norm": 0.12364317671003774,
"learning_rate": 7.506488953832779e-06,
"loss": 1.5894,
"step": 538
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.12070521447402052,
"learning_rate": 7.4967534150208066e-06,
"loss": 1.6316,
"step": 539
},
{
"epoch": 0.7346938775510204,
"grad_norm": 0.11805699210709567,
"learning_rate": 7.487005249725705e-06,
"loss": 1.773,
"step": 540
},
{
"epoch": 0.7360544217687075,
"grad_norm": 0.10518788665559951,
"learning_rate": 7.477244507245517e-06,
"loss": 1.5496,
"step": 541
},
{
"epoch": 0.7374149659863946,
"grad_norm": 0.10115679657141723,
"learning_rate": 7.4674712369418815e-06,
"loss": 1.6332,
"step": 542
},
{
"epoch": 0.7387755102040816,
"grad_norm": 0.13249620123305414,
"learning_rate": 7.457685488239799e-06,
"loss": 1.5464,
"step": 543
},
{
"epoch": 0.7401360544217687,
"grad_norm": 0.10912963076663591,
"learning_rate": 7.44788731062737e-06,
"loss": 1.6537,
"step": 544
},
{
"epoch": 0.7414965986394558,
"grad_norm": 0.0903784233977396,
"learning_rate": 7.438076753655557e-06,
"loss": 1.7509,
"step": 545
},
{
"epoch": 0.7428571428571429,
"grad_norm": 0.11534137833637581,
"learning_rate": 7.4282538669379186e-06,
"loss": 1.8423,
"step": 546
},
{
"epoch": 0.7442176870748299,
"grad_norm": 0.10673090155703877,
"learning_rate": 7.418418700150373e-06,
"loss": 1.5147,
"step": 547
},
{
"epoch": 0.745578231292517,
"grad_norm": 0.11078677610408977,
"learning_rate": 7.408571303030939e-06,
"loss": 1.598,
"step": 548
},
{
"epoch": 0.746938775510204,
"grad_norm": 0.20089268434188828,
"learning_rate": 7.398711725379486e-06,
"loss": 1.6854,
"step": 549
},
{
"epoch": 0.7482993197278912,
"grad_norm": 0.11465528276050316,
"learning_rate": 7.388840017057479e-06,
"loss": 1.7166,
"step": 550
},
{
"epoch": 0.7496598639455783,
"grad_norm": 0.17554597081264026,
"learning_rate": 7.378956227987738e-06,
"loss": 1.7621,
"step": 551
},
{
"epoch": 0.7510204081632653,
"grad_norm": 0.12001150745168689,
"learning_rate": 7.369060408154166e-06,
"loss": 1.6292,
"step": 552
},
{
"epoch": 0.7510204081632653,
"eval_loss": 1.695604681968689,
"eval_runtime": 76.6065,
"eval_samples_per_second": 53.168,
"eval_steps_per_second": 6.657,
"step": 552
},
{
"epoch": 0.7523809523809524,
"grad_norm": 0.08810105735536967,
"learning_rate": 7.35915260760152e-06,
"loss": 1.7169,
"step": 553
},
{
"epoch": 0.7537414965986394,
"grad_norm": 0.1670593915051014,
"learning_rate": 7.349232876435135e-06,
"loss": 1.5579,
"step": 554
},
{
"epoch": 0.7551020408163265,
"grad_norm": 0.11454927724613997,
"learning_rate": 7.3393012648206865e-06,
"loss": 1.7283,
"step": 555
},
{
"epoch": 0.7564625850340136,
"grad_norm": 0.969049897753043,
"learning_rate": 7.329357822983929e-06,
"loss": 1.7205,
"step": 556
},
{
"epoch": 0.7578231292517007,
"grad_norm": 0.14156929561935405,
"learning_rate": 7.319402601210448e-06,
"loss": 1.6642,
"step": 557
},
{
"epoch": 0.7591836734693878,
"grad_norm": 0.0984076585901956,
"learning_rate": 7.3094356498453955e-06,
"loss": 1.5543,
"step": 558
},
{
"epoch": 0.7605442176870748,
"grad_norm": 0.1065633653807634,
"learning_rate": 7.299457019293248e-06,
"loss": 1.6024,
"step": 559
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.10385154170413329,
"learning_rate": 7.289466760017543e-06,
"loss": 1.6121,
"step": 560
},
{
"epoch": 0.763265306122449,
"grad_norm": 0.09706098665043214,
"learning_rate": 7.279464922540626e-06,
"loss": 1.6291,
"step": 561
},
{
"epoch": 0.7646258503401361,
"grad_norm": 0.1005953850437904,
"learning_rate": 7.269451557443396e-06,
"loss": 1.5871,
"step": 562
},
{
"epoch": 0.7659863945578231,
"grad_norm": 0.12409919864299088,
"learning_rate": 7.2594267153650525e-06,
"loss": 1.8507,
"step": 563
},
{
"epoch": 0.7673469387755102,
"grad_norm": 0.17577412545797552,
"learning_rate": 7.249390447002827e-06,
"loss": 1.6741,
"step": 564
},
{
"epoch": 0.7687074829931972,
"grad_norm": 0.10625351066460387,
"learning_rate": 7.239342803111744e-06,
"loss": 1.6995,
"step": 565
},
{
"epoch": 0.7700680272108843,
"grad_norm": 0.10141675080832888,
"learning_rate": 7.229283834504351e-06,
"loss": 1.7018,
"step": 566
},
{
"epoch": 0.7714285714285715,
"grad_norm": 0.10351854356175742,
"learning_rate": 7.21921359205047e-06,
"loss": 1.6347,
"step": 567
},
{
"epoch": 0.7727891156462585,
"grad_norm": 0.097783522808633,
"learning_rate": 7.209132126676934e-06,
"loss": 1.63,
"step": 568
},
{
"epoch": 0.7741496598639456,
"grad_norm": 0.1032937811881391,
"learning_rate": 7.199039489367334e-06,
"loss": 1.7088,
"step": 569
},
{
"epoch": 0.7755102040816326,
"grad_norm": 0.12703981258728564,
"learning_rate": 7.188935731161756e-06,
"loss": 1.5488,
"step": 570
},
{
"epoch": 0.7768707482993197,
"grad_norm": 0.09603678350259663,
"learning_rate": 7.178820903156532e-06,
"loss": 1.7006,
"step": 571
},
{
"epoch": 0.7782312925170068,
"grad_norm": 0.10137848978882107,
"learning_rate": 7.168695056503967e-06,
"loss": 1.5343,
"step": 572
},
{
"epoch": 0.7795918367346939,
"grad_norm": 0.09168277145084121,
"learning_rate": 7.1585582424121005e-06,
"loss": 1.7654,
"step": 573
},
{
"epoch": 0.780952380952381,
"grad_norm": 0.10323000544095898,
"learning_rate": 7.148410512144425e-06,
"loss": 1.6613,
"step": 574
},
{
"epoch": 0.782312925170068,
"grad_norm": 0.09038228211904249,
"learning_rate": 7.138251917019645e-06,
"loss": 1.7182,
"step": 575
},
{
"epoch": 0.7836734693877551,
"grad_norm": 0.12949706977298758,
"learning_rate": 7.1280825084114065e-06,
"loss": 1.5075,
"step": 576
},
{
"epoch": 0.7850340136054422,
"grad_norm": 0.10182930523451113,
"learning_rate": 7.117902337748045e-06,
"loss": 1.5249,
"step": 577
},
{
"epoch": 0.7863945578231293,
"grad_norm": 0.1014784602992118,
"learning_rate": 7.107711456512316e-06,
"loss": 1.5699,
"step": 578
},
{
"epoch": 0.7877551020408163,
"grad_norm": 0.08920528739055578,
"learning_rate": 7.097509916241145e-06,
"loss": 1.7604,
"step": 579
},
{
"epoch": 0.7891156462585034,
"grad_norm": 0.14771070265391437,
"learning_rate": 7.08729776852536e-06,
"loss": 1.8294,
"step": 580
},
{
"epoch": 0.7904761904761904,
"grad_norm": 0.10437668016493229,
"learning_rate": 7.0770750650094335e-06,
"loss": 1.5263,
"step": 581
},
{
"epoch": 0.7918367346938775,
"grad_norm": 0.09453693869246346,
"learning_rate": 7.066841857391215e-06,
"loss": 1.7625,
"step": 582
},
{
"epoch": 0.7931972789115647,
"grad_norm": 0.1024749155019567,
"learning_rate": 7.056598197421686e-06,
"loss": 1.6953,
"step": 583
},
{
"epoch": 0.7945578231292517,
"grad_norm": 0.09967417247994723,
"learning_rate": 7.046344136904675e-06,
"loss": 1.5067,
"step": 584
},
{
"epoch": 0.7959183673469388,
"grad_norm": 0.10667510272444436,
"learning_rate": 7.036079727696618e-06,
"loss": 1.6966,
"step": 585
},
{
"epoch": 0.7972789115646258,
"grad_norm": 0.13727868268941767,
"learning_rate": 7.025805021706276e-06,
"loss": 1.6554,
"step": 586
},
{
"epoch": 0.7986394557823129,
"grad_norm": 0.09241703194497365,
"learning_rate": 7.0155200708944915e-06,
"loss": 1.7987,
"step": 587
},
{
"epoch": 0.8,
"grad_norm": 0.11984006315877656,
"learning_rate": 7.005224927273913e-06,
"loss": 1.7059,
"step": 588
},
{
"epoch": 0.8013605442176871,
"grad_norm": 0.12763094516083248,
"learning_rate": 6.9949196429087355e-06,
"loss": 1.8147,
"step": 589
},
{
"epoch": 0.8027210884353742,
"grad_norm": 0.13522443685391508,
"learning_rate": 6.984604269914437e-06,
"loss": 1.63,
"step": 590
},
{
"epoch": 0.8040816326530612,
"grad_norm": 0.10116021604335325,
"learning_rate": 6.974278860457515e-06,
"loss": 1.5963,
"step": 591
},
{
"epoch": 0.8054421768707483,
"grad_norm": 0.10234671686214797,
"learning_rate": 6.963943466755225e-06,
"loss": 1.7491,
"step": 592
},
{
"epoch": 0.8068027210884354,
"grad_norm": 0.09993833464748478,
"learning_rate": 6.953598141075315e-06,
"loss": 1.8742,
"step": 593
},
{
"epoch": 0.8081632653061225,
"grad_norm": 0.10194114128873782,
"learning_rate": 6.943242935735757e-06,
"loss": 1.8295,
"step": 594
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.11091877191061177,
"learning_rate": 6.932877903104487e-06,
"loss": 1.7282,
"step": 595
},
{
"epoch": 0.8108843537414966,
"grad_norm": 0.11885618706334389,
"learning_rate": 6.922503095599142e-06,
"loss": 1.7013,
"step": 596
},
{
"epoch": 0.8122448979591836,
"grad_norm": 0.128832076499377,
"learning_rate": 6.912118565686789e-06,
"loss": 1.6604,
"step": 597
},
{
"epoch": 0.8136054421768707,
"grad_norm": 0.149361446054299,
"learning_rate": 6.901724365883665e-06,
"loss": 1.5922,
"step": 598
},
{
"epoch": 0.8149659863945579,
"grad_norm": 0.10349101338251292,
"learning_rate": 6.89132054875491e-06,
"loss": 1.7742,
"step": 599
},
{
"epoch": 0.8163265306122449,
"grad_norm": 0.10814162257394357,
"learning_rate": 6.8809071669142946e-06,
"loss": 1.6099,
"step": 600
},
{
"epoch": 0.817687074829932,
"grad_norm": 0.09463224320256956,
"learning_rate": 6.870484273023967e-06,
"loss": 1.5986,
"step": 601
},
{
"epoch": 0.819047619047619,
"grad_norm": 0.10081423766781461,
"learning_rate": 6.8600519197941725e-06,
"loss": 1.7488,
"step": 602
},
{
"epoch": 0.8204081632653061,
"grad_norm": 0.10845349706729722,
"learning_rate": 6.849610159983003e-06,
"loss": 1.6419,
"step": 603
},
{
"epoch": 0.8217687074829932,
"grad_norm": 0.1220236426902432,
"learning_rate": 6.839159046396109e-06,
"loss": 1.6193,
"step": 604
},
{
"epoch": 0.8231292517006803,
"grad_norm": 0.11987974076038253,
"learning_rate": 6.828698631886455e-06,
"loss": 1.6836,
"step": 605
},
{
"epoch": 0.8244897959183674,
"grad_norm": 0.12488732229202766,
"learning_rate": 6.8182289693540375e-06,
"loss": 1.6057,
"step": 606
},
{
"epoch": 0.8258503401360544,
"grad_norm": 0.10061110967809365,
"learning_rate": 6.807750111745619e-06,
"loss": 1.6481,
"step": 607
},
{
"epoch": 0.8272108843537415,
"grad_norm": 0.10585066880846151,
"learning_rate": 6.797262112054469e-06,
"loss": 1.6665,
"step": 608
},
{
"epoch": 0.8285714285714286,
"grad_norm": 0.10134141638102989,
"learning_rate": 6.786765023320085e-06,
"loss": 1.5092,
"step": 609
},
{
"epoch": 0.8299319727891157,
"grad_norm": 0.11227140982751639,
"learning_rate": 6.776258898627932e-06,
"loss": 1.7522,
"step": 610
},
{
"epoch": 0.8312925170068027,
"grad_norm": 0.08967360374579285,
"learning_rate": 6.765743791109172e-06,
"loss": 1.7738,
"step": 611
},
{
"epoch": 0.8326530612244898,
"grad_norm": 0.12577528209737293,
"learning_rate": 6.755219753940389e-06,
"loss": 1.6958,
"step": 612
},
{
"epoch": 0.8340136054421768,
"grad_norm": 0.1026220373525454,
"learning_rate": 6.744686840343333e-06,
"loss": 1.7081,
"step": 613
},
{
"epoch": 0.8353741496598639,
"grad_norm": 0.1616576311932069,
"learning_rate": 6.734145103584638e-06,
"loss": 1.7878,
"step": 614
},
{
"epoch": 0.8367346938775511,
"grad_norm": 0.11606031099035814,
"learning_rate": 6.72359459697556e-06,
"loss": 1.704,
"step": 615
},
{
"epoch": 0.8380952380952381,
"grad_norm": 0.11405134687997359,
"learning_rate": 6.713035373871711e-06,
"loss": 1.6157,
"step": 616
},
{
"epoch": 0.8394557823129252,
"grad_norm": 0.11855263544564604,
"learning_rate": 6.702467487672771e-06,
"loss": 1.7325,
"step": 617
},
{
"epoch": 0.8408163265306122,
"grad_norm": 0.10438275658978799,
"learning_rate": 6.691890991822243e-06,
"loss": 1.6522,
"step": 618
},
{
"epoch": 0.8421768707482993,
"grad_norm": 0.10374577213787556,
"learning_rate": 6.681305939807165e-06,
"loss": 1.6307,
"step": 619
},
{
"epoch": 0.8435374149659864,
"grad_norm": 0.09336351222681238,
"learning_rate": 6.670712385157846e-06,
"loss": 1.5821,
"step": 620
},
{
"epoch": 0.8448979591836735,
"grad_norm": 0.12997638800995778,
"learning_rate": 6.660110381447593e-06,
"loss": 1.672,
"step": 621
},
{
"epoch": 0.8462585034013606,
"grad_norm": 0.09931623670142671,
"learning_rate": 6.649499982292441e-06,
"loss": 1.6305,
"step": 622
},
{
"epoch": 0.8476190476190476,
"grad_norm": 0.1164641533977229,
"learning_rate": 6.638881241350884e-06,
"loss": 1.6891,
"step": 623
},
{
"epoch": 0.8489795918367347,
"grad_norm": 0.09080847058543648,
"learning_rate": 6.628254212323601e-06,
"loss": 1.5685,
"step": 624
},
{
"epoch": 0.8503401360544217,
"grad_norm": 0.09554345130938086,
"learning_rate": 6.617618948953186e-06,
"loss": 1.8238,
"step": 625
},
{
"epoch": 0.8517006802721089,
"grad_norm": 0.11850979230452785,
"learning_rate": 6.606975505023874e-06,
"loss": 1.5686,
"step": 626
},
{
"epoch": 0.8530612244897959,
"grad_norm": 0.11391698206139707,
"learning_rate": 6.596323934361268e-06,
"loss": 1.6122,
"step": 627
},
{
"epoch": 0.854421768707483,
"grad_norm": 0.1259510837772197,
"learning_rate": 6.5856642908320745e-06,
"loss": 1.5638,
"step": 628
},
{
"epoch": 0.85578231292517,
"grad_norm": 0.3467765021073813,
"learning_rate": 6.574996628343824e-06,
"loss": 1.7503,
"step": 629
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.11342921408966115,
"learning_rate": 6.564321000844598e-06,
"loss": 1.6653,
"step": 630
},
{
"epoch": 0.8585034013605443,
"grad_norm": 0.10000357784567925,
"learning_rate": 6.553637462322759e-06,
"loss": 1.5783,
"step": 631
},
{
"epoch": 0.8598639455782313,
"grad_norm": 0.1067459993698176,
"learning_rate": 6.5429460668066825e-06,
"loss": 1.7222,
"step": 632
},
{
"epoch": 0.8612244897959184,
"grad_norm": 0.1669410376966875,
"learning_rate": 6.5322468683644665e-06,
"loss": 1.6325,
"step": 633
},
{
"epoch": 0.8625850340136054,
"grad_norm": 0.107409115579869,
"learning_rate": 6.5215399211036815e-06,
"loss": 1.5369,
"step": 634
},
{
"epoch": 0.8639455782312925,
"grad_norm": 0.1076970404147157,
"learning_rate": 6.510825279171077e-06,
"loss": 1.7722,
"step": 635
},
{
"epoch": 0.8653061224489796,
"grad_norm": 0.15949100738404212,
"learning_rate": 6.5001029967523195e-06,
"loss": 1.5295,
"step": 636
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.13488187893377007,
"learning_rate": 6.489373128071714e-06,
"loss": 1.6053,
"step": 637
},
{
"epoch": 0.8680272108843538,
"grad_norm": 0.1117979139478637,
"learning_rate": 6.4786357273919296e-06,
"loss": 1.6219,
"step": 638
},
{
"epoch": 0.8693877551020408,
"grad_norm": 0.11161396243370707,
"learning_rate": 6.467890849013728e-06,
"loss": 1.6193,
"step": 639
},
{
"epoch": 0.8707482993197279,
"grad_norm": 0.10619453607937132,
"learning_rate": 6.4571385472756835e-06,
"loss": 1.6587,
"step": 640
},
{
"epoch": 0.8721088435374149,
"grad_norm": 0.1092897734952719,
"learning_rate": 6.446378876553914e-06,
"loss": 1.5463,
"step": 641
},
{
"epoch": 0.8734693877551021,
"grad_norm": 0.10030800864141241,
"learning_rate": 6.4356118912618025e-06,
"loss": 1.6678,
"step": 642
},
{
"epoch": 0.8748299319727891,
"grad_norm": 0.10968908293389731,
"learning_rate": 6.424837645849724e-06,
"loss": 1.6558,
"step": 643
},
{
"epoch": 0.8761904761904762,
"grad_norm": 0.10586492564834864,
"learning_rate": 6.41405619480477e-06,
"loss": 1.6116,
"step": 644
},
{
"epoch": 0.8775510204081632,
"grad_norm": 0.10414471406609932,
"learning_rate": 6.403267592650466e-06,
"loss": 1.5987,
"step": 645
},
{
"epoch": 0.8789115646258503,
"grad_norm": 0.11102655800428667,
"learning_rate": 6.39247189394651e-06,
"loss": 1.5676,
"step": 646
},
{
"epoch": 0.8802721088435375,
"grad_norm": 0.1414944802203078,
"learning_rate": 6.381669153288485e-06,
"loss": 1.5632,
"step": 647
},
{
"epoch": 0.8816326530612245,
"grad_norm": 0.11327036672893112,
"learning_rate": 6.370859425307583e-06,
"loss": 1.6175,
"step": 648
},
{
"epoch": 0.8829931972789116,
"grad_norm": 0.10646525745122841,
"learning_rate": 6.360042764670337e-06,
"loss": 1.6644,
"step": 649
},
{
"epoch": 0.8843537414965986,
"grad_norm": 0.13888676328597274,
"learning_rate": 6.349219226078338e-06,
"loss": 1.707,
"step": 650
},
{
"epoch": 0.8857142857142857,
"grad_norm": 0.12373888508470542,
"learning_rate": 6.3383888642679585e-06,
"loss": 1.562,
"step": 651
},
{
"epoch": 0.8870748299319728,
"grad_norm": 0.1150370310262787,
"learning_rate": 6.327551734010079e-06,
"loss": 1.5981,
"step": 652
},
{
"epoch": 0.8884353741496599,
"grad_norm": 0.1333929036077325,
"learning_rate": 6.3167078901098064e-06,
"loss": 1.6216,
"step": 653
},
{
"epoch": 0.889795918367347,
"grad_norm": 0.09937346566987916,
"learning_rate": 6.305857387406204e-06,
"loss": 1.7385,
"step": 654
},
{
"epoch": 0.891156462585034,
"grad_norm": 0.10542763073830541,
"learning_rate": 6.295000280772004e-06,
"loss": 1.5687,
"step": 655
},
{
"epoch": 0.8925170068027211,
"grad_norm": 0.11666549211549145,
"learning_rate": 6.2841366251133405e-06,
"loss": 1.674,
"step": 656
},
{
"epoch": 0.8938775510204081,
"grad_norm": 0.1010611811906902,
"learning_rate": 6.273266475369466e-06,
"loss": 1.8506,
"step": 657
},
{
"epoch": 0.8952380952380953,
"grad_norm": 0.12229341089698531,
"learning_rate": 6.262389886512475e-06,
"loss": 1.6744,
"step": 658
},
{
"epoch": 0.8965986394557823,
"grad_norm": 0.1127681300190243,
"learning_rate": 6.251506913547021e-06,
"loss": 1.5399,
"step": 659
},
{
"epoch": 0.8979591836734694,
"grad_norm": 0.12397997048973994,
"learning_rate": 6.240617611510049e-06,
"loss": 1.5651,
"step": 660
},
{
"epoch": 0.8993197278911564,
"grad_norm": 0.11295244013894742,
"learning_rate": 6.229722035470509e-06,
"loss": 1.6198,
"step": 661
},
{
"epoch": 0.9006802721088435,
"grad_norm": 0.1075660765121038,
"learning_rate": 6.21882024052908e-06,
"loss": 1.6066,
"step": 662
},
{
"epoch": 0.9020408163265307,
"grad_norm": 0.1202834632226026,
"learning_rate": 6.2079122818178885e-06,
"loss": 1.7857,
"step": 663
},
{
"epoch": 0.9034013605442177,
"grad_norm": 0.10694066024862514,
"learning_rate": 6.196998214500236e-06,
"loss": 1.7661,
"step": 664
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.10432718122763324,
"learning_rate": 6.186078093770312e-06,
"loss": 1.5971,
"step": 665
},
{
"epoch": 0.9061224489795918,
"grad_norm": 0.11861992306540457,
"learning_rate": 6.1751519748529235e-06,
"loss": 1.5868,
"step": 666
},
{
"epoch": 0.9074829931972789,
"grad_norm": 0.10675270335750815,
"learning_rate": 6.164219913003208e-06,
"loss": 1.7003,
"step": 667
},
{
"epoch": 0.908843537414966,
"grad_norm": 0.11325117184223481,
"learning_rate": 6.153281963506359e-06,
"loss": 1.5944,
"step": 668
},
{
"epoch": 0.9102040816326531,
"grad_norm": 0.10366546297175218,
"learning_rate": 6.142338181677344e-06,
"loss": 1.8128,
"step": 669
},
{
"epoch": 0.9115646258503401,
"grad_norm": 0.10444277193505327,
"learning_rate": 6.131388622860627e-06,
"loss": 1.8767,
"step": 670
},
{
"epoch": 0.9129251700680272,
"grad_norm": 0.10972076945530858,
"learning_rate": 6.1204333424298835e-06,
"loss": 1.7049,
"step": 671
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.10751143314986424,
"learning_rate": 6.10947239578773e-06,
"loss": 1.7047,
"step": 672
},
{
"epoch": 0.9156462585034013,
"grad_norm": 0.10664892548031508,
"learning_rate": 6.098505838365431e-06,
"loss": 1.7452,
"step": 673
},
{
"epoch": 0.9170068027210885,
"grad_norm": 0.10573801367962304,
"learning_rate": 6.087533725622631e-06,
"loss": 1.6404,
"step": 674
},
{
"epoch": 0.9183673469387755,
"grad_norm": 0.11119140472745474,
"learning_rate": 6.076556113047066e-06,
"loss": 1.7246,
"step": 675
},
{
"epoch": 0.9197278911564626,
"grad_norm": 0.09707326009557425,
"learning_rate": 6.065573056154289e-06,
"loss": 1.6736,
"step": 676
},
{
"epoch": 0.9210884353741496,
"grad_norm": 0.14286264459704873,
"learning_rate": 6.05458461048738e-06,
"loss": 1.7604,
"step": 677
},
{
"epoch": 0.9224489795918367,
"grad_norm": 0.10186789751886853,
"learning_rate": 6.043590831616677e-06,
"loss": 1.52,
"step": 678
},
{
"epoch": 0.9238095238095239,
"grad_norm": 0.08610978660077238,
"learning_rate": 6.032591775139483e-06,
"loss": 1.6948,
"step": 679
},
{
"epoch": 0.9251700680272109,
"grad_norm": 0.14039977263434622,
"learning_rate": 6.0215874966797935e-06,
"loss": 1.7652,
"step": 680
},
{
"epoch": 0.926530612244898,
"grad_norm": 0.09777556431038441,
"learning_rate": 6.0105780518880156e-06,
"loss": 1.5695,
"step": 681
},
{
"epoch": 0.927891156462585,
"grad_norm": 0.10076506193217513,
"learning_rate": 5.999563496440678e-06,
"loss": 1.5797,
"step": 682
},
{
"epoch": 0.9292517006802721,
"grad_norm": 0.13645736281296353,
"learning_rate": 5.988543886040157e-06,
"loss": 1.6124,
"step": 683
},
{
"epoch": 0.9306122448979591,
"grad_norm": 0.18938296093195647,
"learning_rate": 5.977519276414393e-06,
"loss": 1.7377,
"step": 684
},
{
"epoch": 0.9319727891156463,
"grad_norm": 0.11671931887431021,
"learning_rate": 5.966489723316609e-06,
"loss": 1.57,
"step": 685
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.13192750753599222,
"learning_rate": 5.955455282525027e-06,
"loss": 1.5089,
"step": 686
},
{
"epoch": 0.9346938775510204,
"grad_norm": 0.1279138939941876,
"learning_rate": 5.944416009842585e-06,
"loss": 1.4862,
"step": 687
},
{
"epoch": 0.9360544217687075,
"grad_norm": 0.1368328421716675,
"learning_rate": 5.933371961096661e-06,
"loss": 1.7591,
"step": 688
},
{
"epoch": 0.9374149659863945,
"grad_norm": 0.09459226556926907,
"learning_rate": 5.92232319213878e-06,
"loss": 1.7315,
"step": 689
},
{
"epoch": 0.9387755102040817,
"grad_norm": 0.0944762256396259,
"learning_rate": 5.9112697588443456e-06,
"loss": 1.6664,
"step": 690
},
{
"epoch": 0.9401360544217687,
"grad_norm": 0.11165381365887841,
"learning_rate": 5.900211717112343e-06,
"loss": 1.512,
"step": 691
},
{
"epoch": 0.9414965986394558,
"grad_norm": 0.10026925859750246,
"learning_rate": 5.889149122865067e-06,
"loss": 1.8164,
"step": 692
},
{
"epoch": 0.9428571428571428,
"grad_norm": 0.14168969817408006,
"learning_rate": 5.8780820320478325e-06,
"loss": 1.7176,
"step": 693
},
{
"epoch": 0.9442176870748299,
"grad_norm": 0.11535232171729691,
"learning_rate": 5.867010500628698e-06,
"loss": 1.6684,
"step": 694
},
{
"epoch": 0.9455782312925171,
"grad_norm": 0.10755021118232244,
"learning_rate": 5.855934584598175e-06,
"loss": 1.7584,
"step": 695
},
{
"epoch": 0.9469387755102041,
"grad_norm": 0.11779529743978591,
"learning_rate": 5.844854339968952e-06,
"loss": 1.6906,
"step": 696
},
{
"epoch": 0.9482993197278912,
"grad_norm": 0.11394418134579978,
"learning_rate": 5.8337698227756035e-06,
"loss": 1.6403,
"step": 697
},
{
"epoch": 0.9496598639455782,
"grad_norm": 0.1136124987259348,
"learning_rate": 5.822681089074315e-06,
"loss": 1.5563,
"step": 698
},
{
"epoch": 0.9510204081632653,
"grad_norm": 0.10296990279179531,
"learning_rate": 5.811588194942593e-06,
"loss": 1.7407,
"step": 699
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.11976200431163776,
"learning_rate": 5.800491196478989e-06,
"loss": 1.4828,
"step": 700
},
{
"epoch": 0.9537414965986395,
"grad_norm": 0.11272704179233839,
"learning_rate": 5.789390149802802e-06,
"loss": 1.602,
"step": 701
},
{
"epoch": 0.9551020408163265,
"grad_norm": 0.23740154863087562,
"learning_rate": 5.778285111053812e-06,
"loss": 1.6265,
"step": 702
},
{
"epoch": 0.9564625850340136,
"grad_norm": 0.11973657664057448,
"learning_rate": 5.767176136391982e-06,
"loss": 1.5886,
"step": 703
},
{
"epoch": 0.9578231292517007,
"grad_norm": 0.11604087204409168,
"learning_rate": 5.756063281997183e-06,
"loss": 1.6891,
"step": 704
},
{
"epoch": 0.9591836734693877,
"grad_norm": 0.10628214364332488,
"learning_rate": 5.744946604068904e-06,
"loss": 1.6309,
"step": 705
},
{
"epoch": 0.9605442176870749,
"grad_norm": 0.09908640691690514,
"learning_rate": 5.733826158825973e-06,
"loss": 1.6741,
"step": 706
},
{
"epoch": 0.9619047619047619,
"grad_norm": 0.10347727809388245,
"learning_rate": 5.722702002506264e-06,
"loss": 1.6104,
"step": 707
},
{
"epoch": 0.963265306122449,
"grad_norm": 0.10424856238627535,
"learning_rate": 5.711574191366427e-06,
"loss": 1.6592,
"step": 708
},
{
"epoch": 0.964625850340136,
"grad_norm": 0.09391458584535348,
"learning_rate": 5.700442781681588e-06,
"loss": 1.7451,
"step": 709
},
{
"epoch": 0.9659863945578231,
"grad_norm": 0.1114698322409562,
"learning_rate": 5.689307829745074e-06,
"loss": 1.5695,
"step": 710
},
{
"epoch": 0.9673469387755103,
"grad_norm": 0.1263756308873154,
"learning_rate": 5.678169391868128e-06,
"loss": 1.7918,
"step": 711
},
{
"epoch": 0.9687074829931973,
"grad_norm": 0.1068286657604504,
"learning_rate": 5.6670275243796194e-06,
"loss": 1.6695,
"step": 712
},
{
"epoch": 0.9700680272108844,
"grad_norm": 0.08766062542171628,
"learning_rate": 5.65588228362576e-06,
"loss": 1.8529,
"step": 713
},
{
"epoch": 0.9714285714285714,
"grad_norm": 0.10061266375776706,
"learning_rate": 5.6447337259698245e-06,
"loss": 1.8285,
"step": 714
},
{
"epoch": 0.9727891156462585,
"grad_norm": 0.12075281384282141,
"learning_rate": 5.633581907791858e-06,
"loss": 1.7784,
"step": 715
},
{
"epoch": 0.9741496598639455,
"grad_norm": 0.1332470465992645,
"learning_rate": 5.6224268854884e-06,
"loss": 1.675,
"step": 716
},
{
"epoch": 0.9755102040816327,
"grad_norm": 0.1283354676957953,
"learning_rate": 5.611268715472187e-06,
"loss": 1.4725,
"step": 717
},
{
"epoch": 0.9768707482993197,
"grad_norm": 0.16904870188773163,
"learning_rate": 5.600107454171879e-06,
"loss": 1.6237,
"step": 718
},
{
"epoch": 0.9782312925170068,
"grad_norm": 0.1259321663471572,
"learning_rate": 5.5889431580317655e-06,
"loss": 1.663,
"step": 719
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.10967294130093679,
"learning_rate": 5.577775883511489e-06,
"loss": 1.6294,
"step": 720
},
{
"epoch": 0.9809523809523809,
"grad_norm": 0.17546450235130967,
"learning_rate": 5.566605687085749e-06,
"loss": 1.5841,
"step": 721
},
{
"epoch": 0.9823129251700681,
"grad_norm": 0.12265102828756684,
"learning_rate": 5.555432625244024e-06,
"loss": 1.4919,
"step": 722
},
{
"epoch": 0.9836734693877551,
"grad_norm": 0.10886604673890071,
"learning_rate": 5.5442567544902805e-06,
"loss": 1.6385,
"step": 723
},
{
"epoch": 0.9850340136054422,
"grad_norm": 0.11421831431853076,
"learning_rate": 5.533078131342695e-06,
"loss": 1.6341,
"step": 724
},
{
"epoch": 0.9863945578231292,
"grad_norm": 0.09054819655341287,
"learning_rate": 5.5218968123333594e-06,
"loss": 1.624,
"step": 725
},
{
"epoch": 0.9877551020408163,
"grad_norm": 0.10971827184516575,
"learning_rate": 5.510712854008001e-06,
"loss": 1.5447,
"step": 726
},
{
"epoch": 0.9891156462585035,
"grad_norm": 0.12112012725777507,
"learning_rate": 5.499526312925693e-06,
"loss": 1.7353,
"step": 727
},
{
"epoch": 0.9904761904761905,
"grad_norm": 0.10541432689931088,
"learning_rate": 5.488337245658569e-06,
"loss": 1.6583,
"step": 728
},
{
"epoch": 0.9918367346938776,
"grad_norm": 0.14736400599380917,
"learning_rate": 5.477145708791543e-06,
"loss": 1.6641,
"step": 729
},
{
"epoch": 0.9931972789115646,
"grad_norm": 0.10683105881710352,
"learning_rate": 5.4659517589220135e-06,
"loss": 1.4082,
"step": 730
},
{
"epoch": 0.9945578231292517,
"grad_norm": 0.1212803888010618,
"learning_rate": 5.454755452659583e-06,
"loss": 1.7298,
"step": 731
},
{
"epoch": 0.9959183673469387,
"grad_norm": 0.114898199928806,
"learning_rate": 5.443556846625773e-06,
"loss": 1.6922,
"step": 732
},
{
"epoch": 0.9972789115646259,
"grad_norm": 0.17977169651896993,
"learning_rate": 5.432355997453729e-06,
"loss": 1.6933,
"step": 733
},
{
"epoch": 0.998639455782313,
"grad_norm": 0.13478171843239625,
"learning_rate": 5.42115296178795e-06,
"loss": 1.758,
"step": 734
},
{
"epoch": 1.0,
"grad_norm": 0.10983192009638694,
"learning_rate": 5.409947796283982e-06,
"loss": 1.6745,
"step": 735
},
{
"epoch": 1.0013605442176872,
"grad_norm": 0.15728756550496747,
"learning_rate": 5.398740557608151e-06,
"loss": 1.5976,
"step": 736
},
{
"epoch": 1.0013605442176872,
"eval_loss": 1.6908553838729858,
"eval_runtime": 76.8223,
"eval_samples_per_second": 53.018,
"eval_steps_per_second": 6.639,
"step": 736
},
{
"epoch": 1.002721088435374,
"grad_norm": 0.10538368349304568,
"learning_rate": 5.38753130243726e-06,
"loss": 1.6615,
"step": 737
},
{
"epoch": 1.0040816326530613,
"grad_norm": 0.11186773529092461,
"learning_rate": 5.376320087458316e-06,
"loss": 1.686,
"step": 738
},
{
"epoch": 1.0054421768707482,
"grad_norm": 0.09819212487389627,
"learning_rate": 5.365106969368235e-06,
"loss": 1.6144,
"step": 739
},
{
"epoch": 1.0068027210884354,
"grad_norm": 0.10858676538086111,
"learning_rate": 5.353892004873554e-06,
"loss": 1.7423,
"step": 740
},
{
"epoch": 1.0081632653061225,
"grad_norm": 0.11968981856525805,
"learning_rate": 5.34267525069015e-06,
"loss": 1.6532,
"step": 741
},
{
"epoch": 1.0095238095238095,
"grad_norm": 0.09936611311145167,
"learning_rate": 5.331456763542954e-06,
"loss": 1.8078,
"step": 742
},
{
"epoch": 1.0108843537414967,
"grad_norm": 0.10022311903878614,
"learning_rate": 5.3202366001656535e-06,
"loss": 1.5739,
"step": 743
},
{
"epoch": 1.0122448979591836,
"grad_norm": 0.4103772585061572,
"learning_rate": 5.309014817300422e-06,
"loss": 1.6617,
"step": 744
},
{
"epoch": 1.0136054421768708,
"grad_norm": 0.10960071410597617,
"learning_rate": 5.297791471697614e-06,
"loss": 1.5742,
"step": 745
},
{
"epoch": 1.014965986394558,
"grad_norm": 0.1310811762519516,
"learning_rate": 5.286566620115493e-06,
"loss": 1.7022,
"step": 746
},
{
"epoch": 1.0163265306122449,
"grad_norm": 0.15568202046790616,
"learning_rate": 5.2753403193199374e-06,
"loss": 1.592,
"step": 747
},
{
"epoch": 1.017687074829932,
"grad_norm": 0.11383339823280172,
"learning_rate": 5.264112626084153e-06,
"loss": 1.6331,
"step": 748
},
{
"epoch": 1.019047619047619,
"grad_norm": 0.11927282820236593,
"learning_rate": 5.2528835971883876e-06,
"loss": 1.7091,
"step": 749
},
{
"epoch": 1.0204081632653061,
"grad_norm": 0.11606987091212463,
"learning_rate": 5.241653289419647e-06,
"loss": 1.8403,
"step": 750
},
{
"epoch": 1.021768707482993,
"grad_norm": 0.16512841788961088,
"learning_rate": 5.230421759571398e-06,
"loss": 1.785,
"step": 751
},
{
"epoch": 1.0231292517006803,
"grad_norm": 0.12135979152340173,
"learning_rate": 5.219189064443296e-06,
"loss": 1.5237,
"step": 752
},
{
"epoch": 1.0244897959183674,
"grad_norm": 0.12001388963231435,
"learning_rate": 5.207955260840879e-06,
"loss": 1.6265,
"step": 753
},
{
"epoch": 1.0258503401360544,
"grad_norm": 0.14241973320076035,
"learning_rate": 5.1967204055753e-06,
"loss": 1.6843,
"step": 754
},
{
"epoch": 1.0272108843537415,
"grad_norm": 0.3503178069734055,
"learning_rate": 5.185484555463026e-06,
"loss": 1.8022,
"step": 755
},
{
"epoch": 1.0285714285714285,
"grad_norm": 0.10584406401171531,
"learning_rate": 5.17424776732556e-06,
"loss": 1.6713,
"step": 756
},
{
"epoch": 1.0299319727891156,
"grad_norm": 0.11525477562466327,
"learning_rate": 5.163010097989138e-06,
"loss": 1.73,
"step": 757
},
{
"epoch": 1.0312925170068028,
"grad_norm": 0.13500765537475423,
"learning_rate": 5.151771604284465e-06,
"loss": 1.405,
"step": 758
},
{
"epoch": 1.0326530612244897,
"grad_norm": 0.18732245961092447,
"learning_rate": 5.140532343046406e-06,
"loss": 1.5587,
"step": 759
},
{
"epoch": 1.034013605442177,
"grad_norm": 0.10827851940146976,
"learning_rate": 5.129292371113712e-06,
"loss": 1.7328,
"step": 760
},
{
"epoch": 1.0353741496598639,
"grad_norm": 0.11027741963001209,
"learning_rate": 5.118051745328725e-06,
"loss": 1.6382,
"step": 761
},
{
"epoch": 1.036734693877551,
"grad_norm": 0.09919441836217119,
"learning_rate": 5.1068105225370975e-06,
"loss": 1.6855,
"step": 762
},
{
"epoch": 1.0380952380952382,
"grad_norm": 0.13830797432010328,
"learning_rate": 5.095568759587497e-06,
"loss": 1.7411,
"step": 763
},
{
"epoch": 1.0394557823129251,
"grad_norm": 0.0972282280148536,
"learning_rate": 5.084326513331328e-06,
"loss": 1.621,
"step": 764
},
{
"epoch": 1.0408163265306123,
"grad_norm": 0.10578341535327669,
"learning_rate": 5.0730838406224324e-06,
"loss": 1.6273,
"step": 765
},
{
"epoch": 1.0421768707482992,
"grad_norm": 0.10463652066026084,
"learning_rate": 5.061840798316815e-06,
"loss": 1.725,
"step": 766
},
{
"epoch": 1.0435374149659864,
"grad_norm": 0.11942338757679548,
"learning_rate": 5.0505974432723445e-06,
"loss": 1.5898,
"step": 767
},
{
"epoch": 1.0448979591836736,
"grad_norm": 0.13689962498964267,
"learning_rate": 5.039353832348477e-06,
"loss": 1.5068,
"step": 768
},
{
"epoch": 1.0462585034013605,
"grad_norm": 0.13678139029155267,
"learning_rate": 5.028110022405955e-06,
"loss": 1.7158,
"step": 769
},
{
"epoch": 1.0476190476190477,
"grad_norm": 0.1000363922457913,
"learning_rate": 5.0168660703065354e-06,
"loss": 1.741,
"step": 770
},
{
"epoch": 1.0489795918367346,
"grad_norm": 0.1188815368304704,
"learning_rate": 5.005622032912687e-06,
"loss": 1.6623,
"step": 771
},
{
"epoch": 1.0503401360544218,
"grad_norm": 0.12019243813821356,
"learning_rate": 4.994377967087316e-06,
"loss": 1.5774,
"step": 772
},
{
"epoch": 1.051700680272109,
"grad_norm": 0.12450888223025358,
"learning_rate": 4.983133929693467e-06,
"loss": 1.5663,
"step": 773
},
{
"epoch": 1.0530612244897959,
"grad_norm": 0.10816898093198718,
"learning_rate": 4.971889977594048e-06,
"loss": 1.6911,
"step": 774
},
{
"epoch": 1.054421768707483,
"grad_norm": 1.729544315460825,
"learning_rate": 4.960646167651524e-06,
"loss": 1.7524,
"step": 775
},
{
"epoch": 1.05578231292517,
"grad_norm": 0.1133964092151596,
"learning_rate": 4.949402556727655e-06,
"loss": 1.7612,
"step": 776
},
{
"epoch": 1.0571428571428572,
"grad_norm": 0.12041549603751739,
"learning_rate": 4.9381592016831856e-06,
"loss": 1.5116,
"step": 777
},
{
"epoch": 1.0585034013605443,
"grad_norm": 0.11576436512487191,
"learning_rate": 4.9269161593775675e-06,
"loss": 1.5329,
"step": 778
},
{
"epoch": 1.0598639455782313,
"grad_norm": 0.12679277289105279,
"learning_rate": 4.915673486668673e-06,
"loss": 1.5506,
"step": 779
},
{
"epoch": 1.0612244897959184,
"grad_norm": 0.11679477613827106,
"learning_rate": 4.904431240412503e-06,
"loss": 1.5008,
"step": 780
},
{
"epoch": 1.0625850340136054,
"grad_norm": 0.10065169200286987,
"learning_rate": 4.893189477462905e-06,
"loss": 1.7685,
"step": 781
},
{
"epoch": 1.0639455782312925,
"grad_norm": 0.10287181892423168,
"learning_rate": 4.881948254671277e-06,
"loss": 1.6379,
"step": 782
},
{
"epoch": 1.0653061224489795,
"grad_norm": 0.10896624508987876,
"learning_rate": 4.870707628886291e-06,
"loss": 1.5234,
"step": 783
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.10915491188092791,
"learning_rate": 4.859467656953596e-06,
"loss": 1.5865,
"step": 784
},
{
"epoch": 1.0680272108843538,
"grad_norm": 0.1275517896298314,
"learning_rate": 4.8482283957155355e-06,
"loss": 1.7069,
"step": 785
},
{
"epoch": 1.0693877551020408,
"grad_norm": 0.10862109015230759,
"learning_rate": 4.836989902010863e-06,
"loss": 1.7682,
"step": 786
},
{
"epoch": 1.070748299319728,
"grad_norm": 0.09805361474836685,
"learning_rate": 4.825752232674441e-06,
"loss": 1.6228,
"step": 787
},
{
"epoch": 1.0721088435374149,
"grad_norm": 0.12481176224206536,
"learning_rate": 4.814515444536975e-06,
"loss": 1.5027,
"step": 788
},
{
"epoch": 1.073469387755102,
"grad_norm": 0.11390040747547601,
"learning_rate": 4.8032795944247e-06,
"loss": 1.5168,
"step": 789
},
{
"epoch": 1.0748299319727892,
"grad_norm": 0.13458744184978433,
"learning_rate": 4.792044739159124e-06,
"loss": 1.5188,
"step": 790
},
{
"epoch": 1.0761904761904761,
"grad_norm": 0.12294480568667183,
"learning_rate": 4.780810935556707e-06,
"loss": 1.3946,
"step": 791
},
{
"epoch": 1.0775510204081633,
"grad_norm": 0.13732274998725952,
"learning_rate": 4.7695782404286045e-06,
"loss": 1.6201,
"step": 792
},
{
"epoch": 1.0789115646258503,
"grad_norm": 0.09919081899157949,
"learning_rate": 4.758346710580355e-06,
"loss": 1.7815,
"step": 793
},
{
"epoch": 1.0802721088435374,
"grad_norm": 0.10000541950613748,
"learning_rate": 4.747116402811612e-06,
"loss": 1.8098,
"step": 794
},
{
"epoch": 1.0816326530612246,
"grad_norm": 0.11420683474567415,
"learning_rate": 4.735887373915848e-06,
"loss": 1.4835,
"step": 795
},
{
"epoch": 1.0829931972789115,
"grad_norm": 0.15140465279516127,
"learning_rate": 4.724659680680063e-06,
"loss": 1.5029,
"step": 796
},
{
"epoch": 1.0843537414965987,
"grad_norm": 0.11208117898027617,
"learning_rate": 4.713433379884508e-06,
"loss": 1.7194,
"step": 797
},
{
"epoch": 1.0857142857142856,
"grad_norm": 0.16855436651508154,
"learning_rate": 4.7022085283023875e-06,
"loss": 1.7491,
"step": 798
},
{
"epoch": 1.0870748299319728,
"grad_norm": 0.11595366549169164,
"learning_rate": 4.690985182699581e-06,
"loss": 1.6313,
"step": 799
},
{
"epoch": 1.08843537414966,
"grad_norm": 0.09503431401550515,
"learning_rate": 4.679763399834347e-06,
"loss": 1.5984,
"step": 800
},
{
"epoch": 1.089795918367347,
"grad_norm": 0.10896290135710643,
"learning_rate": 4.668543236457049e-06,
"loss": 1.7379,
"step": 801
},
{
"epoch": 1.091156462585034,
"grad_norm": 0.11312340154871599,
"learning_rate": 4.657324749309851e-06,
"loss": 1.7817,
"step": 802
},
{
"epoch": 1.092517006802721,
"grad_norm": 0.11398862476288807,
"learning_rate": 4.646107995126447e-06,
"loss": 1.6113,
"step": 803
},
{
"epoch": 1.0938775510204082,
"grad_norm": 0.11528891846651022,
"learning_rate": 4.634893030631767e-06,
"loss": 1.6745,
"step": 804
},
{
"epoch": 1.0952380952380953,
"grad_norm": 0.11540329784321814,
"learning_rate": 4.623679912541683e-06,
"loss": 1.6443,
"step": 805
},
{
"epoch": 1.0965986394557823,
"grad_norm": 0.11457602895195093,
"learning_rate": 4.612468697562741e-06,
"loss": 1.5109,
"step": 806
},
{
"epoch": 1.0979591836734695,
"grad_norm": 0.108466968677861,
"learning_rate": 4.6012594423918505e-06,
"loss": 1.6285,
"step": 807
},
{
"epoch": 1.0993197278911564,
"grad_norm": 0.11392368141416775,
"learning_rate": 4.5900522037160205e-06,
"loss": 1.524,
"step": 808
},
{
"epoch": 1.1006802721088436,
"grad_norm": 0.10657614384924384,
"learning_rate": 4.578847038212052e-06,
"loss": 1.7741,
"step": 809
},
{
"epoch": 1.1020408163265305,
"grad_norm": 0.11158460621214396,
"learning_rate": 4.567644002546273e-06,
"loss": 1.6648,
"step": 810
},
{
"epoch": 1.1034013605442177,
"grad_norm": 0.11249423664710494,
"learning_rate": 4.556443153374229e-06,
"loss": 1.5484,
"step": 811
},
{
"epoch": 1.1047619047619048,
"grad_norm": 0.18597431248705018,
"learning_rate": 4.5452445473404175e-06,
"loss": 1.4591,
"step": 812
},
{
"epoch": 1.1061224489795918,
"grad_norm": 0.1146701110937394,
"learning_rate": 4.534048241077987e-06,
"loss": 1.7267,
"step": 813
},
{
"epoch": 1.107482993197279,
"grad_norm": 0.10820118015434177,
"learning_rate": 4.522854291208458e-06,
"loss": 1.7739,
"step": 814
},
{
"epoch": 1.1088435374149659,
"grad_norm": 0.09878497379061389,
"learning_rate": 4.511662754341433e-06,
"loss": 1.6488,
"step": 815
},
{
"epoch": 1.110204081632653,
"grad_norm": 0.12970442140488536,
"learning_rate": 4.50047368707431e-06,
"loss": 1.4903,
"step": 816
},
{
"epoch": 1.1115646258503402,
"grad_norm": 0.10382954289821261,
"learning_rate": 4.489287145992002e-06,
"loss": 1.8014,
"step": 817
},
{
"epoch": 1.1129251700680272,
"grad_norm": 0.12667728251125698,
"learning_rate": 4.478103187666642e-06,
"loss": 1.6478,
"step": 818
},
{
"epoch": 1.1142857142857143,
"grad_norm": 0.13500564587776376,
"learning_rate": 4.4669218686573065e-06,
"loss": 1.6239,
"step": 819
},
{
"epoch": 1.1156462585034013,
"grad_norm": 0.1244506846499162,
"learning_rate": 4.45574324550972e-06,
"loss": 1.718,
"step": 820
},
{
"epoch": 1.1170068027210884,
"grad_norm": 0.11873853458664813,
"learning_rate": 4.444567374755978e-06,
"loss": 1.7092,
"step": 821
},
{
"epoch": 1.1183673469387756,
"grad_norm": 0.12208418986043022,
"learning_rate": 4.433394312914253e-06,
"loss": 1.7201,
"step": 822
},
{
"epoch": 1.1197278911564625,
"grad_norm": 0.13523682008047516,
"learning_rate": 4.4222241164885114e-06,
"loss": 1.6248,
"step": 823
},
{
"epoch": 1.1210884353741497,
"grad_norm": 0.11246329856395026,
"learning_rate": 4.411056841968236e-06,
"loss": 1.5008,
"step": 824
},
{
"epoch": 1.1224489795918366,
"grad_norm": 0.12873507179822666,
"learning_rate": 4.3998925458281225e-06,
"loss": 1.6518,
"step": 825
},
{
"epoch": 1.1238095238095238,
"grad_norm": 0.13059910987895082,
"learning_rate": 4.388731284527816e-06,
"loss": 1.6547,
"step": 826
},
{
"epoch": 1.125170068027211,
"grad_norm": 0.13485735751834116,
"learning_rate": 4.377573114511602e-06,
"loss": 1.5989,
"step": 827
},
{
"epoch": 1.126530612244898,
"grad_norm": 0.12101663226880303,
"learning_rate": 4.366418092208144e-06,
"loss": 1.6142,
"step": 828
},
{
"epoch": 1.127891156462585,
"grad_norm": 0.11607507412451191,
"learning_rate": 4.355266274030177e-06,
"loss": 1.6316,
"step": 829
},
{
"epoch": 1.129251700680272,
"grad_norm": 0.10911266136424244,
"learning_rate": 4.344117716374241e-06,
"loss": 1.5342,
"step": 830
},
{
"epoch": 1.1306122448979592,
"grad_norm": 0.12731437170330562,
"learning_rate": 4.332972475620381e-06,
"loss": 1.6973,
"step": 831
},
{
"epoch": 1.1319727891156464,
"grad_norm": 0.09762084699859862,
"learning_rate": 4.321830608131872e-06,
"loss": 1.6633,
"step": 832
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.12978891884133562,
"learning_rate": 4.310692170254927e-06,
"loss": 1.6098,
"step": 833
},
{
"epoch": 1.1346938775510205,
"grad_norm": 0.12415908074295716,
"learning_rate": 4.299557218318413e-06,
"loss": 1.6307,
"step": 834
},
{
"epoch": 1.1360544217687074,
"grad_norm": 0.2173948925361782,
"learning_rate": 4.2884258086335755e-06,
"loss": 1.5907,
"step": 835
},
{
"epoch": 1.1374149659863946,
"grad_norm": 0.11912082507711948,
"learning_rate": 4.277297997493737e-06,
"loss": 1.5837,
"step": 836
},
{
"epoch": 1.1387755102040815,
"grad_norm": 0.19450908692643373,
"learning_rate": 4.266173841174031e-06,
"loss": 1.7324,
"step": 837
},
{
"epoch": 1.1401360544217687,
"grad_norm": 0.11032655579441553,
"learning_rate": 4.255053395931097e-06,
"loss": 1.7134,
"step": 838
},
{
"epoch": 1.1414965986394559,
"grad_norm": 0.10178083734022528,
"learning_rate": 4.243936718002818e-06,
"loss": 1.6472,
"step": 839
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.10102775840788815,
"learning_rate": 4.23282386360802e-06,
"loss": 1.6893,
"step": 840
},
{
"epoch": 1.14421768707483,
"grad_norm": 0.13340734842779356,
"learning_rate": 4.22171488894619e-06,
"loss": 1.582,
"step": 841
},
{
"epoch": 1.1455782312925171,
"grad_norm": 0.39827971211004304,
"learning_rate": 4.2106098501972e-06,
"loss": 1.5918,
"step": 842
},
{
"epoch": 1.146938775510204,
"grad_norm": 0.12809647826224843,
"learning_rate": 4.1995088035210126e-06,
"loss": 1.6786,
"step": 843
},
{
"epoch": 1.1482993197278912,
"grad_norm": 0.11881853753159395,
"learning_rate": 4.1884118050574084e-06,
"loss": 1.6218,
"step": 844
},
{
"epoch": 1.1496598639455782,
"grad_norm": 0.13188862048286334,
"learning_rate": 4.177318910925686e-06,
"loss": 1.6943,
"step": 845
},
{
"epoch": 1.1510204081632653,
"grad_norm": 0.11973362482121355,
"learning_rate": 4.1662301772244e-06,
"loss": 1.3518,
"step": 846
},
{
"epoch": 1.1523809523809523,
"grad_norm": 0.1265246682564999,
"learning_rate": 4.15514566003105e-06,
"loss": 1.717,
"step": 847
},
{
"epoch": 1.1537414965986394,
"grad_norm": 0.12163131262108975,
"learning_rate": 4.144065415401825e-06,
"loss": 1.6591,
"step": 848
},
{
"epoch": 1.1551020408163266,
"grad_norm": 0.11281334037077397,
"learning_rate": 4.132989499371303e-06,
"loss": 1.7488,
"step": 849
},
{
"epoch": 1.1564625850340136,
"grad_norm": 0.116733746797477,
"learning_rate": 4.1219179679521675e-06,
"loss": 1.4784,
"step": 850
},
{
"epoch": 1.1578231292517007,
"grad_norm": 0.10824656908525278,
"learning_rate": 4.110850877134935e-06,
"loss": 1.6377,
"step": 851
},
{
"epoch": 1.1591836734693877,
"grad_norm": 0.13121005190396953,
"learning_rate": 4.099788282887658e-06,
"loss": 1.7325,
"step": 852
},
{
"epoch": 1.1605442176870748,
"grad_norm": 0.10756631934140032,
"learning_rate": 4.088730241155657e-06,
"loss": 1.7462,
"step": 853
},
{
"epoch": 1.161904761904762,
"grad_norm": 0.17347249647835658,
"learning_rate": 4.077676807861221e-06,
"loss": 1.4988,
"step": 854
},
{
"epoch": 1.163265306122449,
"grad_norm": 0.10923737210751697,
"learning_rate": 4.066628038903341e-06,
"loss": 1.753,
"step": 855
},
{
"epoch": 1.164625850340136,
"grad_norm": 0.12401952453107581,
"learning_rate": 4.055583990157416e-06,
"loss": 1.6061,
"step": 856
},
{
"epoch": 1.165986394557823,
"grad_norm": 0.16613834484616688,
"learning_rate": 4.044544717474974e-06,
"loss": 1.6004,
"step": 857
},
{
"epoch": 1.1673469387755102,
"grad_norm": 0.10830889852190581,
"learning_rate": 4.033510276683392e-06,
"loss": 1.8508,
"step": 858
},
{
"epoch": 1.1687074829931974,
"grad_norm": 0.11570096470891132,
"learning_rate": 4.022480723585608e-06,
"loss": 1.7405,
"step": 859
},
{
"epoch": 1.1700680272108843,
"grad_norm": 0.13085816560126146,
"learning_rate": 4.011456113959845e-06,
"loss": 1.6882,
"step": 860
},
{
"epoch": 1.1714285714285715,
"grad_norm": 0.09457110643687838,
"learning_rate": 4.000436503559324e-06,
"loss": 1.7382,
"step": 861
},
{
"epoch": 1.1727891156462584,
"grad_norm": 0.11447572869844899,
"learning_rate": 3.989421948111987e-06,
"loss": 1.7447,
"step": 862
},
{
"epoch": 1.1741496598639456,
"grad_norm": 0.10219706136235482,
"learning_rate": 3.978412503320207e-06,
"loss": 1.6079,
"step": 863
},
{
"epoch": 1.1755102040816325,
"grad_norm": 0.09697745892488446,
"learning_rate": 3.967408224860518e-06,
"loss": 1.7658,
"step": 864
},
{
"epoch": 1.1768707482993197,
"grad_norm": 0.11935090111442251,
"learning_rate": 3.956409168383325e-06,
"loss": 1.642,
"step": 865
},
{
"epoch": 1.1782312925170069,
"grad_norm": 0.09593894653017256,
"learning_rate": 3.94541538951262e-06,
"loss": 1.6837,
"step": 866
},
{
"epoch": 1.1795918367346938,
"grad_norm": 0.17924060912455603,
"learning_rate": 3.934426943845712e-06,
"loss": 1.7668,
"step": 867
},
{
"epoch": 1.180952380952381,
"grad_norm": 0.11737602024296209,
"learning_rate": 3.923443886952934e-06,
"loss": 1.5348,
"step": 868
},
{
"epoch": 1.1823129251700681,
"grad_norm": 0.09917281103783915,
"learning_rate": 3.912466274377371e-06,
"loss": 1.6659,
"step": 869
},
{
"epoch": 1.183673469387755,
"grad_norm": 0.10815570929741564,
"learning_rate": 3.901494161634571e-06,
"loss": 1.5029,
"step": 870
},
{
"epoch": 1.1850340136054422,
"grad_norm": 0.09777863761181164,
"learning_rate": 3.890527604212273e-06,
"loss": 1.7809,
"step": 871
},
{
"epoch": 1.1863945578231292,
"grad_norm": 0.12446664048031521,
"learning_rate": 3.879566657570118e-06,
"loss": 1.6918,
"step": 872
},
{
"epoch": 1.1877551020408164,
"grad_norm": 0.11809808630659835,
"learning_rate": 3.868611377139375e-06,
"loss": 1.5293,
"step": 873
},
{
"epoch": 1.1891156462585033,
"grad_norm": 0.11626193699249888,
"learning_rate": 3.857661818322657e-06,
"loss": 1.5724,
"step": 874
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.10675553850073687,
"learning_rate": 3.846718036493642e-06,
"loss": 1.7026,
"step": 875
},
{
"epoch": 1.1918367346938776,
"grad_norm": 0.25785039953724753,
"learning_rate": 3.835780086996794e-06,
"loss": 1.816,
"step": 876
},
{
"epoch": 1.1931972789115646,
"grad_norm": 0.09919492633891115,
"learning_rate": 3.824848025147078e-06,
"loss": 1.8633,
"step": 877
},
{
"epoch": 1.1945578231292517,
"grad_norm": 0.13021628412390385,
"learning_rate": 3.81392190622969e-06,
"loss": 1.6292,
"step": 878
},
{
"epoch": 1.1959183673469387,
"grad_norm": 0.1063338899612945,
"learning_rate": 3.8030017854997654e-06,
"loss": 1.7619,
"step": 879
},
{
"epoch": 1.1972789115646258,
"grad_norm": 0.1400157862890337,
"learning_rate": 3.7920877181821136e-06,
"loss": 1.5179,
"step": 880
},
{
"epoch": 1.198639455782313,
"grad_norm": 0.1386445404000312,
"learning_rate": 3.781179759470921e-06,
"loss": 1.6624,
"step": 881
},
{
"epoch": 1.2,
"grad_norm": 0.1069092489728771,
"learning_rate": 3.7702779645294907e-06,
"loss": 1.8113,
"step": 882
},
{
"epoch": 1.2013605442176871,
"grad_norm": 0.1190549983688599,
"learning_rate": 3.759382388489952e-06,
"loss": 1.6425,
"step": 883
},
{
"epoch": 1.202721088435374,
"grad_norm": 0.11079642382956639,
"learning_rate": 3.74849308645298e-06,
"loss": 1.6609,
"step": 884
},
{
"epoch": 1.2040816326530612,
"grad_norm": 0.14105617751955826,
"learning_rate": 3.7376101134875278e-06,
"loss": 1.55,
"step": 885
},
{
"epoch": 1.2054421768707484,
"grad_norm": 0.14448379899642935,
"learning_rate": 3.7267335246305346e-06,
"loss": 1.778,
"step": 886
},
{
"epoch": 1.2068027210884353,
"grad_norm": 0.10874354603342407,
"learning_rate": 3.715863374886661e-06,
"loss": 1.6611,
"step": 887
},
{
"epoch": 1.2081632653061225,
"grad_norm": 0.10802721342705796,
"learning_rate": 3.7049997192279976e-06,
"loss": 1.5966,
"step": 888
},
{
"epoch": 1.2095238095238094,
"grad_norm": 0.12233010145027158,
"learning_rate": 3.6941426125937992e-06,
"loss": 1.5311,
"step": 889
},
{
"epoch": 1.2108843537414966,
"grad_norm": 0.10009553518935768,
"learning_rate": 3.6832921098901952e-06,
"loss": 1.5145,
"step": 890
},
{
"epoch": 1.2122448979591836,
"grad_norm": 0.11901267626986793,
"learning_rate": 3.6724482659899226e-06,
"loss": 1.7466,
"step": 891
},
{
"epoch": 1.2136054421768707,
"grad_norm": 0.12962797644138863,
"learning_rate": 3.661611135732043e-06,
"loss": 1.5964,
"step": 892
},
{
"epoch": 1.2149659863945579,
"grad_norm": 0.10733667423090783,
"learning_rate": 3.6507807739216628e-06,
"loss": 1.7763,
"step": 893
},
{
"epoch": 1.2163265306122448,
"grad_norm": 0.11144056418847463,
"learning_rate": 3.6399572353296642e-06,
"loss": 1.7047,
"step": 894
},
{
"epoch": 1.217687074829932,
"grad_norm": 0.08875006973132216,
"learning_rate": 3.6291405746924186e-06,
"loss": 1.7604,
"step": 895
},
{
"epoch": 1.2190476190476192,
"grad_norm": 0.13024552194401284,
"learning_rate": 3.6183308467115174e-06,
"loss": 1.6061,
"step": 896
},
{
"epoch": 1.220408163265306,
"grad_norm": 0.12857972357080566,
"learning_rate": 3.6075281060534917e-06,
"loss": 1.6149,
"step": 897
},
{
"epoch": 1.2217687074829933,
"grad_norm": 0.11603998479755155,
"learning_rate": 3.5967324073495363e-06,
"loss": 1.6111,
"step": 898
},
{
"epoch": 1.2231292517006802,
"grad_norm": 0.11178507529251955,
"learning_rate": 3.585943805195232e-06,
"loss": 1.6971,
"step": 899
},
{
"epoch": 1.2244897959183674,
"grad_norm": 0.14802404367937794,
"learning_rate": 3.575162354150276e-06,
"loss": 1.7452,
"step": 900
},
{
"epoch": 1.2258503401360543,
"grad_norm": 0.09565639349194041,
"learning_rate": 3.5643881087381983e-06,
"loss": 1.7094,
"step": 901
},
{
"epoch": 1.2272108843537415,
"grad_norm": 0.12259182420540694,
"learning_rate": 3.553621123446087e-06,
"loss": 1.4945,
"step": 902
},
{
"epoch": 1.2285714285714286,
"grad_norm": 0.09891155424272953,
"learning_rate": 3.542861452724318e-06,
"loss": 1.7481,
"step": 903
},
{
"epoch": 1.2299319727891156,
"grad_norm": 0.12577917367601982,
"learning_rate": 3.5321091509862733e-06,
"loss": 1.658,
"step": 904
},
{
"epoch": 1.2312925170068028,
"grad_norm": 0.11110224185718393,
"learning_rate": 3.521364272608071e-06,
"loss": 1.7805,
"step": 905
},
{
"epoch": 1.2326530612244897,
"grad_norm": 1.029133559209144,
"learning_rate": 3.5106268719282863e-06,
"loss": 1.6974,
"step": 906
},
{
"epoch": 1.2340136054421769,
"grad_norm": 0.11635973055276655,
"learning_rate": 3.499897003247682e-06,
"loss": 1.6067,
"step": 907
},
{
"epoch": 1.235374149659864,
"grad_norm": 0.1374434820581917,
"learning_rate": 3.489174720828924e-06,
"loss": 1.4329,
"step": 908
},
{
"epoch": 1.236734693877551,
"grad_norm": 0.12005946386037955,
"learning_rate": 3.4784600788963197e-06,
"loss": 1.6061,
"step": 909
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.2566914570489289,
"learning_rate": 3.4677531316355343e-06,
"loss": 1.6285,
"step": 910
},
{
"epoch": 1.239455782312925,
"grad_norm": 0.12805153195567712,
"learning_rate": 3.4570539331933196e-06,
"loss": 1.6518,
"step": 911
},
{
"epoch": 1.2408163265306122,
"grad_norm": 0.12193109943671782,
"learning_rate": 3.4463625376772415e-06,
"loss": 1.7769,
"step": 912
},
{
"epoch": 1.2421768707482994,
"grad_norm": 0.11785291933334519,
"learning_rate": 3.4356789991554036e-06,
"loss": 1.7037,
"step": 913
},
{
"epoch": 1.2435374149659864,
"grad_norm": 0.13098314516857928,
"learning_rate": 3.425003371656178e-06,
"loss": 1.6332,
"step": 914
},
{
"epoch": 1.2448979591836735,
"grad_norm": 0.11058534722726451,
"learning_rate": 3.4143357091679276e-06,
"loss": 1.8928,
"step": 915
},
{
"epoch": 1.2462585034013605,
"grad_norm": 0.1304628400810422,
"learning_rate": 3.403676065638735e-06,
"loss": 1.5842,
"step": 916
},
{
"epoch": 1.2476190476190476,
"grad_norm": 0.12141593155806699,
"learning_rate": 3.393024494976128e-06,
"loss": 1.6872,
"step": 917
},
{
"epoch": 1.2489795918367346,
"grad_norm": 0.1180901086883869,
"learning_rate": 3.3823810510468146e-06,
"loss": 1.4999,
"step": 918
},
{
"epoch": 1.2503401360544217,
"grad_norm": 0.1426231115449843,
"learning_rate": 3.3717457876763994e-06,
"loss": 1.7262,
"step": 919
},
{
"epoch": 1.251700680272109,
"grad_norm": 0.11420848866165159,
"learning_rate": 3.361118758649116e-06,
"loss": 1.617,
"step": 920
},
{
"epoch": 1.251700680272109,
"eval_loss": 1.6881320476531982,
"eval_runtime": 76.6095,
"eval_samples_per_second": 53.166,
"eval_steps_per_second": 6.657,
"step": 920
}
],
"logging_steps": 1,
"max_steps": 1470,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 184,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.289148227243213e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}