{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.993417721518987,
"eval_steps": 500,
"global_step": 987,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030379746835443038,
"grad_norm": 0.7371621381648503,
"learning_rate": 1.0101010101010103e-07,
"loss": 1.2715,
"step": 1
},
{
"epoch": 0.0060759493670886075,
"grad_norm": 0.7121243856267792,
"learning_rate": 2.0202020202020205e-07,
"loss": 1.2093,
"step": 2
},
{
"epoch": 0.009113924050632912,
"grad_norm": 0.6895641068732327,
"learning_rate": 3.0303030303030305e-07,
"loss": 1.1612,
"step": 3
},
{
"epoch": 0.012151898734177215,
"grad_norm": 0.7547606851238392,
"learning_rate": 4.040404040404041e-07,
"loss": 1.2536,
"step": 4
},
{
"epoch": 0.015189873417721518,
"grad_norm": 0.7458825481381227,
"learning_rate": 5.05050505050505e-07,
"loss": 1.2661,
"step": 5
},
{
"epoch": 0.018227848101265823,
"grad_norm": 0.714548498437573,
"learning_rate": 6.060606060606061e-07,
"loss": 1.2054,
"step": 6
},
{
"epoch": 0.021265822784810127,
"grad_norm": 0.793863863197656,
"learning_rate": 7.070707070707071e-07,
"loss": 1.2581,
"step": 7
},
{
"epoch": 0.02430379746835443,
"grad_norm": 0.7243675712297406,
"learning_rate": 8.080808080808082e-07,
"loss": 1.2837,
"step": 8
},
{
"epoch": 0.027341772151898733,
"grad_norm": 0.6869719885583234,
"learning_rate": 9.090909090909091e-07,
"loss": 1.1936,
"step": 9
},
{
"epoch": 0.030379746835443037,
"grad_norm": 0.7740032140637244,
"learning_rate": 1.01010101010101e-06,
"loss": 1.2754,
"step": 10
},
{
"epoch": 0.033417721518987344,
"grad_norm": 0.6974671694839412,
"learning_rate": 1.111111111111111e-06,
"loss": 1.2239,
"step": 11
},
{
"epoch": 0.03645569620253165,
"grad_norm": 0.7177684057673752,
"learning_rate": 1.2121212121212122e-06,
"loss": 1.2018,
"step": 12
},
{
"epoch": 0.03949367088607595,
"grad_norm": 0.6643882501789584,
"learning_rate": 1.3131313131313134e-06,
"loss": 1.1589,
"step": 13
},
{
"epoch": 0.042531645569620254,
"grad_norm": 0.5987712476363073,
"learning_rate": 1.4141414141414143e-06,
"loss": 1.161,
"step": 14
},
{
"epoch": 0.04556962025316456,
"grad_norm": 0.4443403364468464,
"learning_rate": 1.5151515151515152e-06,
"loss": 1.0032,
"step": 15
},
{
"epoch": 0.04860759493670886,
"grad_norm": 0.4588586719890799,
"learning_rate": 1.6161616161616164e-06,
"loss": 1.031,
"step": 16
},
{
"epoch": 0.051645569620253164,
"grad_norm": 0.4586292669176633,
"learning_rate": 1.7171717171717173e-06,
"loss": 1.0449,
"step": 17
},
{
"epoch": 0.05468354430379747,
"grad_norm": 0.45980959938717203,
"learning_rate": 1.8181818181818183e-06,
"loss": 1.0574,
"step": 18
},
{
"epoch": 0.05772151898734177,
"grad_norm": 0.38993134077596253,
"learning_rate": 1.9191919191919192e-06,
"loss": 1.0204,
"step": 19
},
{
"epoch": 0.060759493670886074,
"grad_norm": 0.28486435349906397,
"learning_rate": 2.02020202020202e-06,
"loss": 0.9535,
"step": 20
},
{
"epoch": 0.06379746835443038,
"grad_norm": 0.3428358025619933,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.9261,
"step": 21
},
{
"epoch": 0.06683544303797469,
"grad_norm": 0.3956603548338995,
"learning_rate": 2.222222222222222e-06,
"loss": 0.9615,
"step": 22
},
{
"epoch": 0.06987341772151899,
"grad_norm": 0.3420435076099625,
"learning_rate": 2.3232323232323234e-06,
"loss": 0.9009,
"step": 23
},
{
"epoch": 0.0729113924050633,
"grad_norm": 0.344749396549744,
"learning_rate": 2.4242424242424244e-06,
"loss": 1.0114,
"step": 24
},
{
"epoch": 0.0759493670886076,
"grad_norm": 0.3499048703598331,
"learning_rate": 2.5252525252525258e-06,
"loss": 0.8631,
"step": 25
},
{
"epoch": 0.0789873417721519,
"grad_norm": 0.3509790956435934,
"learning_rate": 2.6262626262626267e-06,
"loss": 0.8845,
"step": 26
},
{
"epoch": 0.0820253164556962,
"grad_norm": 0.32805162453584236,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.9578,
"step": 27
},
{
"epoch": 0.08506329113924051,
"grad_norm": 0.28390672295466896,
"learning_rate": 2.8282828282828286e-06,
"loss": 0.9361,
"step": 28
},
{
"epoch": 0.08810126582278481,
"grad_norm": 0.3332420487428435,
"learning_rate": 2.9292929292929295e-06,
"loss": 0.9165,
"step": 29
},
{
"epoch": 0.09113924050632911,
"grad_norm": 0.3403684512401233,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.8838,
"step": 30
},
{
"epoch": 0.09417721518987342,
"grad_norm": 0.3400158932791219,
"learning_rate": 3.131313131313132e-06,
"loss": 0.9255,
"step": 31
},
{
"epoch": 0.09721518987341772,
"grad_norm": 0.3548904001791882,
"learning_rate": 3.232323232323233e-06,
"loss": 0.9086,
"step": 32
},
{
"epoch": 0.10025316455696202,
"grad_norm": 0.3668768693049186,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.9455,
"step": 33
},
{
"epoch": 0.10329113924050633,
"grad_norm": 0.2927732263854858,
"learning_rate": 3.4343434343434347e-06,
"loss": 0.883,
"step": 34
},
{
"epoch": 0.10632911392405063,
"grad_norm": 0.2973135953536413,
"learning_rate": 3.5353535353535356e-06,
"loss": 0.9051,
"step": 35
},
{
"epoch": 0.10936708860759493,
"grad_norm": 0.2708836297442042,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.8701,
"step": 36
},
{
"epoch": 0.11240506329113924,
"grad_norm": 0.22016073179409937,
"learning_rate": 3.737373737373738e-06,
"loss": 0.8791,
"step": 37
},
{
"epoch": 0.11544303797468354,
"grad_norm": 0.21138154937295342,
"learning_rate": 3.8383838383838385e-06,
"loss": 0.897,
"step": 38
},
{
"epoch": 0.11848101265822784,
"grad_norm": 0.19856129432930908,
"learning_rate": 3.93939393939394e-06,
"loss": 0.8636,
"step": 39
},
{
"epoch": 0.12151898734177215,
"grad_norm": 0.20801873378861047,
"learning_rate": 4.04040404040404e-06,
"loss": 0.8969,
"step": 40
},
{
"epoch": 0.12455696202531645,
"grad_norm": 0.23001046959007862,
"learning_rate": 4.141414141414142e-06,
"loss": 0.7474,
"step": 41
},
{
"epoch": 0.12759493670886077,
"grad_norm": 0.1876660825037611,
"learning_rate": 4.242424242424243e-06,
"loss": 0.8376,
"step": 42
},
{
"epoch": 0.13063291139240507,
"grad_norm": 0.20820658861653177,
"learning_rate": 4.343434343434344e-06,
"loss": 0.8653,
"step": 43
},
{
"epoch": 0.13367088607594937,
"grad_norm": 0.19688388810132487,
"learning_rate": 4.444444444444444e-06,
"loss": 0.7882,
"step": 44
},
{
"epoch": 0.13670886075949368,
"grad_norm": 0.1976424340452257,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.8823,
"step": 45
},
{
"epoch": 0.13974683544303798,
"grad_norm": 0.21760560999512202,
"learning_rate": 4.646464646464647e-06,
"loss": 0.8675,
"step": 46
},
{
"epoch": 0.14278481012658228,
"grad_norm": 0.19046598302412762,
"learning_rate": 4.747474747474748e-06,
"loss": 0.8358,
"step": 47
},
{
"epoch": 0.1458227848101266,
"grad_norm": 0.17689714015085062,
"learning_rate": 4.848484848484849e-06,
"loss": 0.8077,
"step": 48
},
{
"epoch": 0.1488607594936709,
"grad_norm": 0.18328478564193937,
"learning_rate": 4.94949494949495e-06,
"loss": 0.8297,
"step": 49
},
{
"epoch": 0.1518987341772152,
"grad_norm": 0.1774577565120857,
"learning_rate": 5.0505050505050515e-06,
"loss": 0.823,
"step": 50
},
{
"epoch": 0.1549367088607595,
"grad_norm": 0.1703091618116316,
"learning_rate": 5.151515151515152e-06,
"loss": 0.8012,
"step": 51
},
{
"epoch": 0.1579746835443038,
"grad_norm": 0.18450831545374227,
"learning_rate": 5.252525252525253e-06,
"loss": 0.8383,
"step": 52
},
{
"epoch": 0.1610126582278481,
"grad_norm": 0.17118317616212286,
"learning_rate": 5.353535353535354e-06,
"loss": 0.7941,
"step": 53
},
{
"epoch": 0.1640506329113924,
"grad_norm": 0.16625903565725803,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.8171,
"step": 54
},
{
"epoch": 0.1670886075949367,
"grad_norm": 0.185571990325072,
"learning_rate": 5.555555555555557e-06,
"loss": 0.8695,
"step": 55
},
{
"epoch": 0.17012658227848101,
"grad_norm": 0.16307355925570016,
"learning_rate": 5.656565656565657e-06,
"loss": 0.7919,
"step": 56
},
{
"epoch": 0.17316455696202532,
"grad_norm": 0.1628781595142016,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.8181,
"step": 57
},
{
"epoch": 0.17620253164556962,
"grad_norm": 0.15474465013909197,
"learning_rate": 5.858585858585859e-06,
"loss": 0.8249,
"step": 58
},
{
"epoch": 0.17924050632911392,
"grad_norm": 0.1639462614140778,
"learning_rate": 5.95959595959596e-06,
"loss": 0.834,
"step": 59
},
{
"epoch": 0.18227848101265823,
"grad_norm": 0.18542685941142933,
"learning_rate": 6.060606060606061e-06,
"loss": 0.8308,
"step": 60
},
{
"epoch": 0.18531645569620253,
"grad_norm": 0.15537864217455533,
"learning_rate": 6.1616161616161615e-06,
"loss": 0.773,
"step": 61
},
{
"epoch": 0.18835443037974683,
"grad_norm": 0.17436631962110918,
"learning_rate": 6.262626262626264e-06,
"loss": 0.8021,
"step": 62
},
{
"epoch": 0.19139240506329114,
"grad_norm": 0.1752248753112386,
"learning_rate": 6.363636363636364e-06,
"loss": 0.8753,
"step": 63
},
{
"epoch": 0.19443037974683544,
"grad_norm": 0.1657769895142651,
"learning_rate": 6.464646464646466e-06,
"loss": 0.8567,
"step": 64
},
{
"epoch": 0.19746835443037974,
"grad_norm": 0.15913039516725547,
"learning_rate": 6.565656565656566e-06,
"loss": 0.7868,
"step": 65
},
{
"epoch": 0.20050632911392405,
"grad_norm": 0.17115941323518508,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8183,
"step": 66
},
{
"epoch": 0.20354430379746835,
"grad_norm": 0.16253020468901574,
"learning_rate": 6.767676767676769e-06,
"loss": 0.8366,
"step": 67
},
{
"epoch": 0.20658227848101265,
"grad_norm": 0.15683183443144275,
"learning_rate": 6.868686868686869e-06,
"loss": 0.7998,
"step": 68
},
{
"epoch": 0.20962025316455696,
"grad_norm": 0.14324369162375836,
"learning_rate": 6.969696969696971e-06,
"loss": 0.8049,
"step": 69
},
{
"epoch": 0.21265822784810126,
"grad_norm": 0.15382265145278182,
"learning_rate": 7.070707070707071e-06,
"loss": 0.7915,
"step": 70
},
{
"epoch": 0.21569620253164556,
"grad_norm": 0.15335613187813119,
"learning_rate": 7.171717171717172e-06,
"loss": 0.8056,
"step": 71
},
{
"epoch": 0.21873417721518987,
"grad_norm": 0.1537382161073269,
"learning_rate": 7.272727272727273e-06,
"loss": 0.7986,
"step": 72
},
{
"epoch": 0.22177215189873417,
"grad_norm": 0.17379425501074097,
"learning_rate": 7.373737373737374e-06,
"loss": 0.8681,
"step": 73
},
{
"epoch": 0.22481012658227847,
"grad_norm": 0.15843515964361574,
"learning_rate": 7.474747474747476e-06,
"loss": 0.7744,
"step": 74
},
{
"epoch": 0.22784810126582278,
"grad_norm": 0.1751604463972058,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.7857,
"step": 75
},
{
"epoch": 0.23088607594936708,
"grad_norm": 0.15053732125197122,
"learning_rate": 7.676767676767677e-06,
"loss": 0.8095,
"step": 76
},
{
"epoch": 0.23392405063291138,
"grad_norm": 0.1532282564562081,
"learning_rate": 7.77777777777778e-06,
"loss": 0.7742,
"step": 77
},
{
"epoch": 0.2369620253164557,
"grad_norm": 0.16579482428765188,
"learning_rate": 7.87878787878788e-06,
"loss": 0.7522,
"step": 78
},
{
"epoch": 0.24,
"grad_norm": 0.15723235084773576,
"learning_rate": 7.97979797979798e-06,
"loss": 0.8157,
"step": 79
},
{
"epoch": 0.2430379746835443,
"grad_norm": 0.14591068220696474,
"learning_rate": 8.08080808080808e-06,
"loss": 0.8117,
"step": 80
},
{
"epoch": 0.2460759493670886,
"grad_norm": 0.1506364803840211,
"learning_rate": 8.181818181818183e-06,
"loss": 0.7671,
"step": 81
},
{
"epoch": 0.2491139240506329,
"grad_norm": 0.172379076159448,
"learning_rate": 8.282828282828283e-06,
"loss": 0.8397,
"step": 82
},
{
"epoch": 0.2521518987341772,
"grad_norm": 0.15429561913459985,
"learning_rate": 8.383838383838384e-06,
"loss": 0.7627,
"step": 83
},
{
"epoch": 0.25518987341772154,
"grad_norm": 0.16409881360689876,
"learning_rate": 8.484848484848486e-06,
"loss": 0.8123,
"step": 84
},
{
"epoch": 0.2582278481012658,
"grad_norm": 0.15262847761041756,
"learning_rate": 8.585858585858587e-06,
"loss": 0.7831,
"step": 85
},
{
"epoch": 0.26126582278481014,
"grad_norm": 0.14775969810777986,
"learning_rate": 8.686868686868687e-06,
"loss": 0.7792,
"step": 86
},
{
"epoch": 0.2643037974683544,
"grad_norm": 0.16227868746752167,
"learning_rate": 8.787878787878788e-06,
"loss": 0.7481,
"step": 87
},
{
"epoch": 0.26734177215189875,
"grad_norm": 0.14781371340131963,
"learning_rate": 8.888888888888888e-06,
"loss": 0.7692,
"step": 88
},
{
"epoch": 0.270379746835443,
"grad_norm": 0.15246159049852878,
"learning_rate": 8.98989898989899e-06,
"loss": 0.7777,
"step": 89
},
{
"epoch": 0.27341772151898736,
"grad_norm": 0.15261383672120973,
"learning_rate": 9.090909090909091e-06,
"loss": 0.7863,
"step": 90
},
{
"epoch": 0.27645569620253163,
"grad_norm": 0.1555370312321936,
"learning_rate": 9.191919191919193e-06,
"loss": 0.7787,
"step": 91
},
{
"epoch": 0.27949367088607596,
"grad_norm": 0.15449424911977483,
"learning_rate": 9.292929292929294e-06,
"loss": 0.7489,
"step": 92
},
{
"epoch": 0.28253164556962024,
"grad_norm": 0.1517952399280153,
"learning_rate": 9.393939393939396e-06,
"loss": 0.7824,
"step": 93
},
{
"epoch": 0.28556962025316457,
"grad_norm": 0.14848789600098466,
"learning_rate": 9.494949494949497e-06,
"loss": 0.7366,
"step": 94
},
{
"epoch": 0.28860759493670884,
"grad_norm": 0.1578591812892815,
"learning_rate": 9.595959595959597e-06,
"loss": 0.7596,
"step": 95
},
{
"epoch": 0.2916455696202532,
"grad_norm": 0.15400695563654634,
"learning_rate": 9.696969696969698e-06,
"loss": 0.7449,
"step": 96
},
{
"epoch": 0.29468354430379745,
"grad_norm": 0.1622217545636064,
"learning_rate": 9.797979797979798e-06,
"loss": 0.7852,
"step": 97
},
{
"epoch": 0.2977215189873418,
"grad_norm": 0.1571414989310096,
"learning_rate": 9.8989898989899e-06,
"loss": 0.798,
"step": 98
},
{
"epoch": 0.30075949367088606,
"grad_norm": 0.15369871304753976,
"learning_rate": 1e-05,
"loss": 0.8095,
"step": 99
},
{
"epoch": 0.3037974683544304,
"grad_norm": 0.1560154961387438,
"learning_rate": 9.999968709437563e-06,
"loss": 0.7465,
"step": 100
},
{
"epoch": 0.30683544303797466,
"grad_norm": 0.15573381688358523,
"learning_rate": 9.999874838141888e-06,
"loss": 0.774,
"step": 101
},
{
"epoch": 0.309873417721519,
"grad_norm": 0.15879548729475618,
"learning_rate": 9.999718387287891e-06,
"loss": 0.7867,
"step": 102
},
{
"epoch": 0.31291139240506327,
"grad_norm": 0.1362205521765718,
"learning_rate": 9.999499358833745e-06,
"loss": 0.7392,
"step": 103
},
{
"epoch": 0.3159493670886076,
"grad_norm": 0.1554346438640984,
"learning_rate": 9.99921775552086e-06,
"loss": 0.7829,
"step": 104
},
{
"epoch": 0.3189873417721519,
"grad_norm": 0.1504818128917585,
"learning_rate": 9.998873580873848e-06,
"loss": 0.7264,
"step": 105
},
{
"epoch": 0.3220253164556962,
"grad_norm": 0.16944580979135326,
"learning_rate": 9.998466839200474e-06,
"loss": 0.7611,
"step": 106
},
{
"epoch": 0.3250632911392405,
"grad_norm": 0.15892279236814086,
"learning_rate": 9.99799753559161e-06,
"loss": 0.805,
"step": 107
},
{
"epoch": 0.3281012658227848,
"grad_norm": 0.14533324182381482,
"learning_rate": 9.997465675921163e-06,
"loss": 0.7302,
"step": 108
},
{
"epoch": 0.3311392405063291,
"grad_norm": 0.1593407263801221,
"learning_rate": 9.99687126684601e-06,
"loss": 0.7783,
"step": 109
},
{
"epoch": 0.3341772151898734,
"grad_norm": 0.15734266679164916,
"learning_rate": 9.99621431580591e-06,
"loss": 0.7519,
"step": 110
},
{
"epoch": 0.3372151898734177,
"grad_norm": 0.15737748293585987,
"learning_rate": 9.99549483102341e-06,
"loss": 0.7683,
"step": 111
},
{
"epoch": 0.34025316455696203,
"grad_norm": 0.16682595959306512,
"learning_rate": 9.994712821503737e-06,
"loss": 0.7963,
"step": 112
},
{
"epoch": 0.3432911392405063,
"grad_norm": 0.1437250564900874,
"learning_rate": 9.993868297034709e-06,
"loss": 0.7697,
"step": 113
},
{
"epoch": 0.34632911392405064,
"grad_norm": 0.15783291002114871,
"learning_rate": 9.992961268186575e-06,
"loss": 0.7735,
"step": 114
},
{
"epoch": 0.3493670886075949,
"grad_norm": 0.1609814436777924,
"learning_rate": 9.991991746311916e-06,
"loss": 0.7612,
"step": 115
},
{
"epoch": 0.35240506329113924,
"grad_norm": 0.15950583014151243,
"learning_rate": 9.990959743545487e-06,
"loss": 0.7478,
"step": 116
},
{
"epoch": 0.3554430379746835,
"grad_norm": 0.15464679834142525,
"learning_rate": 9.989865272804064e-06,
"loss": 0.739,
"step": 117
},
{
"epoch": 0.35848101265822785,
"grad_norm": 0.14458722845111444,
"learning_rate": 9.988708347786288e-06,
"loss": 0.7105,
"step": 118
},
{
"epoch": 0.3615189873417721,
"grad_norm": 0.15076065245614487,
"learning_rate": 9.9874889829725e-06,
"loss": 0.7009,
"step": 119
},
{
"epoch": 0.36455696202531646,
"grad_norm": 0.148555033903784,
"learning_rate": 9.986207193624537e-06,
"loss": 0.7433,
"step": 120
},
{
"epoch": 0.3675949367088608,
"grad_norm": 0.16560295407333225,
"learning_rate": 9.984862995785564e-06,
"loss": 0.8137,
"step": 121
},
{
"epoch": 0.37063291139240506,
"grad_norm": 0.14662078483089422,
"learning_rate": 9.983456406279866e-06,
"loss": 0.7226,
"step": 122
},
{
"epoch": 0.3736708860759494,
"grad_norm": 0.1634791948015343,
"learning_rate": 9.981987442712634e-06,
"loss": 0.7661,
"step": 123
},
{
"epoch": 0.37670886075949367,
"grad_norm": 0.14887460537724473,
"learning_rate": 9.980456123469743e-06,
"loss": 0.7335,
"step": 124
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.15631938007232998,
"learning_rate": 9.978862467717532e-06,
"loss": 0.798,
"step": 125
},
{
"epoch": 0.3827848101265823,
"grad_norm": 0.15378166270358495,
"learning_rate": 9.977206495402554e-06,
"loss": 0.7716,
"step": 126
},
{
"epoch": 0.3858227848101266,
"grad_norm": 0.15944325856222438,
"learning_rate": 9.97548822725133e-06,
"loss": 0.8056,
"step": 127
},
{
"epoch": 0.3888607594936709,
"grad_norm": 0.15806585106656948,
"learning_rate": 9.973707684770095e-06,
"loss": 0.7598,
"step": 128
},
{
"epoch": 0.3918987341772152,
"grad_norm": 0.15876456210482656,
"learning_rate": 9.971864890244514e-06,
"loss": 0.7618,
"step": 129
},
{
"epoch": 0.3949367088607595,
"grad_norm": 0.15077324094201847,
"learning_rate": 9.96995986673942e-06,
"loss": 0.7586,
"step": 130
},
{
"epoch": 0.3979746835443038,
"grad_norm": 0.16205966422080534,
"learning_rate": 9.967992638098517e-06,
"loss": 0.7411,
"step": 131
},
{
"epoch": 0.4010126582278481,
"grad_norm": 0.15759028020464438,
"learning_rate": 9.965963228944077e-06,
"loss": 0.762,
"step": 132
},
{
"epoch": 0.4040506329113924,
"grad_norm": 0.1609850388940067,
"learning_rate": 9.963871664676647e-06,
"loss": 0.7903,
"step": 133
},
{
"epoch": 0.4070886075949367,
"grad_norm": 0.15140219730566665,
"learning_rate": 9.961717971474714e-06,
"loss": 0.7086,
"step": 134
},
{
"epoch": 0.41012658227848103,
"grad_norm": 0.14653263161882255,
"learning_rate": 9.959502176294384e-06,
"loss": 0.7214,
"step": 135
},
{
"epoch": 0.4131645569620253,
"grad_norm": 0.14811066816184854,
"learning_rate": 9.957224306869053e-06,
"loss": 0.7414,
"step": 136
},
{
"epoch": 0.41620253164556964,
"grad_norm": 0.15595540327858165,
"learning_rate": 9.954884391709043e-06,
"loss": 0.7836,
"step": 137
},
{
"epoch": 0.4192405063291139,
"grad_norm": 0.14108541773046018,
"learning_rate": 9.95248246010126e-06,
"loss": 0.7478,
"step": 138
},
{
"epoch": 0.42227848101265825,
"grad_norm": 0.15906810703639698,
"learning_rate": 9.950018542108818e-06,
"loss": 0.7606,
"step": 139
},
{
"epoch": 0.4253164556962025,
"grad_norm": 0.1528383149198257,
"learning_rate": 9.947492668570675e-06,
"loss": 0.725,
"step": 140
},
{
"epoch": 0.42835443037974685,
"grad_norm": 0.1546681429241187,
"learning_rate": 9.944904871101227e-06,
"loss": 0.7558,
"step": 141
},
{
"epoch": 0.43139240506329113,
"grad_norm": 0.15492591908933548,
"learning_rate": 9.94225518208993e-06,
"loss": 0.7417,
"step": 142
},
{
"epoch": 0.43443037974683546,
"grad_norm": 0.14717883243941593,
"learning_rate": 9.939543634700891e-06,
"loss": 0.6828,
"step": 143
},
{
"epoch": 0.43746835443037974,
"grad_norm": 0.14977763365685373,
"learning_rate": 9.936770262872444e-06,
"loss": 0.7945,
"step": 144
},
{
"epoch": 0.44050632911392407,
"grad_norm": 0.15682104947747366,
"learning_rate": 9.933935101316735e-06,
"loss": 0.7532,
"step": 145
},
{
"epoch": 0.44354430379746834,
"grad_norm": 0.14584023263645782,
"learning_rate": 9.931038185519285e-06,
"loss": 0.7403,
"step": 146
},
{
"epoch": 0.4465822784810127,
"grad_norm": 0.14915701695822176,
"learning_rate": 9.928079551738542e-06,
"loss": 0.7232,
"step": 147
},
{
"epoch": 0.44962025316455695,
"grad_norm": 0.1521889025235044,
"learning_rate": 9.925059237005437e-06,
"loss": 0.7788,
"step": 148
},
{
"epoch": 0.4526582278481013,
"grad_norm": 0.14655442772334604,
"learning_rate": 9.9219772791229e-06,
"loss": 0.7531,
"step": 149
},
{
"epoch": 0.45569620253164556,
"grad_norm": 0.15488348310624783,
"learning_rate": 9.91883371666542e-06,
"loss": 0.7865,
"step": 150
},
{
"epoch": 0.4587341772151899,
"grad_norm": 0.15556737214012695,
"learning_rate": 9.915628588978522e-06,
"loss": 0.783,
"step": 151
},
{
"epoch": 0.46177215189873416,
"grad_norm": 0.14407916946359384,
"learning_rate": 9.912361936178312e-06,
"loss": 0.7452,
"step": 152
},
{
"epoch": 0.4648101265822785,
"grad_norm": 0.1515203148718222,
"learning_rate": 9.909033799150947e-06,
"loss": 0.74,
"step": 153
},
{
"epoch": 0.46784810126582277,
"grad_norm": 0.161850312633236,
"learning_rate": 9.90564421955214e-06,
"loss": 0.733,
"step": 154
},
{
"epoch": 0.4708860759493671,
"grad_norm": 0.1567309547943677,
"learning_rate": 9.902193239806634e-06,
"loss": 0.7717,
"step": 155
},
{
"epoch": 0.4739240506329114,
"grad_norm": 0.14856948096609965,
"learning_rate": 9.898680903107668e-06,
"loss": 0.7391,
"step": 156
},
{
"epoch": 0.4769620253164557,
"grad_norm": 0.15989687538267836,
"learning_rate": 9.895107253416434e-06,
"loss": 0.827,
"step": 157
},
{
"epoch": 0.48,
"grad_norm": 0.1497020361061319,
"learning_rate": 9.891472335461537e-06,
"loss": 0.7261,
"step": 158
},
{
"epoch": 0.4830379746835443,
"grad_norm": 0.14671522059197523,
"learning_rate": 9.887776194738433e-06,
"loss": 0.7388,
"step": 159
},
{
"epoch": 0.4860759493670886,
"grad_norm": 0.15967144252922383,
"learning_rate": 9.884018877508844e-06,
"loss": 0.7581,
"step": 160
},
{
"epoch": 0.4891139240506329,
"grad_norm": 0.14510079743012144,
"learning_rate": 9.8802004308002e-06,
"loss": 0.7631,
"step": 161
},
{
"epoch": 0.4921518987341772,
"grad_norm": 0.14504835357709286,
"learning_rate": 9.876320902405041e-06,
"loss": 0.727,
"step": 162
},
{
"epoch": 0.4951898734177215,
"grad_norm": 0.14744404256400437,
"learning_rate": 9.872380340880416e-06,
"loss": 0.7425,
"step": 163
},
{
"epoch": 0.4982278481012658,
"grad_norm": 0.15465095712268317,
"learning_rate": 9.86837879554728e-06,
"loss": 0.784,
"step": 164
},
{
"epoch": 0.5012658227848101,
"grad_norm": 0.15665467457584256,
"learning_rate": 9.864316316489873e-06,
"loss": 0.7657,
"step": 165
},
{
"epoch": 0.5043037974683544,
"grad_norm": 0.15635361845956908,
"learning_rate": 9.860192954555099e-06,
"loss": 0.7587,
"step": 166
},
{
"epoch": 0.5073417721518987,
"grad_norm": 0.15345265613234227,
"learning_rate": 9.856008761351882e-06,
"loss": 0.7701,
"step": 167
},
{
"epoch": 0.5103797468354431,
"grad_norm": 0.1522326179620225,
"learning_rate": 9.851763789250526e-06,
"loss": 0.7705,
"step": 168
},
{
"epoch": 0.5134177215189873,
"grad_norm": 0.1538498652648587,
"learning_rate": 9.847458091382057e-06,
"loss": 0.7369,
"step": 169
},
{
"epoch": 0.5164556962025316,
"grad_norm": 0.14512401623774593,
"learning_rate": 9.843091721637559e-06,
"loss": 0.7332,
"step": 170
},
{
"epoch": 0.5194936708860759,
"grad_norm": 0.15457558414252634,
"learning_rate": 9.838664734667496e-06,
"loss": 0.736,
"step": 171
},
{
"epoch": 0.5225316455696203,
"grad_norm": 0.14212306520883133,
"learning_rate": 9.834177185881033e-06,
"loss": 0.7453,
"step": 172
},
{
"epoch": 0.5255696202531646,
"grad_norm": 0.14603440656705588,
"learning_rate": 9.829629131445342e-06,
"loss": 0.7801,
"step": 173
},
{
"epoch": 0.5286075949367088,
"grad_norm": 0.15306906645304788,
"learning_rate": 9.825020628284896e-06,
"loss": 0.7916,
"step": 174
},
{
"epoch": 0.5316455696202531,
"grad_norm": 0.14991315878050276,
"learning_rate": 9.820351734080754e-06,
"loss": 0.7729,
"step": 175
},
{
"epoch": 0.5346835443037975,
"grad_norm": 0.14695678972714699,
"learning_rate": 9.81562250726985e-06,
"loss": 0.7414,
"step": 176
},
{
"epoch": 0.5377215189873418,
"grad_norm": 0.16546368468082698,
"learning_rate": 9.810833007044247e-06,
"loss": 0.76,
"step": 177
},
{
"epoch": 0.540759493670886,
"grad_norm": 0.14445501453584003,
"learning_rate": 9.805983293350413e-06,
"loss": 0.7533,
"step": 178
},
{
"epoch": 0.5437974683544303,
"grad_norm": 0.15241407553611241,
"learning_rate": 9.801073426888447e-06,
"loss": 0.7512,
"step": 179
},
{
"epoch": 0.5468354430379747,
"grad_norm": 0.14637055604971663,
"learning_rate": 9.796103469111352e-06,
"loss": 0.7161,
"step": 180
},
{
"epoch": 0.549873417721519,
"grad_norm": 0.1476130284956656,
"learning_rate": 9.791073482224229e-06,
"loss": 0.7283,
"step": 181
},
{
"epoch": 0.5529113924050633,
"grad_norm": 0.14070704406258117,
"learning_rate": 9.785983529183533e-06,
"loss": 0.7019,
"step": 182
},
{
"epoch": 0.5559493670886076,
"grad_norm": 0.14820788542442023,
"learning_rate": 9.780833673696255e-06,
"loss": 0.7223,
"step": 183
},
{
"epoch": 0.5589873417721519,
"grad_norm": 0.1573533382049134,
"learning_rate": 9.775623980219149e-06,
"loss": 0.7753,
"step": 184
},
{
"epoch": 0.5620253164556962,
"grad_norm": 0.1564179366024661,
"learning_rate": 9.77035451395791e-06,
"loss": 0.7879,
"step": 185
},
{
"epoch": 0.5650632911392405,
"grad_norm": 0.1466584778846727,
"learning_rate": 9.76502534086636e-06,
"loss": 0.7154,
"step": 186
},
{
"epoch": 0.5681012658227849,
"grad_norm": 0.14713416182841468,
"learning_rate": 9.759636527645633e-06,
"loss": 0.743,
"step": 187
},
{
"epoch": 0.5711392405063291,
"grad_norm": 0.14751084971270478,
"learning_rate": 9.754188141743326e-06,
"loss": 0.722,
"step": 188
},
{
"epoch": 0.5741772151898734,
"grad_norm": 0.1462856046848872,
"learning_rate": 9.74868025135266e-06,
"loss": 0.769,
"step": 189
},
{
"epoch": 0.5772151898734177,
"grad_norm": 0.15188256888820945,
"learning_rate": 9.743112925411633e-06,
"loss": 0.7497,
"step": 190
},
{
"epoch": 0.5802531645569621,
"grad_norm": 0.1412145269579806,
"learning_rate": 9.737486233602149e-06,
"loss": 0.7256,
"step": 191
},
{
"epoch": 0.5832911392405064,
"grad_norm": 0.13894303122510548,
"learning_rate": 9.731800246349149e-06,
"loss": 0.7028,
"step": 192
},
{
"epoch": 0.5863291139240506,
"grad_norm": 0.15164906741349582,
"learning_rate": 9.726055034819726e-06,
"loss": 0.75,
"step": 193
},
{
"epoch": 0.5893670886075949,
"grad_norm": 0.16580118267141683,
"learning_rate": 9.720250670922242e-06,
"loss": 0.7142,
"step": 194
},
{
"epoch": 0.5924050632911393,
"grad_norm": 0.14663787636584952,
"learning_rate": 9.714387227305422e-06,
"loss": 0.7843,
"step": 195
},
{
"epoch": 0.5954430379746836,
"grad_norm": 0.14951469931311676,
"learning_rate": 9.708464777357444e-06,
"loss": 0.7794,
"step": 196
},
{
"epoch": 0.5984810126582278,
"grad_norm": 0.14830789462210187,
"learning_rate": 9.702483395205023e-06,
"loss": 0.6941,
"step": 197
},
{
"epoch": 0.6015189873417721,
"grad_norm": 0.15913553670944275,
"learning_rate": 9.696443155712488e-06,
"loss": 0.7937,
"step": 198
},
{
"epoch": 0.6045569620253165,
"grad_norm": 0.15406874010366034,
"learning_rate": 9.69034413448083e-06,
"loss": 0.7306,
"step": 199
},
{
"epoch": 0.6075949367088608,
"grad_norm": 0.15479127906595072,
"learning_rate": 9.684186407846774e-06,
"loss": 0.7444,
"step": 200
},
{
"epoch": 0.610632911392405,
"grad_norm": 0.1473992261753897,
"learning_rate": 9.677970052881811e-06,
"loss": 0.7291,
"step": 201
},
{
"epoch": 0.6136708860759493,
"grad_norm": 0.1627317899066151,
"learning_rate": 9.67169514739124e-06,
"loss": 0.7472,
"step": 202
},
{
"epoch": 0.6167088607594937,
"grad_norm": 0.14415245040716593,
"learning_rate": 9.665361769913187e-06,
"loss": 0.6764,
"step": 203
},
{
"epoch": 0.619746835443038,
"grad_norm": 0.1532282639172276,
"learning_rate": 9.658969999717631e-06,
"loss": 0.7589,
"step": 204
},
{
"epoch": 0.6227848101265823,
"grad_norm": 0.14807691817541985,
"learning_rate": 9.652519916805406e-06,
"loss": 0.7312,
"step": 205
},
{
"epoch": 0.6258227848101265,
"grad_norm": 0.14134727920980444,
"learning_rate": 9.6460116019072e-06,
"loss": 0.716,
"step": 206
},
{
"epoch": 0.6288607594936709,
"grad_norm": 0.1520848085758245,
"learning_rate": 9.639445136482549e-06,
"loss": 0.726,
"step": 207
},
{
"epoch": 0.6318987341772152,
"grad_norm": 0.15130523844782648,
"learning_rate": 9.632820602718806e-06,
"loss": 0.7231,
"step": 208
},
{
"epoch": 0.6349367088607595,
"grad_norm": 0.15175606335359032,
"learning_rate": 9.62613808353013e-06,
"loss": 0.7642,
"step": 209
},
{
"epoch": 0.6379746835443038,
"grad_norm": 0.15295189011746751,
"learning_rate": 9.619397662556434e-06,
"loss": 0.7533,
"step": 210
},
{
"epoch": 0.6410126582278481,
"grad_norm": 0.14763127465770362,
"learning_rate": 9.612599424162344e-06,
"loss": 0.7357,
"step": 211
},
{
"epoch": 0.6440506329113924,
"grad_norm": 0.15211730113967573,
"learning_rate": 9.60574345343614e-06,
"loss": 0.7379,
"step": 212
},
{
"epoch": 0.6470886075949367,
"grad_norm": 0.15562838145344063,
"learning_rate": 9.598829836188694e-06,
"loss": 0.7353,
"step": 213
},
{
"epoch": 0.650126582278481,
"grad_norm": 0.14905205328328655,
"learning_rate": 9.591858658952396e-06,
"loss": 0.7363,
"step": 214
},
{
"epoch": 0.6531645569620254,
"grad_norm": 0.13994383951137745,
"learning_rate": 9.584830008980068e-06,
"loss": 0.7145,
"step": 215
},
{
"epoch": 0.6562025316455696,
"grad_norm": 0.15269517601933946,
"learning_rate": 9.577743974243875e-06,
"loss": 0.7563,
"step": 216
},
{
"epoch": 0.6592405063291139,
"grad_norm": 0.1449046463380699,
"learning_rate": 9.570600643434217e-06,
"loss": 0.7046,
"step": 217
},
{
"epoch": 0.6622784810126582,
"grad_norm": 0.14626206784218754,
"learning_rate": 9.563400105958638e-06,
"loss": 0.7466,
"step": 218
},
{
"epoch": 0.6653164556962026,
"grad_norm": 0.14925265069599608,
"learning_rate": 9.55614245194068e-06,
"loss": 0.7257,
"step": 219
},
{
"epoch": 0.6683544303797468,
"grad_norm": 0.14900239365835394,
"learning_rate": 9.548827772218772e-06,
"loss": 0.7217,
"step": 220
},
{
"epoch": 0.6713924050632911,
"grad_norm": 0.14241944963133804,
"learning_rate": 9.541456158345094e-06,
"loss": 0.7111,
"step": 221
},
{
"epoch": 0.6744303797468354,
"grad_norm": 0.16303539816736368,
"learning_rate": 9.534027702584425e-06,
"loss": 0.7823,
"step": 222
},
{
"epoch": 0.6774683544303798,
"grad_norm": 0.14757579145970603,
"learning_rate": 9.526542497912984e-06,
"loss": 0.6862,
"step": 223
},
{
"epoch": 0.6805063291139241,
"grad_norm": 0.15177966078318864,
"learning_rate": 9.51900063801728e-06,
"loss": 0.7292,
"step": 224
},
{
"epoch": 0.6835443037974683,
"grad_norm": 0.14879595092520143,
"learning_rate": 9.511402217292927e-06,
"loss": 0.7449,
"step": 225
},
{
"epoch": 0.6865822784810126,
"grad_norm": 0.16778174898039588,
"learning_rate": 9.503747330843468e-06,
"loss": 0.7606,
"step": 226
},
{
"epoch": 0.689620253164557,
"grad_norm": 0.14557534571840097,
"learning_rate": 9.496036074479184e-06,
"loss": 0.6848,
"step": 227
},
{
"epoch": 0.6926582278481013,
"grad_norm": 0.14670373935654174,
"learning_rate": 9.488268544715897e-06,
"loss": 0.7409,
"step": 228
},
{
"epoch": 0.6956962025316455,
"grad_norm": 0.15596118224861075,
"learning_rate": 9.480444838773753e-06,
"loss": 0.7307,
"step": 229
},
{
"epoch": 0.6987341772151898,
"grad_norm": 0.15028654659712917,
"learning_rate": 9.472565054576017e-06,
"loss": 0.7461,
"step": 230
},
{
"epoch": 0.7017721518987342,
"grad_norm": 0.14769647649955192,
"learning_rate": 9.464629290747844e-06,
"loss": 0.7447,
"step": 231
},
{
"epoch": 0.7048101265822785,
"grad_norm": 0.15075594210150053,
"learning_rate": 9.456637646615035e-06,
"loss": 0.7116,
"step": 232
},
{
"epoch": 0.7078481012658228,
"grad_norm": 0.1584095297303283,
"learning_rate": 9.448590222202808e-06,
"loss": 0.7762,
"step": 233
},
{
"epoch": 0.710886075949367,
"grad_norm": 0.15185429153178392,
"learning_rate": 9.440487118234536e-06,
"loss": 0.7322,
"step": 234
},
{
"epoch": 0.7139240506329114,
"grad_norm": 0.15536713995323584,
"learning_rate": 9.432328436130493e-06,
"loss": 0.7402,
"step": 235
},
{
"epoch": 0.7169620253164557,
"grad_norm": 0.1367116030241254,
"learning_rate": 9.42411427800658e-06,
"loss": 0.7325,
"step": 236
},
{
"epoch": 0.72,
"grad_norm": 0.1485314611100739,
"learning_rate": 9.415844746673047e-06,
"loss": 0.721,
"step": 237
},
{
"epoch": 0.7230379746835442,
"grad_norm": 0.1513523719206825,
"learning_rate": 9.40751994563321e-06,
"loss": 0.7038,
"step": 238
},
{
"epoch": 0.7260759493670886,
"grad_norm": 0.14790160472336378,
"learning_rate": 9.399139979082148e-06,
"loss": 0.7623,
"step": 239
},
{
"epoch": 0.7291139240506329,
"grad_norm": 0.14658046245123366,
"learning_rate": 9.390704951905412e-06,
"loss": 0.7277,
"step": 240
},
{
"epoch": 0.7321518987341772,
"grad_norm": 0.14490984139556115,
"learning_rate": 9.382214969677697e-06,
"loss": 0.7079,
"step": 241
},
{
"epoch": 0.7351898734177216,
"grad_norm": 0.15673563004792315,
"learning_rate": 9.37367013866153e-06,
"loss": 0.7749,
"step": 242
},
{
"epoch": 0.7382278481012658,
"grad_norm": 0.1532434622423629,
"learning_rate": 9.365070565805941e-06,
"loss": 0.7542,
"step": 243
},
{
"epoch": 0.7412658227848101,
"grad_norm": 0.141142025643933,
"learning_rate": 9.356416358745119e-06,
"loss": 0.7461,
"step": 244
},
{
"epoch": 0.7443037974683544,
"grad_norm": 0.15458993871915733,
"learning_rate": 9.347707625797062e-06,
"loss": 0.7564,
"step": 245
},
{
"epoch": 0.7473417721518988,
"grad_norm": 0.1525332937728372,
"learning_rate": 9.338944475962236e-06,
"loss": 0.7221,
"step": 246
},
{
"epoch": 0.7503797468354431,
"grad_norm": 0.15582017292256814,
"learning_rate": 9.330127018922195e-06,
"loss": 0.7577,
"step": 247
},
{
"epoch": 0.7534177215189873,
"grad_norm": 0.14627498584216264,
"learning_rate": 9.32125536503821e-06,
"loss": 0.7229,
"step": 248
},
{
"epoch": 0.7564556962025316,
"grad_norm": 0.13749633267698202,
"learning_rate": 9.312329625349903e-06,
"loss": 0.7112,
"step": 249
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.14519207545625734,
"learning_rate": 9.303349911573838e-06,
"loss": 0.7322,
"step": 250
},
{
"epoch": 0.7625316455696203,
"grad_norm": 0.15127490797847287,
"learning_rate": 9.294316336102132e-06,
"loss": 0.7206,
"step": 251
},
{
"epoch": 0.7655696202531646,
"grad_norm": 0.15385069737322865,
"learning_rate": 9.285229012001047e-06,
"loss": 0.7861,
"step": 252
},
{
"epoch": 0.7686075949367088,
"grad_norm": 0.1392811216176761,
"learning_rate": 9.276088053009578e-06,
"loss": 0.715,
"step": 253
},
{
"epoch": 0.7716455696202532,
"grad_norm": 0.14390845029461158,
"learning_rate": 9.266893573538023e-06,
"loss": 0.7131,
"step": 254
},
{
"epoch": 0.7746835443037975,
"grad_norm": 0.14591475005039056,
"learning_rate": 9.257645688666557e-06,
"loss": 0.7029,
"step": 255
},
{
"epoch": 0.7777215189873418,
"grad_norm": 0.15034474500103215,
"learning_rate": 9.248344514143786e-06,
"loss": 0.7794,
"step": 256
},
{
"epoch": 0.780759493670886,
"grad_norm": 0.1329243293321845,
"learning_rate": 9.238990166385304e-06,
"loss": 0.6886,
"step": 257
},
{
"epoch": 0.7837974683544304,
"grad_norm": 0.1449519021528343,
"learning_rate": 9.229582762472232e-06,
"loss": 0.6954,
"step": 258
},
{
"epoch": 0.7868354430379747,
"grad_norm": 0.14011762932739064,
"learning_rate": 9.220122420149753e-06,
"loss": 0.7534,
"step": 259
},
{
"epoch": 0.789873417721519,
"grad_norm": 0.1430832021135705,
"learning_rate": 9.21060925782564e-06,
"loss": 0.6782,
"step": 260
},
{
"epoch": 0.7929113924050633,
"grad_norm": 0.146329792863665,
"learning_rate": 9.201043394568773e-06,
"loss": 0.7083,
"step": 261
},
{
"epoch": 0.7959493670886076,
"grad_norm": 0.13966005655876726,
"learning_rate": 9.191424950107648e-06,
"loss": 0.753,
"step": 262
},
{
"epoch": 0.7989873417721519,
"grad_norm": 0.1452187402666851,
"learning_rate": 9.181754044828882e-06,
"loss": 0.7087,
"step": 263
},
{
"epoch": 0.8020253164556962,
"grad_norm": 0.14770290588916754,
"learning_rate": 9.172030799775698e-06,
"loss": 0.7521,
"step": 264
},
{
"epoch": 0.8050632911392405,
"grad_norm": 0.14859373509481294,
"learning_rate": 9.162255336646422e-06,
"loss": 0.6864,
"step": 265
},
{
"epoch": 0.8081012658227849,
"grad_norm": 0.13521731389755007,
"learning_rate": 9.152427777792947e-06,
"loss": 0.7083,
"step": 266
},
{
"epoch": 0.8111392405063291,
"grad_norm": 0.15899678623809688,
"learning_rate": 9.142548246219212e-06,
"loss": 0.7191,
"step": 267
},
{
"epoch": 0.8141772151898734,
"grad_norm": 0.1375227713494758,
"learning_rate": 9.132616865579655e-06,
"loss": 0.7276,
"step": 268
},
{
"epoch": 0.8172151898734177,
"grad_norm": 0.15380324048740807,
"learning_rate": 9.122633760177674e-06,
"loss": 0.7517,
"step": 269
},
{
"epoch": 0.8202531645569621,
"grad_norm": 0.1425907333848317,
"learning_rate": 9.112599054964058e-06,
"loss": 0.756,
"step": 270
},
{
"epoch": 0.8232911392405063,
"grad_norm": 0.14836696714745884,
"learning_rate": 9.102512875535439e-06,
"loss": 0.6865,
"step": 271
},
{
"epoch": 0.8263291139240506,
"grad_norm": 0.13262857577952253,
"learning_rate": 9.092375348132704e-06,
"loss": 0.6957,
"step": 272
},
{
"epoch": 0.8293670886075949,
"grad_norm": 0.13994743933365547,
"learning_rate": 9.082186599639429e-06,
"loss": 0.6617,
"step": 273
},
{
"epoch": 0.8324050632911393,
"grad_norm": 0.15723763933695586,
"learning_rate": 9.071946757580282e-06,
"loss": 0.7624,
"step": 274
},
{
"epoch": 0.8354430379746836,
"grad_norm": 0.1486656829525549,
"learning_rate": 9.06165595011943e-06,
"loss": 0.7211,
"step": 275
},
{
"epoch": 0.8384810126582278,
"grad_norm": 0.15779632813651678,
"learning_rate": 9.051314306058934e-06,
"loss": 0.7503,
"step": 276
},
{
"epoch": 0.8415189873417721,
"grad_norm": 0.1585128088782449,
"learning_rate": 9.040921954837139e-06,
"loss": 0.759,
"step": 277
},
{
"epoch": 0.8445569620253165,
"grad_norm": 0.14943760375413637,
"learning_rate": 9.030479026527048e-06,
"loss": 0.7452,
"step": 278
},
{
"epoch": 0.8475949367088608,
"grad_norm": 0.1537621293162188,
"learning_rate": 9.019985651834703e-06,
"loss": 0.7302,
"step": 279
},
{
"epoch": 0.850632911392405,
"grad_norm": 0.14310455157465138,
"learning_rate": 9.009441962097543e-06,
"loss": 0.727,
"step": 280
},
{
"epoch": 0.8536708860759493,
"grad_norm": 0.15263194477343178,
"learning_rate": 8.99884808928276e-06,
"loss": 0.7591,
"step": 281
},
{
"epoch": 0.8567088607594937,
"grad_norm": 0.14586809361609301,
"learning_rate": 8.98820416598565e-06,
"loss": 0.7439,
"step": 282
},
{
"epoch": 0.859746835443038,
"grad_norm": 0.1504705926875278,
"learning_rate": 8.97751032542795e-06,
"loss": 0.7565,
"step": 283
},
{
"epoch": 0.8627848101265823,
"grad_norm": 0.1441103904918681,
"learning_rate": 8.966766701456177e-06,
"loss": 0.7034,
"step": 284
},
{
"epoch": 0.8658227848101265,
"grad_norm": 0.14391086855133448,
"learning_rate": 8.955973428539943e-06,
"loss": 0.724,
"step": 285
},
{
"epoch": 0.8688607594936709,
"grad_norm": 0.15047599099834674,
"learning_rate": 8.945130641770281e-06,
"loss": 0.7179,
"step": 286
},
{
"epoch": 0.8718987341772152,
"grad_norm": 0.15074435170430608,
"learning_rate": 8.93423847685795e-06,
"loss": 0.731,
"step": 287
},
{
"epoch": 0.8749367088607595,
"grad_norm": 0.16108274909500533,
"learning_rate": 8.923297070131738e-06,
"loss": 0.7071,
"step": 288
},
{
"epoch": 0.8779746835443037,
"grad_norm": 0.15524331185932833,
"learning_rate": 8.91230655853675e-06,
"loss": 0.7153,
"step": 289
},
{
"epoch": 0.8810126582278481,
"grad_norm": 0.15110074800492984,
"learning_rate": 8.901267079632703e-06,
"loss": 0.74,
"step": 290
},
{
"epoch": 0.8840506329113924,
"grad_norm": 0.14585560022656124,
"learning_rate": 8.890178771592198e-06,
"loss": 0.7491,
"step": 291
},
{
"epoch": 0.8870886075949367,
"grad_norm": 0.14265142947205262,
"learning_rate": 8.879041773198996e-06,
"loss": 0.7055,
"step": 292
},
{
"epoch": 0.890126582278481,
"grad_norm": 0.142780593111172,
"learning_rate": 8.86785622384627e-06,
"loss": 0.687,
"step": 293
},
{
"epoch": 0.8931645569620253,
"grad_norm": 0.1454614837486431,
"learning_rate": 8.856622263534875e-06,
"loss": 0.745,
"step": 294
},
{
"epoch": 0.8962025316455696,
"grad_norm": 0.14081739179379577,
"learning_rate": 8.845340032871584e-06,
"loss": 0.7713,
"step": 295
},
{
"epoch": 0.8992405063291139,
"grad_norm": 0.14385443388022717,
"learning_rate": 8.834009673067337e-06,
"loss": 0.7472,
"step": 296
},
{
"epoch": 0.9022784810126582,
"grad_norm": 0.1473963417078419,
"learning_rate": 8.822631325935463e-06,
"loss": 0.7249,
"step": 297
},
{
"epoch": 0.9053164556962026,
"grad_norm": 0.1368896762497509,
"learning_rate": 8.811205133889917e-06,
"loss": 0.7491,
"step": 298
},
{
"epoch": 0.9083544303797468,
"grad_norm": 0.14095409495460387,
"learning_rate": 8.799731239943488e-06,
"loss": 0.7172,
"step": 299
},
{
"epoch": 0.9113924050632911,
"grad_norm": 0.15441223002212245,
"learning_rate": 8.788209787706014e-06,
"loss": 0.7573,
"step": 300
},
{
"epoch": 0.9144303797468355,
"grad_norm": 0.14670952915562407,
"learning_rate": 8.776640921382585e-06,
"loss": 0.6943,
"step": 301
},
{
"epoch": 0.9174683544303798,
"grad_norm": 0.14641743535819832,
"learning_rate": 8.765024785771732e-06,
"loss": 0.6878,
"step": 302
},
{
"epoch": 0.920506329113924,
"grad_norm": 0.14616935890059882,
"learning_rate": 8.753361526263622e-06,
"loss": 0.7261,
"step": 303
},
{
"epoch": 0.9235443037974683,
"grad_norm": 0.15240576293183697,
"learning_rate": 8.741651288838237e-06,
"loss": 0.7324,
"step": 304
},
{
"epoch": 0.9265822784810127,
"grad_norm": 0.15933931646246116,
"learning_rate": 8.729894220063542e-06,
"loss": 0.6935,
"step": 305
},
{
"epoch": 0.929620253164557,
"grad_norm": 0.14649711165417814,
"learning_rate": 8.718090467093654e-06,
"loss": 0.7483,
"step": 306
},
{
"epoch": 0.9326582278481013,
"grad_norm": 0.14580689387632406,
"learning_rate": 8.706240177667003e-06,
"loss": 0.7327,
"step": 307
},
{
"epoch": 0.9356962025316455,
"grad_norm": 0.14256585147606213,
"learning_rate": 8.694343500104474e-06,
"loss": 0.7143,
"step": 308
},
{
"epoch": 0.9387341772151899,
"grad_norm": 0.15788812574603747,
"learning_rate": 8.682400583307562e-06,
"loss": 0.7508,
"step": 309
},
{
"epoch": 0.9417721518987342,
"grad_norm": 0.13178792986163443,
"learning_rate": 8.670411576756502e-06,
"loss": 0.6685,
"step": 310
},
{
"epoch": 0.9448101265822785,
"grad_norm": 0.14325453111842404,
"learning_rate": 8.658376630508391e-06,
"loss": 0.7384,
"step": 311
},
{
"epoch": 0.9478481012658228,
"grad_norm": 0.14921601570309354,
"learning_rate": 8.646295895195334e-06,
"loss": 0.6916,
"step": 312
},
{
"epoch": 0.9508860759493671,
"grad_norm": 0.14867911627025984,
"learning_rate": 8.634169522022522e-06,
"loss": 0.7618,
"step": 313
},
{
"epoch": 0.9539240506329114,
"grad_norm": 0.13408458116847805,
"learning_rate": 8.621997662766378e-06,
"loss": 0.685,
"step": 314
},
{
"epoch": 0.9569620253164557,
"grad_norm": 0.14778446535199555,
"learning_rate": 8.609780469772623e-06,
"loss": 0.7715,
"step": 315
},
{
"epoch": 0.96,
"grad_norm": 0.1513493124946686,
"learning_rate": 8.597518095954399e-06,
"loss": 0.7225,
"step": 316
},
{
"epoch": 0.9630379746835444,
"grad_norm": 0.1498242800163109,
"learning_rate": 8.585210694790333e-06,
"loss": 0.6919,
"step": 317
},
{
"epoch": 0.9660759493670886,
"grad_norm": 0.14772376485550806,
"learning_rate": 8.572858420322626e-06,
"loss": 0.749,
"step": 318
},
{
"epoch": 0.9691139240506329,
"grad_norm": 0.15452004706677017,
"learning_rate": 8.56046142715513e-06,
"loss": 0.7202,
"step": 319
},
{
"epoch": 0.9721518987341772,
"grad_norm": 0.15340112248677157,
"learning_rate": 8.548019870451391e-06,
"loss": 0.7242,
"step": 320
},
{
"epoch": 0.9751898734177216,
"grad_norm": 0.13675490142166902,
"learning_rate": 8.535533905932739e-06,
"loss": 0.699,
"step": 321
},
{
"epoch": 0.9782278481012658,
"grad_norm": 0.14063114842689411,
"learning_rate": 8.523003689876312e-06,
"loss": 0.7342,
"step": 322
},
{
"epoch": 0.9812658227848101,
"grad_norm": 0.14103508935438214,
"learning_rate": 8.510429379113114e-06,
"loss": 0.6986,
"step": 323
},
{
"epoch": 0.9843037974683544,
"grad_norm": 0.14781753448940838,
"learning_rate": 8.497811131026046e-06,
"loss": 0.7206,
"step": 324
},
{
"epoch": 0.9873417721518988,
"grad_norm": 0.13798681094672127,
"learning_rate": 8.485149103547943e-06,
"loss": 0.6682,
"step": 325
},
{
"epoch": 0.990379746835443,
"grad_norm": 0.13956851609714183,
"learning_rate": 8.472443455159586e-06,
"loss": 0.7281,
"step": 326
},
{
"epoch": 0.9934177215189873,
"grad_norm": 0.14795209764564385,
"learning_rate": 8.459694344887732e-06,
"loss": 0.7449,
"step": 327
},
{
"epoch": 0.9964556962025316,
"grad_norm": 0.1340859424248503,
"learning_rate": 8.446901932303112e-06,
"loss": 0.6661,
"step": 328
},
{
"epoch": 0.999493670886076,
"grad_norm": 0.13943100974007655,
"learning_rate": 8.434066377518437e-06,
"loss": 0.6925,
"step": 329
},
{
"epoch": 1.0,
"grad_norm": 0.13943100974007655,
"learning_rate": 8.421187841186402e-06,
"loss": 0.7072,
"step": 330
},
{
"epoch": 1.0030379746835443,
"grad_norm": 0.35195652682604184,
"learning_rate": 8.408266484497664e-06,
"loss": 0.5878,
"step": 331
},
{
"epoch": 1.0060759493670886,
"grad_norm": 0.13790447422523408,
"learning_rate": 8.395302469178832e-06,
"loss": 0.5245,
"step": 332
},
{
"epoch": 1.0091139240506328,
"grad_norm": 0.1335900726197855,
"learning_rate": 8.382295957490435e-06,
"loss": 0.5884,
"step": 333
},
{
"epoch": 1.0121518987341773,
"grad_norm": 0.14393620775276017,
"learning_rate": 8.369247112224901e-06,
"loss": 0.5912,
"step": 334
},
{
"epoch": 1.0151898734177216,
"grad_norm": 0.1707656785677369,
"learning_rate": 8.356156096704516e-06,
"loss": 0.6168,
"step": 335
},
{
"epoch": 1.0182278481012659,
"grad_norm": 0.1408052023777488,
"learning_rate": 8.343023074779368e-06,
"loss": 0.5443,
"step": 336
},
{
"epoch": 1.0212658227848102,
"grad_norm": 0.15435053976419325,
"learning_rate": 8.329848210825322e-06,
"loss": 0.5769,
"step": 337
},
{
"epoch": 1.0243037974683544,
"grad_norm": 0.1474285624791992,
"learning_rate": 8.316631669741934e-06,
"loss": 0.5507,
"step": 338
},
{
"epoch": 1.0273417721518987,
"grad_norm": 0.13958495408136215,
"learning_rate": 8.303373616950408e-06,
"loss": 0.5182,
"step": 339
},
{
"epoch": 1.030379746835443,
"grad_norm": 0.13214638415892638,
"learning_rate": 8.290074218391515e-06,
"loss": 0.5391,
"step": 340
},
{
"epoch": 1.0334177215189873,
"grad_norm": 0.14532819580539497,
"learning_rate": 8.27673364052352e-06,
"loss": 0.5657,
"step": 341
},
{
"epoch": 1.0364556962025318,
"grad_norm": 0.15512970626334552,
"learning_rate": 8.263352050320094e-06,
"loss": 0.5656,
"step": 342
},
{
"epoch": 1.039493670886076,
"grad_norm": 0.14663572118341192,
"learning_rate": 8.249929615268234e-06,
"loss": 0.5628,
"step": 343
},
{
"epoch": 1.0425316455696203,
"grad_norm": 0.14225503452449526,
"learning_rate": 8.236466503366155e-06,
"loss": 0.5692,
"step": 344
},
{
"epoch": 1.0455696202531646,
"grad_norm": 0.14716192192280636,
"learning_rate": 8.222962883121196e-06,
"loss": 0.5971,
"step": 345
},
{
"epoch": 1.0486075949367089,
"grad_norm": 0.14798497203430436,
"learning_rate": 8.209418923547706e-06,
"loss": 0.5534,
"step": 346
},
{
"epoch": 1.0516455696202531,
"grad_norm": 0.1428993625557661,
"learning_rate": 8.195834794164925e-06,
"loss": 0.5548,
"step": 347
},
{
"epoch": 1.0546835443037974,
"grad_norm": 0.1489771128374146,
"learning_rate": 8.182210664994879e-06,
"loss": 0.5465,
"step": 348
},
{
"epoch": 1.0577215189873417,
"grad_norm": 0.14404057693239417,
"learning_rate": 8.168546706560231e-06,
"loss": 0.566,
"step": 349
},
{
"epoch": 1.0607594936708862,
"grad_norm": 0.14320281952903854,
"learning_rate": 8.154843089882159e-06,
"loss": 0.5618,
"step": 350
},
{
"epoch": 1.0637974683544305,
"grad_norm": 0.14217364132694757,
"learning_rate": 8.141099986478212e-06,
"loss": 0.5758,
"step": 351
},
{
"epoch": 1.0668354430379747,
"grad_norm": 0.14157409729112905,
"learning_rate": 8.127317568360164e-06,
"loss": 0.5528,
"step": 352
},
{
"epoch": 1.069873417721519,
"grad_norm": 0.14493308862688992,
"learning_rate": 8.113496008031863e-06,
"loss": 0.5814,
"step": 353
},
{
"epoch": 1.0729113924050633,
"grad_norm": 0.14532728499404404,
"learning_rate": 8.099635478487064e-06,
"loss": 0.5173,
"step": 354
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.1492872905556815,
"learning_rate": 8.085736153207277e-06,
"loss": 0.562,
"step": 355
},
{
"epoch": 1.0789873417721518,
"grad_norm": 0.13525206039788887,
"learning_rate": 8.07179820615958e-06,
"loss": 0.5522,
"step": 356
},
{
"epoch": 1.082025316455696,
"grad_norm": 0.15253157854907176,
"learning_rate": 8.057821811794457e-06,
"loss": 0.6107,
"step": 357
},
{
"epoch": 1.0850632911392406,
"grad_norm": 0.14063398627766024,
"learning_rate": 8.043807145043604e-06,
"loss": 0.5735,
"step": 358
},
{
"epoch": 1.0881012658227849,
"grad_norm": 0.14389818718167977,
"learning_rate": 8.029754381317741e-06,
"loss": 0.5334,
"step": 359
},
{
"epoch": 1.0911392405063292,
"grad_norm": 0.14167958700306557,
"learning_rate": 8.015663696504424e-06,
"loss": 0.5773,
"step": 360
},
{
"epoch": 1.0941772151898734,
"grad_norm": 0.1401169247223358,
"learning_rate": 8.001535266965829e-06,
"loss": 0.5568,
"step": 361
},
{
"epoch": 1.0972151898734177,
"grad_norm": 0.14447092652766144,
"learning_rate": 7.987369269536563e-06,
"loss": 0.5424,
"step": 362
},
{
"epoch": 1.100253164556962,
"grad_norm": 0.13562624939969367,
"learning_rate": 7.973165881521435e-06,
"loss": 0.549,
"step": 363
},
{
"epoch": 1.1032911392405063,
"grad_norm": 0.1449306113760785,
"learning_rate": 7.958925280693243e-06,
"loss": 0.5304,
"step": 364
},
{
"epoch": 1.1063291139240505,
"grad_norm": 0.14540166205441302,
"learning_rate": 7.944647645290555e-06,
"loss": 0.571,
"step": 365
},
{
"epoch": 1.109367088607595,
"grad_norm": 0.15144055477718635,
"learning_rate": 7.930333154015467e-06,
"loss": 0.5493,
"step": 366
},
{
"epoch": 1.1124050632911393,
"grad_norm": 0.14154469089239974,
"learning_rate": 7.915981986031367e-06,
"loss": 0.606,
"step": 367
},
{
"epoch": 1.1154430379746836,
"grad_norm": 0.13991802435102235,
"learning_rate": 7.901594320960709e-06,
"loss": 0.5572,
"step": 368
},
{
"epoch": 1.1184810126582279,
"grad_norm": 0.13313075315830625,
"learning_rate": 7.887170338882742e-06,
"loss": 0.5733,
"step": 369
},
{
"epoch": 1.1215189873417721,
"grad_norm": 0.14195483359233327,
"learning_rate": 7.872710220331271e-06,
"loss": 0.585,
"step": 370
},
{
"epoch": 1.1245569620253164,
"grad_norm": 0.13931767214524954,
"learning_rate": 7.858214146292394e-06,
"loss": 0.5445,
"step": 371
},
{
"epoch": 1.1275949367088607,
"grad_norm": 0.14365199015958316,
"learning_rate": 7.843682298202235e-06,
"loss": 0.5742,
"step": 372
},
{
"epoch": 1.130632911392405,
"grad_norm": 0.13892352185640874,
"learning_rate": 7.829114857944672e-06,
"loss": 0.5644,
"step": 373
},
{
"epoch": 1.1336708860759495,
"grad_norm": 0.14101201867485888,
"learning_rate": 7.814512007849069e-06,
"loss": 0.5616,
"step": 374
},
{
"epoch": 1.1367088607594937,
"grad_norm": 0.1356625238961039,
"learning_rate": 7.799873930687979e-06,
"loss": 0.5645,
"step": 375
},
{
"epoch": 1.139746835443038,
"grad_norm": 0.13564046321608458,
"learning_rate": 7.785200809674869e-06,
"loss": 0.5798,
"step": 376
},
{
"epoch": 1.1427848101265823,
"grad_norm": 0.14799014090819052,
"learning_rate": 7.770492828461824e-06,
"loss": 0.5695,
"step": 377
},
{
"epoch": 1.1458227848101266,
"grad_norm": 0.1347310269977235,
"learning_rate": 7.755750171137245e-06,
"loss": 0.5783,
"step": 378
},
{
"epoch": 1.1488607594936708,
"grad_norm": 0.1405798639605091,
"learning_rate": 7.74097302222355e-06,
"loss": 0.5589,
"step": 379
},
{
"epoch": 1.1518987341772151,
"grad_norm": 0.14596056892317547,
"learning_rate": 7.726161566674856e-06,
"loss": 0.5548,
"step": 380
},
{
"epoch": 1.1549367088607596,
"grad_norm": 0.14401713097153115,
"learning_rate": 7.711315989874677e-06,
"loss": 0.5759,
"step": 381
},
{
"epoch": 1.1579746835443039,
"grad_norm": 0.14374703144254186,
"learning_rate": 7.696436477633588e-06,
"loss": 0.5391,
"step": 382
},
{
"epoch": 1.1610126582278482,
"grad_norm": 0.14766321403035243,
"learning_rate": 7.681523216186912e-06,
"loss": 0.5883,
"step": 383
},
{
"epoch": 1.1640506329113924,
"grad_norm": 0.1375940357437819,
"learning_rate": 7.666576392192389e-06,
"loss": 0.5509,
"step": 384
},
{
"epoch": 1.1670886075949367,
"grad_norm": 0.1381506679407275,
"learning_rate": 7.651596192727826e-06,
"loss": 0.529,
"step": 385
},
{
"epoch": 1.170126582278481,
"grad_norm": 0.13807613640205604,
"learning_rate": 7.636582805288771e-06,
"loss": 0.5022,
"step": 386
},
{
"epoch": 1.1731645569620253,
"grad_norm": 0.14272839656072073,
"learning_rate": 7.621536417786159e-06,
"loss": 0.5648,
"step": 387
},
{
"epoch": 1.1762025316455695,
"grad_norm": 0.14465724664013882,
"learning_rate": 7.606457218543961e-06,
"loss": 0.5873,
"step": 388
},
{
"epoch": 1.1792405063291138,
"grad_norm": 0.14449484147463795,
"learning_rate": 7.5913453962968296e-06,
"loss": 0.5474,
"step": 389
},
{
"epoch": 1.1822784810126583,
"grad_norm": 0.14376467882067412,
"learning_rate": 7.576201140187727e-06,
"loss": 0.5545,
"step": 390
},
{
"epoch": 1.1853164556962026,
"grad_norm": 0.14202851308765727,
"learning_rate": 7.5610246397655715e-06,
"loss": 0.5828,
"step": 391
},
{
"epoch": 1.1883544303797469,
"grad_norm": 0.1462502151481694,
"learning_rate": 7.54581608498286e-06,
"loss": 0.5873,
"step": 392
},
{
"epoch": 1.1913924050632911,
"grad_norm": 0.13990538383436923,
"learning_rate": 7.530575666193283e-06,
"loss": 0.5592,
"step": 393
},
{
"epoch": 1.1944303797468354,
"grad_norm": 0.15276225499982546,
"learning_rate": 7.515303574149348e-06,
"loss": 0.5807,
"step": 394
},
{
"epoch": 1.1974683544303797,
"grad_norm": 0.1379333410631157,
"learning_rate": 7.500000000000001e-06,
"loss": 0.5337,
"step": 395
},
{
"epoch": 1.200506329113924,
"grad_norm": 0.13943056308693977,
"learning_rate": 7.484665135288214e-06,
"loss": 0.5438,
"step": 396
},
{
"epoch": 1.2035443037974685,
"grad_norm": 0.13856734895253275,
"learning_rate": 7.469299171948608e-06,
"loss": 0.5476,
"step": 397
},
{
"epoch": 1.2065822784810127,
"grad_norm": 0.13893892952476672,
"learning_rate": 7.453902302305032e-06,
"loss": 0.5564,
"step": 398
},
{
"epoch": 1.209620253164557,
"grad_norm": 0.14952936713448245,
"learning_rate": 7.438474719068174e-06,
"loss": 0.5283,
"step": 399
},
{
"epoch": 1.2126582278481013,
"grad_norm": 0.13855561393837743,
"learning_rate": 7.423016615333135e-06,
"loss": 0.5426,
"step": 400
},
{
"epoch": 1.2156962025316456,
"grad_norm": 0.14582609657438356,
"learning_rate": 7.4075281845770196e-06,
"loss": 0.5555,
"step": 401
},
{
"epoch": 1.2187341772151898,
"grad_norm": 0.14624318192875743,
"learning_rate": 7.392009620656513e-06,
"loss": 0.5727,
"step": 402
},
{
"epoch": 1.2217721518987341,
"grad_norm": 0.14098369650646345,
"learning_rate": 7.37646111780545e-06,
"loss": 0.5825,
"step": 403
},
{
"epoch": 1.2248101265822784,
"grad_norm": 0.14295467837134232,
"learning_rate": 7.360882870632393e-06,
"loss": 0.5587,
"step": 404
},
{
"epoch": 1.2278481012658227,
"grad_norm": 0.13475859762040648,
"learning_rate": 7.3452750741181855e-06,
"loss": 0.5642,
"step": 405
},
{
"epoch": 1.2308860759493672,
"grad_norm": 0.1397869789917611,
"learning_rate": 7.329637923613522e-06,
"loss": 0.5167,
"step": 406
},
{
"epoch": 1.2339240506329114,
"grad_norm": 0.14008874268991642,
"learning_rate": 7.313971614836496e-06,
"loss": 0.5418,
"step": 407
},
{
"epoch": 1.2369620253164557,
"grad_norm": 0.14927580619892877,
"learning_rate": 7.298276343870152e-06,
"loss": 0.5826,
"step": 408
},
{
"epoch": 1.24,
"grad_norm": 0.14527571180153181,
"learning_rate": 7.282552307160033e-06,
"loss": 0.5518,
"step": 409
},
{
"epoch": 1.2430379746835443,
"grad_norm": 0.14489892811821217,
"learning_rate": 7.26679970151172e-06,
"loss": 0.5499,
"step": 410
},
{
"epoch": 1.2460759493670885,
"grad_norm": 0.1451134850602267,
"learning_rate": 7.251018724088367e-06,
"loss": 0.579,
"step": 411
},
{
"epoch": 1.2491139240506328,
"grad_norm": 0.13242470106668042,
"learning_rate": 7.235209572408241e-06,
"loss": 0.5563,
"step": 412
},
{
"epoch": 1.2521518987341773,
"grad_norm": 0.14805669523665416,
"learning_rate": 7.2193724443422405e-06,
"loss": 0.6147,
"step": 413
},
{
"epoch": 1.2551898734177216,
"grad_norm": 0.1302889421614733,
"learning_rate": 7.203507538111423e-06,
"loss": 0.5567,
"step": 414
},
{
"epoch": 1.2582278481012659,
"grad_norm": 0.14285991469932954,
"learning_rate": 7.187615052284522e-06,
"loss": 0.5718,
"step": 415
},
{
"epoch": 1.2612658227848101,
"grad_norm": 0.15128709925990969,
"learning_rate": 7.171695185775468e-06,
"loss": 0.5581,
"step": 416
},
{
"epoch": 1.2643037974683544,
"grad_norm": 0.1420485997443191,
"learning_rate": 7.155748137840892e-06,
"loss": 0.574,
"step": 417
},
{
"epoch": 1.2673417721518987,
"grad_norm": 0.1399838719737584,
"learning_rate": 7.139774108077633e-06,
"loss": 0.569,
"step": 418
},
{
"epoch": 1.270379746835443,
"grad_norm": 0.13965944387092968,
"learning_rate": 7.12377329642024e-06,
"loss": 0.5445,
"step": 419
},
{
"epoch": 1.2734177215189875,
"grad_norm": 0.149148740952187,
"learning_rate": 7.107745903138472e-06,
"loss": 0.5923,
"step": 420
},
{
"epoch": 1.2764556962025315,
"grad_norm": 0.1421914319854205,
"learning_rate": 7.09169212883479e-06,
"loss": 0.5732,
"step": 421
},
{
"epoch": 1.279493670886076,
"grad_norm": 0.1498434614111481,
"learning_rate": 7.075612174441846e-06,
"loss": 0.5646,
"step": 422
},
{
"epoch": 1.2825316455696203,
"grad_norm": 0.13979599958581326,
"learning_rate": 7.059506241219964e-06,
"loss": 0.584,
"step": 423
},
{
"epoch": 1.2855696202531646,
"grad_norm": 0.15321777078459725,
"learning_rate": 7.04337453075463e-06,
"loss": 0.5634,
"step": 424
},
{
"epoch": 1.2886075949367088,
"grad_norm": 0.13990758699960726,
"learning_rate": 7.027217244953958e-06,
"loss": 0.5434,
"step": 425
},
{
"epoch": 1.2916455696202531,
"grad_norm": 0.14493433075593967,
"learning_rate": 7.011034586046177e-06,
"loss": 0.5495,
"step": 426
},
{
"epoch": 1.2946835443037974,
"grad_norm": 0.15209369171144482,
"learning_rate": 6.994826756577082e-06,
"loss": 0.5894,
"step": 427
},
{
"epoch": 1.2977215189873417,
"grad_norm": 0.14699225509678257,
"learning_rate": 6.978593959407516e-06,
"loss": 0.5752,
"step": 428
},
{
"epoch": 1.3007594936708862,
"grad_norm": 0.1497901452958901,
"learning_rate": 6.962336397710819e-06,
"loss": 0.5811,
"step": 429
},
{
"epoch": 1.3037974683544304,
"grad_norm": 0.15080869333540273,
"learning_rate": 6.946054274970292e-06,
"loss": 0.5838,
"step": 430
},
{
"epoch": 1.3068354430379747,
"grad_norm": 0.14750628950150269,
"learning_rate": 6.9297477949766445e-06,
"loss": 0.5655,
"step": 431
},
{
"epoch": 1.309873417721519,
"grad_norm": 0.1495028601788689,
"learning_rate": 6.913417161825449e-06,
"loss": 0.557,
"step": 432
},
{
"epoch": 1.3129113924050633,
"grad_norm": 0.135568113963619,
"learning_rate": 6.897062579914587e-06,
"loss": 0.5473,
"step": 433
},
{
"epoch": 1.3159493670886075,
"grad_norm": 0.1409983877447637,
"learning_rate": 6.88068425394168e-06,
"loss": 0.5412,
"step": 434
},
{
"epoch": 1.3189873417721518,
"grad_norm": 0.14195644818160238,
"learning_rate": 6.864282388901544e-06,
"loss": 0.5467,
"step": 435
},
{
"epoch": 1.3220253164556963,
"grad_norm": 0.14696388488045273,
"learning_rate": 6.847857190083611e-06,
"loss": 0.5622,
"step": 436
},
{
"epoch": 1.3250632911392404,
"grad_norm": 0.13234163090116136,
"learning_rate": 6.831408863069364e-06,
"loss": 0.54,
"step": 437
},
{
"epoch": 1.3281012658227849,
"grad_norm": 0.1533823957396425,
"learning_rate": 6.814937613729766e-06,
"loss": 0.5514,
"step": 438
},
{
"epoch": 1.3311392405063291,
"grad_norm": 0.15116390246065511,
"learning_rate": 6.79844364822268e-06,
"loss": 0.5368,
"step": 439
},
{
"epoch": 1.3341772151898734,
"grad_norm": 0.14590384620413216,
"learning_rate": 6.781927172990285e-06,
"loss": 0.5417,
"step": 440
},
{
"epoch": 1.3372151898734177,
"grad_norm": 0.1418688933481456,
"learning_rate": 6.765388394756504e-06,
"loss": 0.5488,
"step": 441
},
{
"epoch": 1.340253164556962,
"grad_norm": 0.13505338358252206,
"learning_rate": 6.748827520524406e-06,
"loss": 0.5328,
"step": 442
},
{
"epoch": 1.3432911392405062,
"grad_norm": 0.14825841588326408,
"learning_rate": 6.732244757573619e-06,
"loss": 0.5539,
"step": 443
},
{
"epoch": 1.3463291139240505,
"grad_norm": 0.14176422736895686,
"learning_rate": 6.715640313457733e-06,
"loss": 0.5239,
"step": 444
},
{
"epoch": 1.349367088607595,
"grad_norm": 0.15025498831454065,
"learning_rate": 6.699014396001707e-06,
"loss": 0.5576,
"step": 445
},
{
"epoch": 1.3524050632911393,
"grad_norm": 0.14849242454803074,
"learning_rate": 6.682367213299264e-06,
"loss": 0.5901,
"step": 446
},
{
"epoch": 1.3554430379746836,
"grad_norm": 0.1337453174485716,
"learning_rate": 6.665698973710289e-06,
"loss": 0.543,
"step": 447
},
{
"epoch": 1.3584810126582278,
"grad_norm": 0.16216307783943593,
"learning_rate": 6.6490098858582176e-06,
"loss": 0.6144,
"step": 448
},
{
"epoch": 1.3615189873417721,
"grad_norm": 0.15076621656691633,
"learning_rate": 6.632300158627427e-06,
"loss": 0.5305,
"step": 449
},
{
"epoch": 1.3645569620253164,
"grad_norm": 0.13592067804238195,
"learning_rate": 6.615570001160626e-06,
"loss": 0.5518,
"step": 450
},
{
"epoch": 1.3675949367088607,
"grad_norm": 0.14079801039067172,
"learning_rate": 6.598819622856227e-06,
"loss": 0.5732,
"step": 451
},
{
"epoch": 1.3706329113924052,
"grad_norm": 0.14457330915355807,
"learning_rate": 6.582049233365734e-06,
"loss": 0.5729,
"step": 452
},
{
"epoch": 1.3736708860759494,
"grad_norm": 0.15094714189957395,
"learning_rate": 6.565259042591112e-06,
"loss": 0.5934,
"step": 453
},
{
"epoch": 1.3767088607594937,
"grad_norm": 0.1466922947639145,
"learning_rate": 6.548449260682169e-06,
"loss": 0.5623,
"step": 454
},
{
"epoch": 1.379746835443038,
"grad_norm": 0.1400123201876512,
"learning_rate": 6.531620098033919e-06,
"loss": 0.5273,
"step": 455
},
{
"epoch": 1.3827848101265823,
"grad_norm": 0.14133958564834126,
"learning_rate": 6.514771765283942e-06,
"loss": 0.5396,
"step": 456
},
{
"epoch": 1.3858227848101266,
"grad_norm": 0.14258031825522197,
"learning_rate": 6.497904473309766e-06,
"loss": 0.5625,
"step": 457
},
{
"epoch": 1.3888607594936708,
"grad_norm": 0.14545898625012454,
"learning_rate": 6.481018433226212e-06,
"loss": 0.5442,
"step": 458
},
{
"epoch": 1.3918987341772153,
"grad_norm": 0.1424592407904095,
"learning_rate": 6.464113856382752e-06,
"loss": 0.5796,
"step": 459
},
{
"epoch": 1.3949367088607594,
"grad_norm": 0.14307497989204254,
"learning_rate": 6.447190954360878e-06,
"loss": 0.5323,
"step": 460
},
{
"epoch": 1.3979746835443039,
"grad_norm": 0.1512754411710344,
"learning_rate": 6.430249938971438e-06,
"loss": 0.561,
"step": 461
},
{
"epoch": 1.4010126582278482,
"grad_norm": 0.14687045523315304,
"learning_rate": 6.41329102225199e-06,
"loss": 0.569,
"step": 462
},
{
"epoch": 1.4040506329113924,
"grad_norm": 0.14183379480187908,
"learning_rate": 6.396314416464151e-06,
"loss": 0.5528,
"step": 463
},
{
"epoch": 1.4070886075949367,
"grad_norm": 0.14453430336620318,
"learning_rate": 6.37932033409094e-06,
"loss": 0.5445,
"step": 464
},
{
"epoch": 1.410126582278481,
"grad_norm": 0.14993745856428103,
"learning_rate": 6.3623089878341146e-06,
"loss": 0.554,
"step": 465
},
{
"epoch": 1.4131645569620253,
"grad_norm": 0.13473778660354907,
"learning_rate": 6.345280590611512e-06,
"loss": 0.5639,
"step": 466
},
{
"epoch": 1.4162025316455695,
"grad_norm": 0.1467946635023631,
"learning_rate": 6.328235355554382e-06,
"loss": 0.5435,
"step": 467
},
{
"epoch": 1.419240506329114,
"grad_norm": 0.14618662658256054,
"learning_rate": 6.311173496004723e-06,
"loss": 0.5966,
"step": 468
},
{
"epoch": 1.4222784810126583,
"grad_norm": 0.13742911095501387,
"learning_rate": 6.294095225512604e-06,
"loss": 0.5519,
"step": 469
},
{
"epoch": 1.4253164556962026,
"grad_norm": 0.14077085256802702,
"learning_rate": 6.2770007578335044e-06,
"loss": 0.5751,
"step": 470
},
{
"epoch": 1.4283544303797469,
"grad_norm": 0.14400912403186725,
"learning_rate": 6.259890306925627e-06,
"loss": 0.5635,
"step": 471
},
{
"epoch": 1.4313924050632911,
"grad_norm": 0.16384884804730898,
"learning_rate": 6.2427640869472235e-06,
"loss": 0.6224,
"step": 472
},
{
"epoch": 1.4344303797468354,
"grad_norm": 0.14311619239647858,
"learning_rate": 6.225622312253916e-06,
"loss": 0.5079,
"step": 473
},
{
"epoch": 1.4374683544303797,
"grad_norm": 0.14122193279872083,
"learning_rate": 6.208465197396013e-06,
"loss": 0.5707,
"step": 474
},
{
"epoch": 1.4405063291139242,
"grad_norm": 0.1364437220835936,
"learning_rate": 6.191292957115825e-06,
"loss": 0.5401,
"step": 475
},
{
"epoch": 1.4435443037974682,
"grad_norm": 0.15264933943847964,
"learning_rate": 6.174105806344975e-06,
"loss": 0.5493,
"step": 476
},
{
"epoch": 1.4465822784810127,
"grad_norm": 0.16129908194882306,
"learning_rate": 6.156903960201709e-06,
"loss": 0.6069,
"step": 477
},
{
"epoch": 1.449620253164557,
"grad_norm": 0.1618394903261924,
"learning_rate": 6.1396876339882e-06,
"loss": 0.5896,
"step": 478
},
{
"epoch": 1.4526582278481013,
"grad_norm": 0.14479158934168623,
"learning_rate": 6.122457043187863e-06,
"loss": 0.6691,
"step": 479
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.15826960180874897,
"learning_rate": 6.10521240346265e-06,
"loss": 0.5565,
"step": 480
},
{
"epoch": 1.4587341772151898,
"grad_norm": 0.14313720461361779,
"learning_rate": 6.087953930650349e-06,
"loss": 0.5577,
"step": 481
},
{
"epoch": 1.461772151898734,
"grad_norm": 0.14961067467424582,
"learning_rate": 6.070681840761889e-06,
"loss": 0.5598,
"step": 482
},
{
"epoch": 1.4648101265822784,
"grad_norm": 0.14055235040717554,
"learning_rate": 6.053396349978632e-06,
"loss": 0.5529,
"step": 483
},
{
"epoch": 1.4678481012658229,
"grad_norm": 0.13969576574961248,
"learning_rate": 6.036097674649672e-06,
"loss": 0.5227,
"step": 484
},
{
"epoch": 1.4708860759493672,
"grad_norm": 0.14004461937572002,
"learning_rate": 6.018786031289119e-06,
"loss": 0.5596,
"step": 485
},
{
"epoch": 1.4739240506329114,
"grad_norm": 0.13850250529404448,
"learning_rate": 6.001461636573397e-06,
"loss": 0.5724,
"step": 486
},
{
"epoch": 1.4769620253164557,
"grad_norm": 0.1449191119243632,
"learning_rate": 5.984124707338528e-06,
"loss": 0.5728,
"step": 487
},
{
"epoch": 1.48,
"grad_norm": 0.14942587196237087,
"learning_rate": 5.966775460577418e-06,
"loss": 0.5568,
"step": 488
},
{
"epoch": 1.4830379746835443,
"grad_norm": 0.1440922653563269,
"learning_rate": 5.949414113437142e-06,
"loss": 0.5308,
"step": 489
},
{
"epoch": 1.4860759493670885,
"grad_norm": 0.1458257665739855,
"learning_rate": 5.932040883216228e-06,
"loss": 0.5414,
"step": 490
},
{
"epoch": 1.489113924050633,
"grad_norm": 0.13936168878725216,
"learning_rate": 5.914655987361934e-06,
"loss": 0.5689,
"step": 491
},
{
"epoch": 1.492151898734177,
"grad_norm": 0.14337072371461196,
"learning_rate": 5.897259643467528e-06,
"loss": 0.5634,
"step": 492
},
{
"epoch": 1.4951898734177216,
"grad_norm": 0.1487136821718494,
"learning_rate": 5.8798520692695605e-06,
"loss": 0.5603,
"step": 493
},
{
"epoch": 1.4982278481012659,
"grad_norm": 0.15621189055198445,
"learning_rate": 5.862433482645151e-06,
"loss": 0.5822,
"step": 494
},
{
"epoch": 1.5012658227848101,
"grad_norm": 0.1452785414833775,
"learning_rate": 5.8450041016092465e-06,
"loss": 0.5745,
"step": 495
},
{
"epoch": 1.5043037974683544,
"grad_norm": 0.14691813312407356,
"learning_rate": 5.8275641443119015e-06,
"loss": 0.5651,
"step": 496
},
{
"epoch": 1.5073417721518987,
"grad_norm": 0.1422962192244908,
"learning_rate": 5.810113829035544e-06,
"loss": 0.5694,
"step": 497
},
{
"epoch": 1.5103797468354432,
"grad_norm": 0.13687442215416604,
"learning_rate": 5.792653374192245e-06,
"loss": 0.5199,
"step": 498
},
{
"epoch": 1.5134177215189872,
"grad_norm": 0.15559785355845704,
"learning_rate": 5.77518299832099e-06,
"loss": 0.5667,
"step": 499
},
{
"epoch": 1.5164556962025317,
"grad_norm": 0.1433021811228919,
"learning_rate": 5.757702920084931e-06,
"loss": 0.534,
"step": 500
},
{
"epoch": 1.5194936708860758,
"grad_norm": 0.14391604046214512,
"learning_rate": 5.740213358268658e-06,
"loss": 0.5672,
"step": 501
},
{
"epoch": 1.5225316455696203,
"grad_norm": 0.13734803599229806,
"learning_rate": 5.722714531775463e-06,
"loss": 0.5524,
"step": 502
},
{
"epoch": 1.5255696202531646,
"grad_norm": 0.13897178367227836,
"learning_rate": 5.705206659624597e-06,
"loss": 0.5246,
"step": 503
},
{
"epoch": 1.5286075949367088,
"grad_norm": 0.14789109970803385,
"learning_rate": 5.687689960948526e-06,
"loss": 0.5727,
"step": 504
},
{
"epoch": 1.5316455696202531,
"grad_norm": 0.15653372973953597,
"learning_rate": 5.670164654990189e-06,
"loss": 0.5685,
"step": 505
},
{
"epoch": 1.5346835443037974,
"grad_norm": 0.14160505984214866,
"learning_rate": 5.65263096110026e-06,
"loss": 0.5333,
"step": 506
},
{
"epoch": 1.5377215189873419,
"grad_norm": 0.14372485623337306,
"learning_rate": 5.635089098734394e-06,
"loss": 0.5556,
"step": 507
},
{
"epoch": 1.540759493670886,
"grad_norm": 0.14182995906457416,
"learning_rate": 5.617539287450492e-06,
"loss": 0.5597,
"step": 508
},
{
"epoch": 1.5437974683544304,
"grad_norm": 0.14039262485132767,
"learning_rate": 5.599981746905935e-06,
"loss": 0.5309,
"step": 509
},
{
"epoch": 1.5468354430379747,
"grad_norm": 0.14032897325734403,
"learning_rate": 5.582416696854853e-06,
"loss": 0.5665,
"step": 510
},
{
"epoch": 1.549873417721519,
"grad_norm": 0.1434217675146914,
"learning_rate": 5.564844357145365e-06,
"loss": 0.5555,
"step": 511
},
{
"epoch": 1.5529113924050633,
"grad_norm": 0.14771494262754153,
"learning_rate": 5.5472649477168264e-06,
"loss": 0.5812,
"step": 512
},
{
"epoch": 1.5559493670886075,
"grad_norm": 0.14694457454874488,
"learning_rate": 5.529678688597081e-06,
"loss": 0.5556,
"step": 513
},
{
"epoch": 1.558987341772152,
"grad_norm": 0.14005032417283586,
"learning_rate": 5.512085799899705e-06,
"loss": 0.5488,
"step": 514
},
{
"epoch": 1.562025316455696,
"grad_norm": 0.1495348119581355,
"learning_rate": 5.49448650182125e-06,
"loss": 0.5535,
"step": 515
},
{
"epoch": 1.5650632911392406,
"grad_norm": 0.1482663531985799,
"learning_rate": 5.476881014638491e-06,
"loss": 0.555,
"step": 516
},
{
"epoch": 1.5681012658227849,
"grad_norm": 0.13579619142220437,
"learning_rate": 5.459269558705667e-06,
"loss": 0.5095,
"step": 517
},
{
"epoch": 1.5711392405063291,
"grad_norm": 0.1349516715654807,
"learning_rate": 5.441652354451721e-06,
"loss": 0.551,
"step": 518
},
{
"epoch": 1.5741772151898734,
"grad_norm": 0.14250308468757655,
"learning_rate": 5.4240296223775465e-06,
"loss": 0.5771,
"step": 519
},
{
"epoch": 1.5772151898734177,
"grad_norm": 0.14898385307777995,
"learning_rate": 5.406401583053222e-06,
"loss": 0.5495,
"step": 520
},
{
"epoch": 1.5802531645569622,
"grad_norm": 0.1526056074250204,
"learning_rate": 5.388768457115254e-06,
"loss": 0.551,
"step": 521
},
{
"epoch": 1.5832911392405062,
"grad_norm": 0.13895662082873736,
"learning_rate": 5.371130465263813e-06,
"loss": 0.5661,
"step": 522
},
{
"epoch": 1.5863291139240507,
"grad_norm": 0.13655961057510518,
"learning_rate": 5.353487828259973e-06,
"loss": 0.5387,
"step": 523
},
{
"epoch": 1.5893670886075948,
"grad_norm": 0.13385305077105514,
"learning_rate": 5.33584076692295e-06,
"loss": 0.4979,
"step": 524
},
{
"epoch": 1.5924050632911393,
"grad_norm": 0.14461178515510015,
"learning_rate": 5.318189502127332e-06,
"loss": 0.5621,
"step": 525
},
{
"epoch": 1.5954430379746836,
"grad_norm": 0.13063832970849976,
"learning_rate": 5.300534254800321e-06,
"loss": 0.5512,
"step": 526
},
{
"epoch": 1.5984810126582278,
"grad_norm": 0.1339988600058471,
"learning_rate": 5.282875245918963e-06,
"loss": 0.5628,
"step": 527
},
{
"epoch": 1.6015189873417721,
"grad_norm": 0.14952996776146643,
"learning_rate": 5.265212696507387e-06,
"loss": 0.558,
"step": 528
},
{
"epoch": 1.6045569620253164,
"grad_norm": 0.14584127089296017,
"learning_rate": 5.247546827634035e-06,
"loss": 0.5673,
"step": 529
},
{
"epoch": 1.6075949367088609,
"grad_norm": 0.14976398397944954,
"learning_rate": 5.229877860408899e-06,
"loss": 0.527,
"step": 530
},
{
"epoch": 1.610632911392405,
"grad_norm": 0.13993952677265098,
"learning_rate": 5.212206015980742e-06,
"loss": 0.5669,
"step": 531
},
{
"epoch": 1.6136708860759494,
"grad_norm": 0.14125637957821963,
"learning_rate": 5.194531515534349e-06,
"loss": 0.5438,
"step": 532
},
{
"epoch": 1.6167088607594937,
"grad_norm": 0.1441000126670552,
"learning_rate": 5.176854580287744e-06,
"loss": 0.5636,
"step": 533
},
{
"epoch": 1.619746835443038,
"grad_norm": 0.1465477733257787,
"learning_rate": 5.159175431489424e-06,
"loss": 0.5622,
"step": 534
},
{
"epoch": 1.6227848101265823,
"grad_norm": 0.1452280680581016,
"learning_rate": 5.141494290415592e-06,
"loss": 0.5752,
"step": 535
},
{
"epoch": 1.6258227848101265,
"grad_norm": 0.1490460884264919,
"learning_rate": 5.123811378367387e-06,
"loss": 0.5956,
"step": 536
},
{
"epoch": 1.628860759493671,
"grad_norm": 0.14509109070781032,
"learning_rate": 5.106126916668118e-06,
"loss": 0.5698,
"step": 537
},
{
"epoch": 1.631898734177215,
"grad_norm": 0.13988927520941075,
"learning_rate": 5.088441126660484e-06,
"loss": 0.5576,
"step": 538
},
{
"epoch": 1.6349367088607596,
"grad_norm": 0.15090631504584412,
"learning_rate": 5.070754229703811e-06,
"loss": 0.5859,
"step": 539
},
{
"epoch": 1.6379746835443036,
"grad_norm": 0.14046022933676117,
"learning_rate": 5.053066447171282e-06,
"loss": 0.5364,
"step": 540
},
{
"epoch": 1.6410126582278481,
"grad_norm": 0.15029782080460274,
"learning_rate": 5.0353780004471605e-06,
"loss": 0.5898,
"step": 541
},
{
"epoch": 1.6440506329113924,
"grad_norm": 0.14468053902299152,
"learning_rate": 5.0176891109240265e-06,
"loss": 0.5559,
"step": 542
},
{
"epoch": 1.6470886075949367,
"grad_norm": 0.13603753933538643,
"learning_rate": 5e-06,
"loss": 0.5641,
"step": 543
},
{
"epoch": 1.650126582278481,
"grad_norm": 0.1312182710525947,
"learning_rate": 4.9823108890759735e-06,
"loss": 0.5781,
"step": 544
},
{
"epoch": 1.6531645569620252,
"grad_norm": 0.13893741157551903,
"learning_rate": 4.964621999552841e-06,
"loss": 0.5382,
"step": 545
},
{
"epoch": 1.6562025316455697,
"grad_norm": 0.1272389096785783,
"learning_rate": 4.94693355282872e-06,
"loss": 0.5537,
"step": 546
},
{
"epoch": 1.6592405063291138,
"grad_norm": 0.1484697926598053,
"learning_rate": 4.929245770296191e-06,
"loss": 0.5835,
"step": 547
},
{
"epoch": 1.6622784810126583,
"grad_norm": 0.13218383966226127,
"learning_rate": 4.911558873339517e-06,
"loss": 0.5722,
"step": 548
},
{
"epoch": 1.6653164556962026,
"grad_norm": 0.13912939261057347,
"learning_rate": 4.8938730833318825e-06,
"loss": 0.5168,
"step": 549
},
{
"epoch": 1.6683544303797468,
"grad_norm": 0.14285050385792192,
"learning_rate": 4.876188621632614e-06,
"loss": 0.5195,
"step": 550
},
{
"epoch": 1.6713924050632911,
"grad_norm": 0.14335550085150423,
"learning_rate": 4.85850570958441e-06,
"loss": 0.5643,
"step": 551
},
{
"epoch": 1.6744303797468354,
"grad_norm": 0.14417656814244476,
"learning_rate": 4.840824568510579e-06,
"loss": 0.5453,
"step": 552
},
{
"epoch": 1.67746835443038,
"grad_norm": 0.14091818725429653,
"learning_rate": 4.8231454197122575e-06,
"loss": 0.5499,
"step": 553
},
{
"epoch": 1.680506329113924,
"grad_norm": 0.13630949660068137,
"learning_rate": 4.805468484465651e-06,
"loss": 0.5529,
"step": 554
},
{
"epoch": 1.6835443037974684,
"grad_norm": 0.13452973158282802,
"learning_rate": 4.78779398401926e-06,
"loss": 0.5073,
"step": 555
},
{
"epoch": 1.6865822784810125,
"grad_norm": 0.13321837957700222,
"learning_rate": 4.770122139591103e-06,
"loss": 0.5437,
"step": 556
},
{
"epoch": 1.689620253164557,
"grad_norm": 0.14526224786375885,
"learning_rate": 4.752453172365966e-06,
"loss": 0.6003,
"step": 557
},
{
"epoch": 1.6926582278481013,
"grad_norm": 0.14880695833453583,
"learning_rate": 4.734787303492615e-06,
"loss": 0.5415,
"step": 558
},
{
"epoch": 1.6956962025316455,
"grad_norm": 0.1354671471518635,
"learning_rate": 4.717124754081038e-06,
"loss": 0.5415,
"step": 559
},
{
"epoch": 1.6987341772151898,
"grad_norm": 0.13901671698042392,
"learning_rate": 4.6994657451996815e-06,
"loss": 0.5451,
"step": 560
},
{
"epoch": 1.701772151898734,
"grad_norm": 0.14366775666503703,
"learning_rate": 4.6818104978726685e-06,
"loss": 0.5399,
"step": 561
},
{
"epoch": 1.7048101265822786,
"grad_norm": 0.137433696428405,
"learning_rate": 4.664159233077051e-06,
"loss": 0.5274,
"step": 562
},
{
"epoch": 1.7078481012658226,
"grad_norm": 0.1486497515378418,
"learning_rate": 4.646512171740028e-06,
"loss": 0.5506,
"step": 563
},
{
"epoch": 1.7108860759493671,
"grad_norm": 0.14299327049032062,
"learning_rate": 4.628869534736187e-06,
"loss": 0.5372,
"step": 564
},
{
"epoch": 1.7139240506329114,
"grad_norm": 0.1432222649388412,
"learning_rate": 4.611231542884747e-06,
"loss": 0.5628,
"step": 565
},
{
"epoch": 1.7169620253164557,
"grad_norm": 0.14210217228609906,
"learning_rate": 4.593598416946779e-06,
"loss": 0.554,
"step": 566
},
{
"epoch": 1.72,
"grad_norm": 0.14838285283850924,
"learning_rate": 4.575970377622456e-06,
"loss": 0.5561,
"step": 567
},
{
"epoch": 1.7230379746835442,
"grad_norm": 0.14626060214205217,
"learning_rate": 4.55834764554828e-06,
"loss": 0.5573,
"step": 568
},
{
"epoch": 1.7260759493670887,
"grad_norm": 0.13822402701812242,
"learning_rate": 4.540730441294334e-06,
"loss": 0.5474,
"step": 569
},
{
"epoch": 1.7291139240506328,
"grad_norm": 0.13990125190678557,
"learning_rate": 4.523118985361511e-06,
"loss": 0.5448,
"step": 570
},
{
"epoch": 1.7321518987341773,
"grad_norm": 0.14647795581607545,
"learning_rate": 4.505513498178752e-06,
"loss": 0.6005,
"step": 571
},
{
"epoch": 1.7351898734177216,
"grad_norm": 0.14114549792807873,
"learning_rate": 4.487914200100296e-06,
"loss": 0.5583,
"step": 572
},
{
"epoch": 1.7382278481012658,
"grad_norm": 0.14147578201123018,
"learning_rate": 4.47032131140292e-06,
"loss": 0.5493,
"step": 573
},
{
"epoch": 1.7412658227848101,
"grad_norm": 0.1461456067023322,
"learning_rate": 4.452735052283175e-06,
"loss": 0.5728,
"step": 574
},
{
"epoch": 1.7443037974683544,
"grad_norm": 0.14701108635229268,
"learning_rate": 4.435155642854637e-06,
"loss": 0.5389,
"step": 575
},
{
"epoch": 1.747341772151899,
"grad_norm": 0.14844626853283752,
"learning_rate": 4.4175833031451475e-06,
"loss": 0.5547,
"step": 576
},
{
"epoch": 1.750379746835443,
"grad_norm": 0.14268537466486175,
"learning_rate": 4.400018253094065e-06,
"loss": 0.5064,
"step": 577
},
{
"epoch": 1.7534177215189874,
"grad_norm": 0.1343337939680092,
"learning_rate": 4.38246071254951e-06,
"loss": 0.5081,
"step": 578
},
{
"epoch": 1.7564556962025315,
"grad_norm": 0.14358103354354712,
"learning_rate": 4.364910901265607e-06,
"loss": 0.5263,
"step": 579
},
{
"epoch": 1.759493670886076,
"grad_norm": 0.14075296925562727,
"learning_rate": 4.347369038899744e-06,
"loss": 0.5823,
"step": 580
},
{
"epoch": 1.7625316455696203,
"grad_norm": 0.15086719736162787,
"learning_rate": 4.329835345009813e-06,
"loss": 0.5728,
"step": 581
},
{
"epoch": 1.7655696202531646,
"grad_norm": 0.1482949289358636,
"learning_rate": 4.312310039051476e-06,
"loss": 0.5673,
"step": 582
},
{
"epoch": 1.7686075949367088,
"grad_norm": 0.14907940397518046,
"learning_rate": 4.294793340375405e-06,
"loss": 0.5605,
"step": 583
},
{
"epoch": 1.771645569620253,
"grad_norm": 0.13784144891569491,
"learning_rate": 4.2772854682245365e-06,
"loss": 0.5087,
"step": 584
},
{
"epoch": 1.7746835443037976,
"grad_norm": 0.1286612878135622,
"learning_rate": 4.259786641731344e-06,
"loss": 0.4975,
"step": 585
},
{
"epoch": 1.7777215189873417,
"grad_norm": 0.13947922425755688,
"learning_rate": 4.242297079915071e-06,
"loss": 0.5543,
"step": 586
},
{
"epoch": 1.7807594936708862,
"grad_norm": 0.13628866858565816,
"learning_rate": 4.224817001679011e-06,
"loss": 0.5616,
"step": 587
},
{
"epoch": 1.7837974683544304,
"grad_norm": 0.1449043124811266,
"learning_rate": 4.2073466258077564e-06,
"loss": 0.5507,
"step": 588
},
{
"epoch": 1.7868354430379747,
"grad_norm": 0.15523180294835678,
"learning_rate": 4.189886170964458e-06,
"loss": 0.5869,
"step": 589
},
{
"epoch": 1.789873417721519,
"grad_norm": 0.135928325073954,
"learning_rate": 4.172435855688101e-06,
"loss": 0.5294,
"step": 590
},
{
"epoch": 1.7929113924050633,
"grad_norm": 0.13255462114158595,
"learning_rate": 4.154995898390756e-06,
"loss": 0.5597,
"step": 591
},
{
"epoch": 1.7959493670886078,
"grad_norm": 0.14201226106350706,
"learning_rate": 4.13756651735485e-06,
"loss": 0.5635,
"step": 592
},
{
"epoch": 1.7989873417721518,
"grad_norm": 0.13625044615447665,
"learning_rate": 4.12014793073044e-06,
"loss": 0.5496,
"step": 593
},
{
"epoch": 1.8020253164556963,
"grad_norm": 0.13575015781553457,
"learning_rate": 4.102740356532474e-06,
"loss": 0.5426,
"step": 594
},
{
"epoch": 1.8050632911392404,
"grad_norm": 0.14705140164593614,
"learning_rate": 4.085344012638067e-06,
"loss": 0.537,
"step": 595
},
{
"epoch": 1.8081012658227849,
"grad_norm": 0.1409549906509916,
"learning_rate": 4.0679591167837725e-06,
"loss": 0.5631,
"step": 596
},
{
"epoch": 1.8111392405063291,
"grad_norm": 0.14079198651273092,
"learning_rate": 4.050585886562858e-06,
"loss": 0.5743,
"step": 597
},
{
"epoch": 1.8141772151898734,
"grad_norm": 0.1401708128735937,
"learning_rate": 4.033224539422584e-06,
"loss": 0.5572,
"step": 598
},
{
"epoch": 1.8172151898734177,
"grad_norm": 0.14493156119176476,
"learning_rate": 4.015875292661474e-06,
"loss": 0.5403,
"step": 599
},
{
"epoch": 1.820253164556962,
"grad_norm": 0.14715986914181306,
"learning_rate": 3.998538363426605e-06,
"loss": 0.548,
"step": 600
},
{
"epoch": 1.8232911392405065,
"grad_norm": 0.13644438323976482,
"learning_rate": 3.981213968710882e-06,
"loss": 0.5266,
"step": 601
},
{
"epoch": 1.8263291139240505,
"grad_norm": 0.1398253759857167,
"learning_rate": 3.96390232535033e-06,
"loss": 0.5456,
"step": 602
},
{
"epoch": 1.829367088607595,
"grad_norm": 0.14760211922260238,
"learning_rate": 3.94660365002137e-06,
"loss": 0.5765,
"step": 603
},
{
"epoch": 1.8324050632911393,
"grad_norm": 0.1418957483459648,
"learning_rate": 3.929318159238113e-06,
"loss": 0.542,
"step": 604
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.14314371065917036,
"learning_rate": 3.912046069349654e-06,
"loss": 0.5754,
"step": 605
},
{
"epoch": 1.8384810126582278,
"grad_norm": 0.13916767299258234,
"learning_rate": 3.894787596537352e-06,
"loss": 0.5126,
"step": 606
},
{
"epoch": 1.841518987341772,
"grad_norm": 0.1396512977982268,
"learning_rate": 3.877542956812137e-06,
"loss": 0.5511,
"step": 607
},
{
"epoch": 1.8445569620253166,
"grad_norm": 0.14749351134383998,
"learning_rate": 3.860312366011802e-06,
"loss": 0.5708,
"step": 608
},
{
"epoch": 1.8475949367088607,
"grad_norm": 0.1411435574639553,
"learning_rate": 3.843096039798293e-06,
"loss": 0.5753,
"step": 609
},
{
"epoch": 1.8506329113924052,
"grad_norm": 0.1428960506415565,
"learning_rate": 3.825894193655026e-06,
"loss": 0.5805,
"step": 610
},
{
"epoch": 1.8536708860759492,
"grad_norm": 0.14096006506759617,
"learning_rate": 3.808707042884176e-06,
"loss": 0.5566,
"step": 611
},
{
"epoch": 1.8567088607594937,
"grad_norm": 0.14385169997352637,
"learning_rate": 3.7915348026039877e-06,
"loss": 0.5354,
"step": 612
},
{
"epoch": 1.859746835443038,
"grad_norm": 0.14384954479264117,
"learning_rate": 3.7743776877460864e-06,
"loss": 0.5761,
"step": 613
},
{
"epoch": 1.8627848101265823,
"grad_norm": 0.14325377106229495,
"learning_rate": 3.757235913052778e-06,
"loss": 0.5609,
"step": 614
},
{
"epoch": 1.8658227848101265,
"grad_norm": 0.14392767974295542,
"learning_rate": 3.7401096930743753e-06,
"loss": 0.5351,
"step": 615
},
{
"epoch": 1.8688607594936708,
"grad_norm": 0.13860263164526795,
"learning_rate": 3.722999242166497e-06,
"loss": 0.5165,
"step": 616
},
{
"epoch": 1.8718987341772153,
"grad_norm": 0.14552207830586328,
"learning_rate": 3.705904774487396e-06,
"loss": 0.5412,
"step": 617
},
{
"epoch": 1.8749367088607594,
"grad_norm": 0.13798290117274228,
"learning_rate": 3.6888265039952796e-06,
"loss": 0.5509,
"step": 618
},
{
"epoch": 1.8779746835443039,
"grad_norm": 0.1492939489068322,
"learning_rate": 3.6717646444456196e-06,
"loss": 0.5634,
"step": 619
},
{
"epoch": 1.8810126582278481,
"grad_norm": 0.14346220201822088,
"learning_rate": 3.6547194093884907e-06,
"loss": 0.5306,
"step": 620
},
{
"epoch": 1.8840506329113924,
"grad_norm": 0.1372529739765176,
"learning_rate": 3.6376910121658867e-06,
"loss": 0.5237,
"step": 621
},
{
"epoch": 1.8870886075949367,
"grad_norm": 0.1476780967270864,
"learning_rate": 3.6206796659090605e-06,
"loss": 0.5516,
"step": 622
},
{
"epoch": 1.890126582278481,
"grad_norm": 0.13891558687222758,
"learning_rate": 3.60368558353585e-06,
"loss": 0.5453,
"step": 623
},
{
"epoch": 1.8931645569620255,
"grad_norm": 0.14645395153335597,
"learning_rate": 3.5867089777480124e-06,
"loss": 0.5599,
"step": 624
},
{
"epoch": 1.8962025316455695,
"grad_norm": 0.14582671520351428,
"learning_rate": 3.569750061028565e-06,
"loss": 0.5685,
"step": 625
},
{
"epoch": 1.899240506329114,
"grad_norm": 0.1427846190640779,
"learning_rate": 3.552809045639123e-06,
"loss": 0.5484,
"step": 626
},
{
"epoch": 1.902278481012658,
"grad_norm": 0.13630123350371642,
"learning_rate": 3.5358861436172487e-06,
"loss": 0.5207,
"step": 627
},
{
"epoch": 1.9053164556962026,
"grad_norm": 0.14311504681101872,
"learning_rate": 3.5189815667737916e-06,
"loss": 0.5644,
"step": 628
},
{
"epoch": 1.9083544303797468,
"grad_norm": 0.13509459795763357,
"learning_rate": 3.5020955266902344e-06,
"loss": 0.5471,
"step": 629
},
{
"epoch": 1.9113924050632911,
"grad_norm": 0.1450618320538534,
"learning_rate": 3.485228234716058e-06,
"loss": 0.5984,
"step": 630
},
{
"epoch": 1.9144303797468356,
"grad_norm": 0.13825705276927988,
"learning_rate": 3.4683799019660834e-06,
"loss": 0.5328,
"step": 631
},
{
"epoch": 1.9174683544303797,
"grad_norm": 0.14039425954035306,
"learning_rate": 3.4515507393178316e-06,
"loss": 0.5688,
"step": 632
},
{
"epoch": 1.9205063291139242,
"grad_norm": 0.13288585149565324,
"learning_rate": 3.4347409574088896e-06,
"loss": 0.5796,
"step": 633
},
{
"epoch": 1.9235443037974682,
"grad_norm": 0.148980738834977,
"learning_rate": 3.417950766634268e-06,
"loss": 0.5269,
"step": 634
},
{
"epoch": 1.9265822784810127,
"grad_norm": 0.14609255957393458,
"learning_rate": 3.401180377143774e-06,
"loss": 0.5677,
"step": 635
},
{
"epoch": 1.929620253164557,
"grad_norm": 0.1359215016501865,
"learning_rate": 3.3844299988393757e-06,
"loss": 0.5377,
"step": 636
},
{
"epoch": 1.9326582278481013,
"grad_norm": 0.13812680406064673,
"learning_rate": 3.3676998413725726e-06,
"loss": 0.5523,
"step": 637
},
{
"epoch": 1.9356962025316455,
"grad_norm": 0.1327702988731366,
"learning_rate": 3.3509901141417845e-06,
"loss": 0.5587,
"step": 638
},
{
"epoch": 1.9387341772151898,
"grad_norm": 0.14199245720615827,
"learning_rate": 3.3343010262897125e-06,
"loss": 0.5467,
"step": 639
},
{
"epoch": 1.9417721518987343,
"grad_norm": 0.14774749378135763,
"learning_rate": 3.3176327867007376e-06,
"loss": 0.5535,
"step": 640
},
{
"epoch": 1.9448101265822784,
"grad_norm": 0.15288003903087136,
"learning_rate": 3.300985603998296e-06,
"loss": 0.5837,
"step": 641
},
{
"epoch": 1.9478481012658229,
"grad_norm": 0.13722020959482253,
"learning_rate": 3.2843596865422687e-06,
"loss": 0.5248,
"step": 642
},
{
"epoch": 1.9508860759493671,
"grad_norm": 0.14417467655168203,
"learning_rate": 3.2677552424263836e-06,
"loss": 0.5339,
"step": 643
},
{
"epoch": 1.9539240506329114,
"grad_norm": 0.14278379325376636,
"learning_rate": 3.251172479475595e-06,
"loss": 0.5584,
"step": 644
},
{
"epoch": 1.9569620253164557,
"grad_norm": 0.14142202059314476,
"learning_rate": 3.234611605243496e-06,
"loss": 0.5383,
"step": 645
},
{
"epoch": 1.96,
"grad_norm": 0.14465300329222652,
"learning_rate": 3.2180728270097163e-06,
"loss": 0.5618,
"step": 646
},
{
"epoch": 1.9630379746835445,
"grad_norm": 0.1464687761772569,
"learning_rate": 3.2015563517773214e-06,
"loss": 0.5597,
"step": 647
},
{
"epoch": 1.9660759493670885,
"grad_norm": 0.1353865286420906,
"learning_rate": 3.1850623862702344e-06,
"loss": 0.5133,
"step": 648
},
{
"epoch": 1.969113924050633,
"grad_norm": 0.1454261849576213,
"learning_rate": 3.1685911369306364e-06,
"loss": 0.5922,
"step": 649
},
{
"epoch": 1.972151898734177,
"grad_norm": 0.14700261448719304,
"learning_rate": 3.1521428099163897e-06,
"loss": 0.5819,
"step": 650
},
{
"epoch": 1.9751898734177216,
"grad_norm": 0.14221751125407076,
"learning_rate": 3.1357176110984578e-06,
"loss": 0.5379,
"step": 651
},
{
"epoch": 1.9782278481012658,
"grad_norm": 0.14822785091064794,
"learning_rate": 3.1193157460583217e-06,
"loss": 0.582,
"step": 652
},
{
"epoch": 1.9812658227848101,
"grad_norm": 0.1412575132749595,
"learning_rate": 3.1029374200854167e-06,
"loss": 0.5356,
"step": 653
},
{
"epoch": 1.9843037974683544,
"grad_norm": 0.13340040841815526,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.529,
"step": 654
},
{
"epoch": 1.9873417721518987,
"grad_norm": 0.14471668869668178,
"learning_rate": 3.070252205023356e-06,
"loss": 0.5573,
"step": 655
},
{
"epoch": 1.9903797468354432,
"grad_norm": 0.1343280166917402,
"learning_rate": 3.0539457250297095e-06,
"loss": 0.517,
"step": 656
},
{
"epoch": 1.9934177215189872,
"grad_norm": 0.14472286407987167,
"learning_rate": 3.0376636022891813e-06,
"loss": 0.5618,
"step": 657
},
{
"epoch": 1.9964556962025317,
"grad_norm": 0.14382602624595334,
"learning_rate": 3.0214060405924863e-06,
"loss": 0.5265,
"step": 658
},
{
"epoch": 1.999493670886076,
"grad_norm": 0.14660808079965273,
"learning_rate": 3.0051732434229185e-06,
"loss": 0.5296,
"step": 659
},
{
"epoch": 2.0,
"grad_norm": 0.14660808079965273,
"learning_rate": 2.988965413953825e-06,
"loss": 0.5292,
"step": 660
},
{
"epoch": 2.0030379746835445,
"grad_norm": 0.3574667987342137,
"learning_rate": 2.972782755046043e-06,
"loss": 0.404,
"step": 661
},
{
"epoch": 2.0060759493670886,
"grad_norm": 0.1386853846579846,
"learning_rate": 2.956625469245372e-06,
"loss": 0.3793,
"step": 662
},
{
"epoch": 2.009113924050633,
"grad_norm": 0.14162490732848218,
"learning_rate": 2.9404937587800374e-06,
"loss": 0.3873,
"step": 663
},
{
"epoch": 2.012151898734177,
"grad_norm": 0.14617377605368997,
"learning_rate": 2.924387825558155e-06,
"loss": 0.369,
"step": 664
},
{
"epoch": 2.0151898734177216,
"grad_norm": 0.1366181219034659,
"learning_rate": 2.90830787116521e-06,
"loss": 0.3758,
"step": 665
},
{
"epoch": 2.0182278481012657,
"grad_norm": 0.14211468580081227,
"learning_rate": 2.892254096861529e-06,
"loss": 0.3897,
"step": 666
},
{
"epoch": 2.02126582278481,
"grad_norm": 0.14141985614727673,
"learning_rate": 2.8762267035797607e-06,
"loss": 0.3811,
"step": 667
},
{
"epoch": 2.0243037974683546,
"grad_norm": 0.13901272165728304,
"learning_rate": 2.8602258919223703e-06,
"loss": 0.4199,
"step": 668
},
{
"epoch": 2.0273417721518987,
"grad_norm": 0.1666829934946773,
"learning_rate": 2.8442518621591085e-06,
"loss": 0.3738,
"step": 669
},
{
"epoch": 2.030379746835443,
"grad_norm": 0.15163462826568602,
"learning_rate": 2.828304814224532e-06,
"loss": 0.4043,
"step": 670
},
{
"epoch": 2.0334177215189873,
"grad_norm": 0.1498595150029576,
"learning_rate": 2.8123849477154808e-06,
"loss": 0.3674,
"step": 671
},
{
"epoch": 2.0364556962025318,
"grad_norm": 0.16298861392155772,
"learning_rate": 2.796492461888578e-06,
"loss": 0.3606,
"step": 672
},
{
"epoch": 2.039493670886076,
"grad_norm": 0.1651702105333408,
"learning_rate": 2.7806275556577624e-06,
"loss": 0.3512,
"step": 673
},
{
"epoch": 2.0425316455696203,
"grad_norm": 0.15287039406201183,
"learning_rate": 2.764790427591759e-06,
"loss": 0.3949,
"step": 674
},
{
"epoch": 2.0455696202531644,
"grad_norm": 0.1475889681451995,
"learning_rate": 2.748981275911633e-06,
"loss": 0.3527,
"step": 675
},
{
"epoch": 2.048607594936709,
"grad_norm": 0.15041610036768438,
"learning_rate": 2.733200298488284e-06,
"loss": 0.3996,
"step": 676
},
{
"epoch": 2.0516455696202534,
"grad_norm": 0.14184260873433086,
"learning_rate": 2.7174476928399685e-06,
"loss": 0.3757,
"step": 677
},
{
"epoch": 2.0546835443037974,
"grad_norm": 0.14445511127114966,
"learning_rate": 2.701723656129851e-06,
"loss": 0.3388,
"step": 678
},
{
"epoch": 2.057721518987342,
"grad_norm": 0.13942685496204735,
"learning_rate": 2.6860283851635067e-06,
"loss": 0.3749,
"step": 679
},
{
"epoch": 2.060759493670886,
"grad_norm": 0.1434478459905437,
"learning_rate": 2.670362076386478e-06,
"loss": 0.3681,
"step": 680
},
{
"epoch": 2.0637974683544305,
"grad_norm": 0.1419461369196199,
"learning_rate": 2.6547249258818162e-06,
"loss": 0.3771,
"step": 681
},
{
"epoch": 2.0668354430379745,
"grad_norm": 0.14415690241953952,
"learning_rate": 2.6391171293676077e-06,
"loss": 0.3895,
"step": 682
},
{
"epoch": 2.069873417721519,
"grad_norm": 0.14211032871865661,
"learning_rate": 2.6235388821945497e-06,
"loss": 0.345,
"step": 683
},
{
"epoch": 2.0729113924050635,
"grad_norm": 0.14068917338536288,
"learning_rate": 2.607990379343489e-06,
"loss": 0.3447,
"step": 684
},
{
"epoch": 2.0759493670886076,
"grad_norm": 0.14055662228238847,
"learning_rate": 2.59247181542298e-06,
"loss": 0.3763,
"step": 685
},
{
"epoch": 2.078987341772152,
"grad_norm": 0.14124406243142484,
"learning_rate": 2.576983384666867e-06,
"loss": 0.3521,
"step": 686
},
{
"epoch": 2.082025316455696,
"grad_norm": 0.14291453835375909,
"learning_rate": 2.5615252809318287e-06,
"loss": 0.3582,
"step": 687
},
{
"epoch": 2.0850632911392406,
"grad_norm": 0.1460732385329124,
"learning_rate": 2.5460976976949686e-06,
"loss": 0.3709,
"step": 688
},
{
"epoch": 2.0881012658227847,
"grad_norm": 0.14485653716892255,
"learning_rate": 2.5307008280513956e-06,
"loss": 0.3588,
"step": 689
},
{
"epoch": 2.091139240506329,
"grad_norm": 0.1465911852717234,
"learning_rate": 2.515334864711786e-06,
"loss": 0.3771,
"step": 690
},
{
"epoch": 2.094177215189873,
"grad_norm": 0.13197544386108068,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3692,
"step": 691
},
{
"epoch": 2.0972151898734177,
"grad_norm": 0.15871811724241183,
"learning_rate": 2.484696425850653e-06,
"loss": 0.3685,
"step": 692
},
{
"epoch": 2.100253164556962,
"grad_norm": 0.13945520601609362,
"learning_rate": 2.469424333806718e-06,
"loss": 0.3482,
"step": 693
},
{
"epoch": 2.1032911392405063,
"grad_norm": 0.1431696802686398,
"learning_rate": 2.454183915017142e-06,
"loss": 0.3681,
"step": 694
},
{
"epoch": 2.1063291139240508,
"grad_norm": 0.14147201785437902,
"learning_rate": 2.4389753602344298e-06,
"loss": 0.3589,
"step": 695
},
{
"epoch": 2.109367088607595,
"grad_norm": 0.1413089483297598,
"learning_rate": 2.423798859812275e-06,
"loss": 0.3657,
"step": 696
},
{
"epoch": 2.1124050632911393,
"grad_norm": 0.14505692905921022,
"learning_rate": 2.4086546037031734e-06,
"loss": 0.3826,
"step": 697
},
{
"epoch": 2.1154430379746834,
"grad_norm": 0.14175858850346926,
"learning_rate": 2.393542781456038e-06,
"loss": 0.3625,
"step": 698
},
{
"epoch": 2.118481012658228,
"grad_norm": 0.14145320968168484,
"learning_rate": 2.3784635822138424e-06,
"loss": 0.367,
"step": 699
},
{
"epoch": 2.1215189873417724,
"grad_norm": 0.14472698977269888,
"learning_rate": 2.3634171947112307e-06,
"loss": 0.3977,
"step": 700
},
{
"epoch": 2.1245569620253164,
"grad_norm": 0.14060009486280914,
"learning_rate": 2.348403807272176e-06,
"loss": 0.3654,
"step": 701
},
{
"epoch": 2.127594936708861,
"grad_norm": 0.1323369322662927,
"learning_rate": 2.3334236078076126e-06,
"loss": 0.374,
"step": 702
},
{
"epoch": 2.130632911392405,
"grad_norm": 0.13871843025606753,
"learning_rate": 2.318476783813088e-06,
"loss": 0.3516,
"step": 703
},
{
"epoch": 2.1336708860759495,
"grad_norm": 0.13665483983345827,
"learning_rate": 2.3035635223664136e-06,
"loss": 0.3788,
"step": 704
},
{
"epoch": 2.1367088607594935,
"grad_norm": 0.14156145083764254,
"learning_rate": 2.288684010125325e-06,
"loss": 0.3719,
"step": 705
},
{
"epoch": 2.139746835443038,
"grad_norm": 0.1416005014074393,
"learning_rate": 2.2738384333251447e-06,
"loss": 0.3458,
"step": 706
},
{
"epoch": 2.1427848101265825,
"grad_norm": 0.1395435783001548,
"learning_rate": 2.2590269777764516e-06,
"loss": 0.3997,
"step": 707
},
{
"epoch": 2.1458227848101266,
"grad_norm": 0.14129712405820286,
"learning_rate": 2.2442498288627555e-06,
"loss": 0.3772,
"step": 708
},
{
"epoch": 2.148860759493671,
"grad_norm": 0.13423105798210677,
"learning_rate": 2.229507171538178e-06,
"loss": 0.3562,
"step": 709
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.1398912160788065,
"learning_rate": 2.214799190325133e-06,
"loss": 0.3465,
"step": 710
},
{
"epoch": 2.1549367088607596,
"grad_norm": 0.13951688351084254,
"learning_rate": 2.2001260693120236e-06,
"loss": 0.3606,
"step": 711
},
{
"epoch": 2.1579746835443037,
"grad_norm": 0.13894816262705856,
"learning_rate": 2.185487992150933e-06,
"loss": 0.3653,
"step": 712
},
{
"epoch": 2.161012658227848,
"grad_norm": 0.14021673321330894,
"learning_rate": 2.1708851420553277e-06,
"loss": 0.3518,
"step": 713
},
{
"epoch": 2.164050632911392,
"grad_norm": 0.14815706787540678,
"learning_rate": 2.156317701797766e-06,
"loss": 0.3618,
"step": 714
},
{
"epoch": 2.1670886075949367,
"grad_norm": 0.1442615360609417,
"learning_rate": 2.141785853707607e-06,
"loss": 0.3814,
"step": 715
},
{
"epoch": 2.170126582278481,
"grad_norm": 0.14693744202965667,
"learning_rate": 2.12728977966873e-06,
"loss": 0.3702,
"step": 716
},
{
"epoch": 2.1731645569620253,
"grad_norm": 0.1417303529290372,
"learning_rate": 2.1128296611172593e-06,
"loss": 0.3511,
"step": 717
},
{
"epoch": 2.1762025316455698,
"grad_norm": 0.14179184653741922,
"learning_rate": 2.0984056790392926e-06,
"loss": 0.3635,
"step": 718
},
{
"epoch": 2.179240506329114,
"grad_norm": 0.13988953083591674,
"learning_rate": 2.0840180139686333e-06,
"loss": 0.3596,
"step": 719
},
{
"epoch": 2.1822784810126583,
"grad_norm": 0.13593635557199768,
"learning_rate": 2.0696668459845354e-06,
"loss": 0.3688,
"step": 720
},
{
"epoch": 2.1853164556962024,
"grad_norm": 0.1452918041182462,
"learning_rate": 2.0553523547094473e-06,
"loss": 0.4024,
"step": 721
},
{
"epoch": 2.188354430379747,
"grad_norm": 0.13179141412839,
"learning_rate": 2.041074719306757e-06,
"loss": 0.3883,
"step": 722
},
{
"epoch": 2.191392405063291,
"grad_norm": 0.14527521731366982,
"learning_rate": 2.0268341184785674e-06,
"loss": 0.3586,
"step": 723
},
{
"epoch": 2.1944303797468354,
"grad_norm": 0.141640009335079,
"learning_rate": 2.0126307304634383e-06,
"loss": 0.3418,
"step": 724
},
{
"epoch": 2.19746835443038,
"grad_norm": 0.13534271988208676,
"learning_rate": 1.998464733034172e-06,
"loss": 0.3542,
"step": 725
},
{
"epoch": 2.200506329113924,
"grad_norm": 0.13937271639571533,
"learning_rate": 1.98433630349558e-06,
"loss": 0.3487,
"step": 726
},
{
"epoch": 2.2035443037974685,
"grad_norm": 0.14219916038892946,
"learning_rate": 1.9702456186822595e-06,
"loss": 0.3484,
"step": 727
},
{
"epoch": 2.2065822784810125,
"grad_norm": 0.13660123224575374,
"learning_rate": 1.956192854956397e-06,
"loss": 0.3503,
"step": 728
},
{
"epoch": 2.209620253164557,
"grad_norm": 0.1409506648161819,
"learning_rate": 1.9421781882055447e-06,
"loss": 0.373,
"step": 729
},
{
"epoch": 2.212658227848101,
"grad_norm": 0.14047713525681946,
"learning_rate": 1.9282017938404202e-06,
"loss": 0.3721,
"step": 730
},
{
"epoch": 2.2156962025316456,
"grad_norm": 0.139451374909164,
"learning_rate": 1.9142638467927254e-06,
"loss": 0.372,
"step": 731
},
{
"epoch": 2.21873417721519,
"grad_norm": 0.14095477401855694,
"learning_rate": 1.9003645215129356e-06,
"loss": 0.3914,
"step": 732
},
{
"epoch": 2.221772151898734,
"grad_norm": 0.15446900783196774,
"learning_rate": 1.8865039919681377e-06,
"loss": 0.3807,
"step": 733
},
{
"epoch": 2.2248101265822786,
"grad_norm": 0.14673994714173552,
"learning_rate": 1.8726824316398372e-06,
"loss": 0.3609,
"step": 734
},
{
"epoch": 2.2278481012658227,
"grad_norm": 0.14758184686530929,
"learning_rate": 1.8589000135217882e-06,
"loss": 0.3395,
"step": 735
},
{
"epoch": 2.230886075949367,
"grad_norm": 0.14211141269917668,
"learning_rate": 1.845156910117843e-06,
"loss": 0.3821,
"step": 736
},
{
"epoch": 2.233924050632911,
"grad_norm": 0.14513257772004282,
"learning_rate": 1.831453293439771e-06,
"loss": 0.3899,
"step": 737
},
{
"epoch": 2.2369620253164557,
"grad_norm": 0.14360399357832268,
"learning_rate": 1.8177893350051213e-06,
"loss": 0.3599,
"step": 738
},
{
"epoch": 2.24,
"grad_norm": 0.1421345505264031,
"learning_rate": 1.8041652058350768e-06,
"loss": 0.3604,
"step": 739
},
{
"epoch": 2.2430379746835443,
"grad_norm": 0.13312641259007316,
"learning_rate": 1.7905810764522963e-06,
"loss": 0.3654,
"step": 740
},
{
"epoch": 2.2460759493670888,
"grad_norm": 0.14218988322941647,
"learning_rate": 1.7770371168788042e-06,
"loss": 0.3559,
"step": 741
},
{
"epoch": 2.249113924050633,
"grad_norm": 0.14208321089497616,
"learning_rate": 1.7635334966338463e-06,
"loss": 0.3517,
"step": 742
},
{
"epoch": 2.2521518987341773,
"grad_norm": 0.14127201594720978,
"learning_rate": 1.7500703847317663e-06,
"loss": 0.3995,
"step": 743
},
{
"epoch": 2.2551898734177214,
"grad_norm": 0.14581924224244086,
"learning_rate": 1.7366479496799076e-06,
"loss": 0.3474,
"step": 744
},
{
"epoch": 2.258227848101266,
"grad_norm": 0.1402244780578575,
"learning_rate": 1.723266359476483e-06,
"loss": 0.3411,
"step": 745
},
{
"epoch": 2.26126582278481,
"grad_norm": 0.1384075219093103,
"learning_rate": 1.7099257816084851e-06,
"loss": 0.377,
"step": 746
},
{
"epoch": 2.2643037974683544,
"grad_norm": 0.14542196888379869,
"learning_rate": 1.6966263830495939e-06,
"loss": 0.3733,
"step": 747
},
{
"epoch": 2.267341772151899,
"grad_norm": 0.14061074341979693,
"learning_rate": 1.6833683302580661e-06,
"loss": 0.3611,
"step": 748
},
{
"epoch": 2.270379746835443,
"grad_norm": 0.14151399997222402,
"learning_rate": 1.6701517891746805e-06,
"loss": 0.3675,
"step": 749
},
{
"epoch": 2.2734177215189875,
"grad_norm": 0.14743297739700706,
"learning_rate": 1.656976925220633e-06,
"loss": 0.359,
"step": 750
},
{
"epoch": 2.2764556962025315,
"grad_norm": 0.139782212285992,
"learning_rate": 1.6438439032954857e-06,
"loss": 0.3747,
"step": 751
},
{
"epoch": 2.279493670886076,
"grad_norm": 0.13955135512445044,
"learning_rate": 1.6307528877751e-06,
"loss": 0.3526,
"step": 752
},
{
"epoch": 2.28253164556962,
"grad_norm": 0.13638102492670431,
"learning_rate": 1.6177040425095664e-06,
"loss": 0.3791,
"step": 753
},
{
"epoch": 2.2855696202531646,
"grad_norm": 0.13722953132884094,
"learning_rate": 1.6046975308211699e-06,
"loss": 0.3549,
"step": 754
},
{
"epoch": 2.2886075949367086,
"grad_norm": 0.13730644404932044,
"learning_rate": 1.5917335155023368e-06,
"loss": 0.3688,
"step": 755
},
{
"epoch": 2.291645569620253,
"grad_norm": 0.152856980626857,
"learning_rate": 1.5788121588135975e-06,
"loss": 0.3652,
"step": 756
},
{
"epoch": 2.2946835443037976,
"grad_norm": 0.1393588461745144,
"learning_rate": 1.5659336224815642e-06,
"loss": 0.3606,
"step": 757
},
{
"epoch": 2.2977215189873417,
"grad_norm": 0.1352559540583668,
"learning_rate": 1.553098067696891e-06,
"loss": 0.3598,
"step": 758
},
{
"epoch": 2.300759493670886,
"grad_norm": 0.13975979693883497,
"learning_rate": 1.5403056551122697e-06,
"loss": 0.3562,
"step": 759
},
{
"epoch": 2.3037974683544302,
"grad_norm": 0.14082699563333942,
"learning_rate": 1.5275565448404146e-06,
"loss": 0.3632,
"step": 760
},
{
"epoch": 2.3068354430379747,
"grad_norm": 0.1444199782676381,
"learning_rate": 1.5148508964520586e-06,
"loss": 0.3676,
"step": 761
},
{
"epoch": 2.309873417721519,
"grad_norm": 0.14939812391967536,
"learning_rate": 1.502188868973955e-06,
"loss": 0.39,
"step": 762
},
{
"epoch": 2.3129113924050633,
"grad_norm": 0.14827888289725544,
"learning_rate": 1.4895706208868876e-06,
"loss": 0.3698,
"step": 763
},
{
"epoch": 2.3159493670886078,
"grad_norm": 0.14193235148758998,
"learning_rate": 1.4769963101236894e-06,
"loss": 0.3477,
"step": 764
},
{
"epoch": 2.318987341772152,
"grad_norm": 0.14253457252980234,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.3658,
"step": 765
},
{
"epoch": 2.3220253164556963,
"grad_norm": 0.14584547496813957,
"learning_rate": 1.4519801295486102e-06,
"loss": 0.381,
"step": 766
},
{
"epoch": 2.3250632911392404,
"grad_norm": 0.1382275600719031,
"learning_rate": 1.439538572844873e-06,
"loss": 0.3445,
"step": 767
},
{
"epoch": 2.328101265822785,
"grad_norm": 0.1390590172326061,
"learning_rate": 1.4271415796773742e-06,
"loss": 0.3788,
"step": 768
},
{
"epoch": 2.331139240506329,
"grad_norm": 0.14109343766879234,
"learning_rate": 1.4147893052096684e-06,
"loss": 0.3591,
"step": 769
},
{
"epoch": 2.3341772151898734,
"grad_norm": 0.14542614353397915,
"learning_rate": 1.4024819040456023e-06,
"loss": 0.3777,
"step": 770
},
{
"epoch": 2.337215189873418,
"grad_norm": 0.13499097512948352,
"learning_rate": 1.390219530227378e-06,
"loss": 0.3611,
"step": 771
},
{
"epoch": 2.340253164556962,
"grad_norm": 0.14731429626870976,
"learning_rate": 1.378002337233625e-06,
"loss": 0.3687,
"step": 772
},
{
"epoch": 2.3432911392405065,
"grad_norm": 0.14898716888323726,
"learning_rate": 1.3658304779774784e-06,
"loss": 0.3948,
"step": 773
},
{
"epoch": 2.3463291139240505,
"grad_norm": 0.14050750727274172,
"learning_rate": 1.3537041048046696e-06,
"loss": 0.3463,
"step": 774
},
{
"epoch": 2.349367088607595,
"grad_norm": 0.14023730470046572,
"learning_rate": 1.3416233694916086e-06,
"loss": 0.369,
"step": 775
},
{
"epoch": 2.352405063291139,
"grad_norm": 0.14551200257588823,
"learning_rate": 1.3295884232435008e-06,
"loss": 0.3436,
"step": 776
},
{
"epoch": 2.3554430379746836,
"grad_norm": 0.14391540661810454,
"learning_rate": 1.3175994166924394e-06,
"loss": 0.3531,
"step": 777
},
{
"epoch": 2.3584810126582276,
"grad_norm": 0.13062333222456327,
"learning_rate": 1.3056564998955274e-06,
"loss": 0.3323,
"step": 778
},
{
"epoch": 2.361518987341772,
"grad_norm": 0.1385906330087921,
"learning_rate": 1.2937598223330006e-06,
"loss": 0.3611,
"step": 779
},
{
"epoch": 2.3645569620253166,
"grad_norm": 0.14173852169646894,
"learning_rate": 1.2819095329063469e-06,
"loss": 0.3573,
"step": 780
},
{
"epoch": 2.3675949367088607,
"grad_norm": 0.1395324012967561,
"learning_rate": 1.2701057799364591e-06,
"loss": 0.3744,
"step": 781
},
{
"epoch": 2.370632911392405,
"grad_norm": 0.15051070985075915,
"learning_rate": 1.2583487111617647e-06,
"loss": 0.3641,
"step": 782
},
{
"epoch": 2.3736708860759492,
"grad_norm": 0.14670986093390542,
"learning_rate": 1.246638473736378e-06,
"loss": 0.3439,
"step": 783
},
{
"epoch": 2.3767088607594937,
"grad_norm": 0.1457775446421222,
"learning_rate": 1.2349752142282706e-06,
"loss": 0.3828,
"step": 784
},
{
"epoch": 2.379746835443038,
"grad_norm": 0.14253862050319224,
"learning_rate": 1.223359078617416e-06,
"loss": 0.3667,
"step": 785
},
{
"epoch": 2.3827848101265823,
"grad_norm": 0.14582245384466594,
"learning_rate": 1.2117902122939861e-06,
"loss": 0.3515,
"step": 786
},
{
"epoch": 2.3858227848101268,
"grad_norm": 0.1396466440135159,
"learning_rate": 1.2002687600565138e-06,
"loss": 0.3392,
"step": 787
},
{
"epoch": 2.388860759493671,
"grad_norm": 0.14497747521117874,
"learning_rate": 1.1887948661100833e-06,
"loss": 0.3679,
"step": 788
},
{
"epoch": 2.3918987341772153,
"grad_norm": 0.1400434061550448,
"learning_rate": 1.1773686740645384e-06,
"loss": 0.3782,
"step": 789
},
{
"epoch": 2.3949367088607594,
"grad_norm": 0.15068552522031847,
"learning_rate": 1.165990326932665e-06,
"loss": 0.3982,
"step": 790
},
{
"epoch": 2.397974683544304,
"grad_norm": 0.14464648985072429,
"learning_rate": 1.1546599671284158e-06,
"loss": 0.382,
"step": 791
},
{
"epoch": 2.401012658227848,
"grad_norm": 0.14150707244836175,
"learning_rate": 1.1433777364651272e-06,
"loss": 0.3692,
"step": 792
},
{
"epoch": 2.4040506329113924,
"grad_norm": 0.14598665701127156,
"learning_rate": 1.1321437761537307e-06,
"loss": 0.3608,
"step": 793
},
{
"epoch": 2.407088607594937,
"grad_norm": 0.13971211615116638,
"learning_rate": 1.1209582268010056e-06,
"loss": 0.3575,
"step": 794
},
{
"epoch": 2.410126582278481,
"grad_norm": 0.14477123446370144,
"learning_rate": 1.1098212284078037e-06,
"loss": 0.3542,
"step": 795
},
{
"epoch": 2.4131645569620255,
"grad_norm": 0.14076240365795126,
"learning_rate": 1.098732920367298e-06,
"loss": 0.3545,
"step": 796
},
{
"epoch": 2.4162025316455695,
"grad_norm": 0.1405893390604257,
"learning_rate": 1.0876934414632523e-06,
"loss": 0.3741,
"step": 797
},
{
"epoch": 2.419240506329114,
"grad_norm": 0.14279215153613262,
"learning_rate": 1.0767029298682642e-06,
"loss": 0.3448,
"step": 798
},
{
"epoch": 2.422278481012658,
"grad_norm": 0.13748561916619084,
"learning_rate": 1.0657615231420492e-06,
"loss": 0.3458,
"step": 799
},
{
"epoch": 2.4253164556962026,
"grad_norm": 0.14494068184127612,
"learning_rate": 1.0548693582297203e-06,
"loss": 0.353,
"step": 800
},
{
"epoch": 2.4283544303797466,
"grad_norm": 0.14224661128262228,
"learning_rate": 1.0440265714600573e-06,
"loss": 0.3619,
"step": 801
},
{
"epoch": 2.431392405063291,
"grad_norm": 0.1422301175985462,
"learning_rate": 1.0332332985438248e-06,
"loss": 0.3786,
"step": 802
},
{
"epoch": 2.4344303797468356,
"grad_norm": 0.14197811091570336,
"learning_rate": 1.0224896745720513e-06,
"loss": 0.3655,
"step": 803
},
{
"epoch": 2.4374683544303797,
"grad_norm": 0.1408337485003354,
"learning_rate": 1.0117958340143508e-06,
"loss": 0.3962,
"step": 804
},
{
"epoch": 2.440506329113924,
"grad_norm": 0.14198169733129237,
"learning_rate": 1.0011519107172413e-06,
"loss": 0.3544,
"step": 805
},
{
"epoch": 2.4435443037974682,
"grad_norm": 0.14765256400357799,
"learning_rate": 9.905580379024581e-07,
"loss": 0.3519,
"step": 806
},
{
"epoch": 2.4465822784810127,
"grad_norm": 0.13305260404010058,
"learning_rate": 9.80014348165298e-07,
"loss": 0.346,
"step": 807
},
{
"epoch": 2.449620253164557,
"grad_norm": 0.1435729151110055,
"learning_rate": 9.695209734729533e-07,
"loss": 0.3686,
"step": 808
},
{
"epoch": 2.4526582278481013,
"grad_norm": 0.1493962374386908,
"learning_rate": 9.590780451628617e-07,
"loss": 0.3647,
"step": 809
},
{
"epoch": 2.4556962025316453,
"grad_norm": 0.13608529098836616,
"learning_rate": 9.486856939410672e-07,
"loss": 0.3735,
"step": 810
},
{
"epoch": 2.45873417721519,
"grad_norm": 0.14503694981671503,
"learning_rate": 9.383440498805712e-07,
"loss": 0.3489,
"step": 811
},
{
"epoch": 2.4617721518987343,
"grad_norm": 0.14583750885660682,
"learning_rate": 9.280532424197192e-07,
"loss": 0.3675,
"step": 812
},
{
"epoch": 2.4648101265822784,
"grad_norm": 0.1447589564816734,
"learning_rate": 9.178134003605721e-07,
"loss": 0.3639,
"step": 813
},
{
"epoch": 2.467848101265823,
"grad_norm": 0.14148984313136934,
"learning_rate": 9.076246518672971e-07,
"loss": 0.3619,
"step": 814
},
{
"epoch": 2.470886075949367,
"grad_norm": 0.14750236977694262,
"learning_rate": 8.974871244645628e-07,
"loss": 0.3552,
"step": 815
},
{
"epoch": 2.4739240506329114,
"grad_norm": 0.14286668133865993,
"learning_rate": 8.874009450359428e-07,
"loss": 0.3898,
"step": 816
},
{
"epoch": 2.476962025316456,
"grad_norm": 0.1484301811121413,
"learning_rate": 8.773662398223276e-07,
"loss": 0.3748,
"step": 817
},
{
"epoch": 2.48,
"grad_norm": 0.14341606721481867,
"learning_rate": 8.673831344203454e-07,
"loss": 0.3498,
"step": 818
},
{
"epoch": 2.4830379746835445,
"grad_norm": 0.14189771292479822,
"learning_rate": 8.574517537807897e-07,
"loss": 0.3587,
"step": 819
},
{
"epoch": 2.4860759493670885,
"grad_norm": 0.14305942408590702,
"learning_rate": 8.475722222070542e-07,
"loss": 0.3649,
"step": 820
},
{
"epoch": 2.489113924050633,
"grad_norm": 0.14404611768972017,
"learning_rate": 8.377446633535797e-07,
"loss": 0.354,
"step": 821
},
{
"epoch": 2.492151898734177,
"grad_norm": 0.1406676353514383,
"learning_rate": 8.279692002243028e-07,
"loss": 0.3702,
"step": 822
},
{
"epoch": 2.4951898734177216,
"grad_norm": 0.14167576764446896,
"learning_rate": 8.182459551711197e-07,
"loss": 0.3483,
"step": 823
},
{
"epoch": 2.4982278481012656,
"grad_norm": 0.1413673019085962,
"learning_rate": 8.085750498923528e-07,
"loss": 0.3613,
"step": 824
},
{
"epoch": 2.50126582278481,
"grad_norm": 0.13949858187507314,
"learning_rate": 7.989566054312286e-07,
"loss": 0.3588,
"step": 825
},
{
"epoch": 2.5043037974683546,
"grad_norm": 0.13399060946873256,
"learning_rate": 7.893907421743613e-07,
"loss": 0.3508,
"step": 826
},
{
"epoch": 2.5073417721518987,
"grad_norm": 0.14815186524448404,
"learning_rate": 7.798775798502484e-07,
"loss": 0.3753,
"step": 827
},
{
"epoch": 2.510379746835443,
"grad_norm": 0.14107003221585707,
"learning_rate": 7.704172375277691e-07,
"loss": 0.353,
"step": 828
},
{
"epoch": 2.5134177215189872,
"grad_norm": 0.12880402651650627,
"learning_rate": 7.610098336146965e-07,
"loss": 0.3426,
"step": 829
},
{
"epoch": 2.5164556962025317,
"grad_norm": 0.13893882279621997,
"learning_rate": 7.516554858562142e-07,
"loss": 0.3742,
"step": 830
},
{
"epoch": 2.519493670886076,
"grad_norm": 0.14133937833940632,
"learning_rate": 7.423543113334436e-07,
"loss": 0.353,
"step": 831
},
{
"epoch": 2.5225316455696203,
"grad_norm": 0.1419575084368612,
"learning_rate": 7.331064264619786e-07,
"loss": 0.3679,
"step": 832
},
{
"epoch": 2.5255696202531643,
"grad_norm": 0.14081615107573495,
"learning_rate": 7.239119469904227e-07,
"loss": 0.3463,
"step": 833
},
{
"epoch": 2.528607594936709,
"grad_norm": 0.1346841225757085,
"learning_rate": 7.147709879989539e-07,
"loss": 0.3512,
"step": 834
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.14935032059019177,
"learning_rate": 7.056836638978698e-07,
"loss": 0.3857,
"step": 835
},
{
"epoch": 2.5346835443037974,
"grad_norm": 0.1439669767055388,
"learning_rate": 6.966500884261635e-07,
"loss": 0.3654,
"step": 836
},
{
"epoch": 2.537721518987342,
"grad_norm": 0.1364128007511732,
"learning_rate": 6.876703746500984e-07,
"loss": 0.3241,
"step": 837
},
{
"epoch": 2.540759493670886,
"grad_norm": 0.14254432013124188,
"learning_rate": 6.787446349617899e-07,
"loss": 0.3802,
"step": 838
},
{
"epoch": 2.5437974683544304,
"grad_norm": 0.1348271565130609,
"learning_rate": 6.698729810778065e-07,
"loss": 0.3606,
"step": 839
},
{
"epoch": 2.546835443037975,
"grad_norm": 0.1395010006488868,
"learning_rate": 6.610555240377653e-07,
"loss": 0.3776,
"step": 840
},
{
"epoch": 2.549873417721519,
"grad_norm": 0.15111108270200227,
"learning_rate": 6.522923742029374e-07,
"loss": 0.3711,
"step": 841
},
{
"epoch": 2.552911392405063,
"grad_norm": 0.13608547533374346,
"learning_rate": 6.435836412548835e-07,
"loss": 0.3516,
"step": 842
},
{
"epoch": 2.5559493670886075,
"grad_norm": 0.13381246094662583,
"learning_rate": 6.349294341940593e-07,
"loss": 0.351,
"step": 843
},
{
"epoch": 2.558987341772152,
"grad_norm": 0.14005213416771026,
"learning_rate": 6.263298613384705e-07,
"loss": 0.3508,
"step": 844
},
{
"epoch": 2.562025316455696,
"grad_norm": 0.13085523859476353,
"learning_rate": 6.177850303223059e-07,
"loss": 0.3661,
"step": 845
},
{
"epoch": 2.5650632911392406,
"grad_norm": 0.14060955175856465,
"learning_rate": 6.092950480945897e-07,
"loss": 0.3472,
"step": 846
},
{
"epoch": 2.5681012658227846,
"grad_norm": 0.14813730552847526,
"learning_rate": 6.008600209178539e-07,
"loss": 0.3531,
"step": 847
},
{
"epoch": 2.571139240506329,
"grad_norm": 0.15000846362334494,
"learning_rate": 5.92480054366793e-07,
"loss": 0.3681,
"step": 848
},
{
"epoch": 2.5741772151898736,
"grad_norm": 0.14334858570822884,
"learning_rate": 5.841552533269534e-07,
"loss": 0.34,
"step": 849
},
{
"epoch": 2.5772151898734177,
"grad_norm": 0.1314990905808941,
"learning_rate": 5.75885721993421e-07,
"loss": 0.357,
"step": 850
},
{
"epoch": 2.580253164556962,
"grad_norm": 0.1461629388988376,
"learning_rate": 5.676715638695063e-07,
"loss": 0.331,
"step": 851
},
{
"epoch": 2.5832911392405062,
"grad_norm": 0.137611905623909,
"learning_rate": 5.595128817654638e-07,
"loss": 0.3517,
"step": 852
},
{
"epoch": 2.5863291139240507,
"grad_norm": 0.14882425269799035,
"learning_rate": 5.514097777971939e-07,
"loss": 0.3599,
"step": 853
},
{
"epoch": 2.589367088607595,
"grad_norm": 0.13474588857180161,
"learning_rate": 5.433623533849658e-07,
"loss": 0.3518,
"step": 854
},
{
"epoch": 2.5924050632911393,
"grad_norm": 0.13467344047962448,
"learning_rate": 5.353707092521581e-07,
"loss": 0.3463,
"step": 855
},
{
"epoch": 2.5954430379746833,
"grad_norm": 0.13822903597871414,
"learning_rate": 5.274349454239836e-07,
"loss": 0.3651,
"step": 856
},
{
"epoch": 2.598481012658228,
"grad_norm": 0.13473144890924296,
"learning_rate": 5.195551612262478e-07,
"loss": 0.3591,
"step": 857
},
{
"epoch": 2.6015189873417723,
"grad_norm": 0.14778113657864514,
"learning_rate": 5.117314552841052e-07,
"loss": 0.3952,
"step": 858
},
{
"epoch": 2.6045569620253164,
"grad_norm": 0.15168402166456194,
"learning_rate": 5.039639255208156e-07,
"loss": 0.3542,
"step": 859
},
{
"epoch": 2.607594936708861,
"grad_norm": 0.14600363407620232,
"learning_rate": 4.962526691565333e-07,
"loss": 0.3357,
"step": 860
},
{
"epoch": 2.610632911392405,
"grad_norm": 0.1282479097902379,
"learning_rate": 4.885977827070748e-07,
"loss": 0.3379,
"step": 861
},
{
"epoch": 2.6136708860759494,
"grad_norm": 0.14419548611833952,
"learning_rate": 4.809993619827203e-07,
"loss": 0.3426,
"step": 862
},
{
"epoch": 2.616708860759494,
"grad_norm": 0.13407436879878423,
"learning_rate": 4.734575020870169e-07,
"loss": 0.3805,
"step": 863
},
{
"epoch": 2.619746835443038,
"grad_norm": 0.13430329562453588,
"learning_rate": 4.659722974155767e-07,
"loss": 0.345,
"step": 864
},
{
"epoch": 2.622784810126582,
"grad_norm": 0.14431698089054304,
"learning_rate": 4.5854384165490596e-07,
"loss": 0.3616,
"step": 865
},
{
"epoch": 2.6258227848101265,
"grad_norm": 0.14455208613547382,
"learning_rate": 4.511722277812286e-07,
"loss": 0.3448,
"step": 866
},
{
"epoch": 2.628860759493671,
"grad_norm": 0.13429112067273102,
"learning_rate": 4.43857548059321e-07,
"loss": 0.3661,
"step": 867
},
{
"epoch": 2.631898734177215,
"grad_norm": 0.14159181014988267,
"learning_rate": 4.365998940413629e-07,
"loss": 0.3455,
"step": 868
},
{
"epoch": 2.6349367088607596,
"grad_norm": 0.136621586848235,
"learning_rate": 4.293993565657828e-07,
"loss": 0.3436,
"step": 869
},
{
"epoch": 2.6379746835443036,
"grad_norm": 0.14360285658878558,
"learning_rate": 4.222560257561276e-07,
"loss": 0.3507,
"step": 870
},
{
"epoch": 2.641012658227848,
"grad_norm": 0.14225730914449142,
"learning_rate": 4.151699910199336e-07,
"loss": 0.3528,
"step": 871
},
{
"epoch": 2.6440506329113926,
"grad_norm": 0.14838260397402092,
"learning_rate": 4.0814134104760483e-07,
"loss": 0.3755,
"step": 872
},
{
"epoch": 2.6470886075949367,
"grad_norm": 0.1598367499255771,
"learning_rate": 4.0117016381130636e-07,
"loss": 0.3891,
"step": 873
},
{
"epoch": 2.6501265822784807,
"grad_norm": 0.14446123574545194,
"learning_rate": 3.9425654656386094e-07,
"loss": 0.3716,
"step": 874
},
{
"epoch": 2.6531645569620252,
"grad_norm": 0.14517669624053445,
"learning_rate": 3.87400575837657e-07,
"loss": 0.3629,
"step": 875
},
{
"epoch": 2.6562025316455697,
"grad_norm": 0.13863663951583624,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.3628,
"step": 876
},
{
"epoch": 2.659240506329114,
"grad_norm": 0.14367297665910245,
"learning_rate": 3.7386191646987094e-07,
"loss": 0.3759,
"step": 877
},
{
"epoch": 2.6622784810126583,
"grad_norm": 0.13598047085266352,
"learning_rate": 3.671793972811954e-07,
"loss": 0.3703,
"step": 878
},
{
"epoch": 2.6653164556962023,
"grad_norm": 0.14898777719114867,
"learning_rate": 3.6055486351745327e-07,
"loss": 0.3452,
"step": 879
},
{
"epoch": 2.668354430379747,
"grad_norm": 0.13807909580315986,
"learning_rate": 3.539883980928005e-07,
"loss": 0.3687,
"step": 880
},
{
"epoch": 2.6713924050632913,
"grad_norm": 0.14897676417163686,
"learning_rate": 3.4748008319459457e-07,
"loss": 0.3458,
"step": 881
},
{
"epoch": 2.6744303797468354,
"grad_norm": 0.14839136336747957,
"learning_rate": 3.410300002823691e-07,
"loss": 0.3541,
"step": 882
},
{
"epoch": 2.67746835443038,
"grad_norm": 0.1376194355310099,
"learning_rate": 3.346382300868134e-07,
"loss": 0.333,
"step": 883
},
{
"epoch": 2.680506329113924,
"grad_norm": 0.14122555818361973,
"learning_rate": 3.2830485260876064e-07,
"loss": 0.3982,
"step": 884
},
{
"epoch": 2.6835443037974684,
"grad_norm": 0.14968124197902433,
"learning_rate": 3.220299471181898e-07,
"loss": 0.3537,
"step": 885
},
{
"epoch": 2.6865822784810125,
"grad_norm": 0.14116097579071474,
"learning_rate": 3.158135921532268e-07,
"loss": 0.3525,
"step": 886
},
{
"epoch": 2.689620253164557,
"grad_norm": 0.1399951176094029,
"learning_rate": 3.096558655191706e-07,
"loss": 0.3301,
"step": 887
},
{
"epoch": 2.692658227848101,
"grad_norm": 0.14786696698824425,
"learning_rate": 3.035568442875136e-07,
"loss": 0.3686,
"step": 888
},
{
"epoch": 2.6956962025316455,
"grad_norm": 0.13574997006805542,
"learning_rate": 2.9751660479497737e-07,
"loss": 0.3742,
"step": 889
},
{
"epoch": 2.69873417721519,
"grad_norm": 0.14162583087805647,
"learning_rate": 2.915352226425583e-07,
"loss": 0.3502,
"step": 890
},
{
"epoch": 2.701772151898734,
"grad_norm": 0.1404375267715342,
"learning_rate": 2.85612772694579e-07,
"loss": 0.3567,
"step": 891
},
{
"epoch": 2.7048101265822786,
"grad_norm": 0.1418915799031849,
"learning_rate": 2.7974932907775863e-07,
"loss": 0.349,
"step": 892
},
{
"epoch": 2.7078481012658226,
"grad_norm": 0.14563236272596766,
"learning_rate": 2.739449651802756e-07,
"loss": 0.3903,
"step": 893
},
{
"epoch": 2.710886075949367,
"grad_norm": 0.1416930939009952,
"learning_rate": 2.6819975365085237e-07,
"loss": 0.3742,
"step": 894
},
{
"epoch": 2.7139240506329116,
"grad_norm": 0.14797637210969025,
"learning_rate": 2.6251376639785163e-07,
"loss": 0.3779,
"step": 895
},
{
"epoch": 2.7169620253164557,
"grad_norm": 0.14863207599810754,
"learning_rate": 2.5688707458836724e-07,
"loss": 0.3827,
"step": 896
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.14561232207790262,
"learning_rate": 2.5131974864734063e-07,
"loss": 0.3531,
"step": 897
},
{
"epoch": 2.7230379746835442,
"grad_norm": 0.14766046853660647,
"learning_rate": 2.45811858256676e-07,
"loss": 0.3312,
"step": 898
},
{
"epoch": 2.7260759493670887,
"grad_norm": 0.14177051091887588,
"learning_rate": 2.403634723543674e-07,
"loss": 0.3362,
"step": 899
},
{
"epoch": 2.729113924050633,
"grad_norm": 0.14501857620086245,
"learning_rate": 2.3497465913364047e-07,
"loss": 0.354,
"step": 900
},
{
"epoch": 2.7321518987341773,
"grad_norm": 0.14566914555210908,
"learning_rate": 2.2964548604209214e-07,
"loss": 0.367,
"step": 901
},
{
"epoch": 2.7351898734177214,
"grad_norm": 0.14227637192680714,
"learning_rate": 2.2437601978085144e-07,
"loss": 0.3646,
"step": 902
},
{
"epoch": 2.738227848101266,
"grad_norm": 0.13705012394654958,
"learning_rate": 2.1916632630374579e-07,
"loss": 0.3791,
"step": 903
},
{
"epoch": 2.7412658227848103,
"grad_norm": 0.14332007524431387,
"learning_rate": 2.1401647081646825e-07,
"loss": 0.3759,
"step": 904
},
{
"epoch": 2.7443037974683544,
"grad_norm": 0.14995105596897376,
"learning_rate": 2.0892651777577045e-07,
"loss": 0.3587,
"step": 905
},
{
"epoch": 2.747341772151899,
"grad_norm": 0.1322865348126154,
"learning_rate": 2.0389653088865035e-07,
"loss": 0.3266,
"step": 906
},
{
"epoch": 2.750379746835443,
"grad_norm": 0.13702889779019603,
"learning_rate": 1.989265731115525e-07,
"loss": 0.3754,
"step": 907
},
{
"epoch": 2.7534177215189874,
"grad_norm": 0.13943316795835564,
"learning_rate": 1.940167066495896e-07,
"loss": 0.3391,
"step": 908
},
{
"epoch": 2.7564556962025315,
"grad_norm": 0.13964099607089664,
"learning_rate": 1.8916699295575324e-07,
"loss": 0.3519,
"step": 909
},
{
"epoch": 2.759493670886076,
"grad_norm": 0.13394426066716283,
"learning_rate": 1.8437749273015116e-07,
"loss": 0.3701,
"step": 910
},
{
"epoch": 2.76253164556962,
"grad_norm": 0.1542873317357114,
"learning_rate": 1.7964826591924722e-07,
"loss": 0.3672,
"step": 911
},
{
"epoch": 2.7655696202531646,
"grad_norm": 0.13601394591415503,
"learning_rate": 1.749793717151055e-07,
"loss": 0.3794,
"step": 912
},
{
"epoch": 2.768607594936709,
"grad_norm": 0.149333961320097,
"learning_rate": 1.7037086855465902e-07,
"loss": 0.3517,
"step": 913
},
{
"epoch": 2.771645569620253,
"grad_norm": 0.1439570881143369,
"learning_rate": 1.6582281411896827e-07,
"loss": 0.369,
"step": 914
},
{
"epoch": 2.7746835443037976,
"grad_norm": 0.1338348522243044,
"learning_rate": 1.6133526533250566e-07,
"loss": 0.3306,
"step": 915
},
{
"epoch": 2.7777215189873417,
"grad_norm": 0.14056655146374267,
"learning_rate": 1.5690827836244317e-07,
"loss": 0.3666,
"step": 916
},
{
"epoch": 2.780759493670886,
"grad_norm": 0.14694930084022878,
"learning_rate": 1.5254190861794415e-07,
"loss": 0.3685,
"step": 917
},
{
"epoch": 2.7837974683544306,
"grad_norm": 0.13942952495633854,
"learning_rate": 1.4823621074947503e-07,
"loss": 0.3679,
"step": 918
},
{
"epoch": 2.7868354430379747,
"grad_norm": 0.15093468034751636,
"learning_rate": 1.4399123864811904e-07,
"loss": 0.3791,
"step": 919
},
{
"epoch": 2.7898734177215188,
"grad_norm": 0.14746517550482235,
"learning_rate": 1.398070454449013e-07,
"loss": 0.3986,
"step": 920
},
{
"epoch": 2.7929113924050633,
"grad_norm": 0.13795572431759273,
"learning_rate": 1.3568368351012718e-07,
"loss": 0.3442,
"step": 921
},
{
"epoch": 2.7959493670886078,
"grad_norm": 0.14934161196335263,
"learning_rate": 1.3162120445272096e-07,
"loss": 0.3431,
"step": 922
},
{
"epoch": 2.798987341772152,
"grad_norm": 0.13830591712832402,
"learning_rate": 1.2761965911958385e-07,
"loss": 0.3657,
"step": 923
},
{
"epoch": 2.8020253164556963,
"grad_norm": 0.14185016126182032,
"learning_rate": 1.236790975949592e-07,
"loss": 0.3971,
"step": 924
},
{
"epoch": 2.8050632911392404,
"grad_norm": 0.14893734927509886,
"learning_rate": 1.1979956919979996e-07,
"loss": 0.3496,
"step": 925
},
{
"epoch": 2.808101265822785,
"grad_norm": 0.14717788380695743,
"learning_rate": 1.1598112249115723e-07,
"loss": 0.3702,
"step": 926
},
{
"epoch": 2.8111392405063294,
"grad_norm": 0.15069631565588076,
"learning_rate": 1.1222380526156929e-07,
"loss": 0.3706,
"step": 927
},
{
"epoch": 2.8141772151898734,
"grad_norm": 0.1333471123136446,
"learning_rate": 1.0852766453846308e-07,
"loss": 0.352,
"step": 928
},
{
"epoch": 2.8172151898734175,
"grad_norm": 0.15098682346178977,
"learning_rate": 1.0489274658356808e-07,
"loss": 0.3641,
"step": 929
},
{
"epoch": 2.820253164556962,
"grad_norm": 0.14501492746112243,
"learning_rate": 1.0131909689233444e-07,
"loss": 0.3647,
"step": 930
},
{
"epoch": 2.8232911392405065,
"grad_norm": 0.1425701216690938,
"learning_rate": 9.780676019336632e-08,
"loss": 0.3441,
"step": 931
},
{
"epoch": 2.8263291139240505,
"grad_norm": 0.13992220457173968,
"learning_rate": 9.435578044786009e-08,
"loss": 0.3446,
"step": 932
},
{
"epoch": 2.829367088607595,
"grad_norm": 0.13753537015830755,
"learning_rate": 9.096620084905472e-08,
"loss": 0.3222,
"step": 933
},
{
"epoch": 2.832405063291139,
"grad_norm": 0.1433818687736568,
"learning_rate": 8.763806382169005e-08,
"loss": 0.3723,
"step": 934
},
{
"epoch": 2.8354430379746836,
"grad_norm": 0.14635658707240395,
"learning_rate": 8.437141102147883e-08,
"loss": 0.3519,
"step": 935
},
{
"epoch": 2.838481012658228,
"grad_norm": 0.14341707235390178,
"learning_rate": 8.11662833345822e-08,
"loss": 0.3735,
"step": 936
},
{
"epoch": 2.841518987341772,
"grad_norm": 0.14428097001730178,
"learning_rate": 7.802272087709951e-08,
"loss": 0.3435,
"step": 937
},
{
"epoch": 2.8445569620253166,
"grad_norm": 0.13539837299939841,
"learning_rate": 7.494076299456531e-08,
"loss": 0.3514,
"step": 938
},
{
"epoch": 2.8475949367088607,
"grad_norm": 0.13340280244063915,
"learning_rate": 7.192044826145772e-08,
"loss": 0.3121,
"step": 939
},
{
"epoch": 2.850632911392405,
"grad_norm": 0.13775709740053435,
"learning_rate": 6.896181448071582e-08,
"loss": 0.3592,
"step": 940
},
{
"epoch": 2.853670886075949,
"grad_norm": 0.13370052932506574,
"learning_rate": 6.606489868326571e-08,
"loss": 0.3928,
"step": 941
},
{
"epoch": 2.8567088607594937,
"grad_norm": 0.13858250025209845,
"learning_rate": 6.322973712755698e-08,
"loss": 0.3548,
"step": 942
},
{
"epoch": 2.8597468354430378,
"grad_norm": 0.14100466864912514,
"learning_rate": 6.045636529911025e-08,
"loss": 0.3814,
"step": 943
},
{
"epoch": 2.8627848101265823,
"grad_norm": 0.13927977406099926,
"learning_rate": 5.7744817910069804e-08,
"loss": 0.3571,
"step": 944
},
{
"epoch": 2.8658227848101268,
"grad_norm": 0.14225333857138436,
"learning_rate": 5.509512889877333e-08,
"loss": 0.3688,
"step": 945
},
{
"epoch": 2.868860759493671,
"grad_norm": 0.14509348324717275,
"learning_rate": 5.250733142932562e-08,
"loss": 0.3615,
"step": 946
},
{
"epoch": 2.8718987341772153,
"grad_norm": 0.1441834903168049,
"learning_rate": 4.998145789118114e-08,
"loss": 0.3745,
"step": 947
},
{
"epoch": 2.8749367088607594,
"grad_norm": 0.1522154692938043,
"learning_rate": 4.751753989874153e-08,
"loss": 0.3734,
"step": 948
},
{
"epoch": 2.877974683544304,
"grad_norm": 0.14299093831394943,
"learning_rate": 4.511560829095818e-08,
"loss": 0.3886,
"step": 949
},
{
"epoch": 2.8810126582278484,
"grad_norm": 0.15119476391129122,
"learning_rate": 4.2775693130948094e-08,
"loss": 0.369,
"step": 950
},
{
"epoch": 2.8840506329113924,
"grad_norm": 0.14903127814300893,
"learning_rate": 4.0497823705615836e-08,
"loss": 0.3722,
"step": 951
},
{
"epoch": 2.8870886075949365,
"grad_norm": 0.15050927595878638,
"learning_rate": 3.828202852528717e-08,
"loss": 0.4036,
"step": 952
},
{
"epoch": 2.890126582278481,
"grad_norm": 0.1462541853521983,
"learning_rate": 3.6128335323353804e-08,
"loss": 0.357,
"step": 953
},
{
"epoch": 2.8931645569620255,
"grad_norm": 0.14359248091321075,
"learning_rate": 3.4036771055923066e-08,
"loss": 0.3395,
"step": 954
},
{
"epoch": 2.8962025316455695,
"grad_norm": 0.14417757378782878,
"learning_rate": 3.2007361901485455e-08,
"loss": 0.3599,
"step": 955
},
{
"epoch": 2.899240506329114,
"grad_norm": 0.14247102359863195,
"learning_rate": 3.004013326058153e-08,
"loss": 0.3442,
"step": 956
},
{
"epoch": 2.902278481012658,
"grad_norm": 0.13709282238228418,
"learning_rate": 2.8135109755487723e-08,
"loss": 0.3578,
"step": 957
},
{
"epoch": 2.9053164556962026,
"grad_norm": 0.14387482380283953,
"learning_rate": 2.629231522990716e-08,
"loss": 0.3592,
"step": 958
},
{
"epoch": 2.908354430379747,
"grad_norm": 0.14514780847876588,
"learning_rate": 2.4511772748669894e-08,
"loss": 0.3794,
"step": 959
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.14241431855080486,
"learning_rate": 2.2793504597447003e-08,
"loss": 0.3675,
"step": 960
},
{
"epoch": 2.9144303797468356,
"grad_norm": 0.13736916697690793,
"learning_rate": 2.1137532282469176e-08,
"loss": 0.3779,
"step": 961
},
{
"epoch": 2.9174683544303797,
"grad_norm": 0.1448670614839553,
"learning_rate": 1.954387653025802e-08,
"loss": 0.3528,
"step": 962
},
{
"epoch": 2.920506329113924,
"grad_norm": 0.13097811503034512,
"learning_rate": 1.8012557287367394e-08,
"loss": 0.3328,
"step": 963
},
{
"epoch": 2.923544303797468,
"grad_norm": 0.13561749309077153,
"learning_rate": 1.6543593720134142e-08,
"loss": 0.3386,
"step": 964
},
{
"epoch": 2.9265822784810127,
"grad_norm": 0.14165891512714415,
"learning_rate": 1.513700421443609e-08,
"loss": 0.3546,
"step": 965
},
{
"epoch": 2.9296202531645568,
"grad_norm": 0.1433081917389733,
"learning_rate": 1.379280637546443e-08,
"loss": 0.3745,
"step": 966
},
{
"epoch": 2.9326582278481013,
"grad_norm": 0.14293035691840986,
"learning_rate": 1.2511017027501682e-08,
"loss": 0.3385,
"step": 967
},
{
"epoch": 2.9356962025316458,
"grad_norm": 0.1403000487949085,
"learning_rate": 1.1291652213710758e-08,
"loss": 0.3567,
"step": 968
},
{
"epoch": 2.93873417721519,
"grad_norm": 0.14281092893489616,
"learning_rate": 1.0134727195937332e-08,
"loss": 0.3563,
"step": 969
},
{
"epoch": 2.9417721518987343,
"grad_norm": 0.13034908498073247,
"learning_rate": 9.04025645451445e-09,
"loss": 0.3518,
"step": 970
},
{
"epoch": 2.9448101265822784,
"grad_norm": 0.14582338155809607,
"learning_rate": 8.008253688084888e-09,
"loss": 0.3382,
"step": 971
},
{
"epoch": 2.947848101265823,
"grad_norm": 0.199628396153666,
"learning_rate": 7.038731813426292e-09,
"loss": 0.343,
"step": 972
},
{
"epoch": 2.9508860759493674,
"grad_norm": 0.14061669479319516,
"learning_rate": 6.1317029652929734e-09,
"loss": 0.3689,
"step": 973
},
{
"epoch": 2.9539240506329114,
"grad_norm": 0.14376239008080066,
"learning_rate": 5.2871784962627015e-09,
"loss": 0.3632,
"step": 974
},
{
"epoch": 2.9569620253164555,
"grad_norm": 0.13594297376826525,
"learning_rate": 4.505168976592922e-09,
"loss": 0.3553,
"step": 975
},
{
"epoch": 2.96,
"grad_norm": 0.14081835098065112,
"learning_rate": 3.785684194090866e-09,
"loss": 0.3531,
"step": 976
},
{
"epoch": 2.9630379746835445,
"grad_norm": 0.1424973642953699,
"learning_rate": 3.1287331539903155e-09,
"loss": 0.367,
"step": 977
},
{
"epoch": 2.9660759493670885,
"grad_norm": 0.1397337473486439,
"learning_rate": 2.534324078837802e-09,
"loss": 0.3834,
"step": 978
},
{
"epoch": 2.969113924050633,
"grad_norm": 0.14766955506816515,
"learning_rate": 2.002464408392135e-09,
"loss": 0.3392,
"step": 979
},
{
"epoch": 2.972151898734177,
"grad_norm": 0.13454940932320703,
"learning_rate": 1.5331607995267006e-09,
"loss": 0.3583,
"step": 980
},
{
"epoch": 2.9751898734177216,
"grad_norm": 0.13195240467317917,
"learning_rate": 1.1264191261528557e-09,
"loss": 0.3256,
"step": 981
},
{
"epoch": 2.978227848101266,
"grad_norm": 0.1388110702511361,
"learning_rate": 7.82244479139993e-10,
"loss": 0.3765,
"step": 982
},
{
"epoch": 2.98126582278481,
"grad_norm": 0.14580240061434538,
"learning_rate": 5.006411662555888e-10,
"loss": 0.368,
"step": 983
},
{
"epoch": 2.984303797468354,
"grad_norm": 0.14285610722978442,
"learning_rate": 2.816127121102463e-10,
"loss": 0.3497,
"step": 984
},
{
"epoch": 2.9873417721518987,
"grad_norm": 0.14237084088208307,
"learning_rate": 1.251618581127323e-10,
"loss": 0.3527,
"step": 985
},
{
"epoch": 2.990379746835443,
"grad_norm": 0.13991973912714595,
"learning_rate": 3.129056243833528e-11,
"loss": 0.3807,
"step": 986
},
{
"epoch": 2.993417721518987,
"grad_norm": 0.1445451641629666,
"learning_rate": 0.0,
"loss": 0.3581,
"step": 987
},
{
"epoch": 2.993417721518987,
"step": 987,
"total_flos": 2.6892487337023898e+17,
"train_loss": 0.5692941358930071,
"train_runtime": 10205.646,
"train_samples_per_second": 4.644,
"train_steps_per_second": 0.097
}
],
"logging_steps": 1,
"max_steps": 987,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.6892487337023898e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}