{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994008388256441,
"eval_steps": 500,
"global_step": 1251,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000798881565807869,
"grad_norm": 6.264526371887429,
"learning_rate": 7.936507936507937e-08,
"loss": 0.2446,
"step": 1
},
{
"epoch": 0.001597763131615738,
"grad_norm": 6.308694064244479,
"learning_rate": 1.5873015873015874e-07,
"loss": 0.2338,
"step": 2
},
{
"epoch": 0.002396644697423607,
"grad_norm": 5.832783152147895,
"learning_rate": 2.3809523809523811e-07,
"loss": 0.256,
"step": 3
},
{
"epoch": 0.003195526263231476,
"grad_norm": 5.799948682363988,
"learning_rate": 3.174603174603175e-07,
"loss": 0.2472,
"step": 4
},
{
"epoch": 0.003994407829039345,
"grad_norm": 5.976049807500368,
"learning_rate": 3.9682539682539683e-07,
"loss": 0.2477,
"step": 5
},
{
"epoch": 0.004793289394847214,
"grad_norm": 5.519855873058023,
"learning_rate": 4.7619047619047623e-07,
"loss": 0.2579,
"step": 6
},
{
"epoch": 0.005592170960655083,
"grad_norm": 4.19702291174845,
"learning_rate": 5.555555555555555e-07,
"loss": 0.2701,
"step": 7
},
{
"epoch": 0.006391052526462952,
"grad_norm": 3.5584756391837984,
"learning_rate": 6.34920634920635e-07,
"loss": 0.237,
"step": 8
},
{
"epoch": 0.007189934092270821,
"grad_norm": 3.2434211283881376,
"learning_rate": 7.142857142857143e-07,
"loss": 0.2415,
"step": 9
},
{
"epoch": 0.00798881565807869,
"grad_norm": 3.137678886876266,
"learning_rate": 7.936507936507937e-07,
"loss": 0.2278,
"step": 10
},
{
"epoch": 0.00878769722388656,
"grad_norm": 2.955592830949147,
"learning_rate": 8.73015873015873e-07,
"loss": 0.2283,
"step": 11
},
{
"epoch": 0.009586578789694428,
"grad_norm": 2.1650634302297247,
"learning_rate": 9.523809523809525e-07,
"loss": 0.2207,
"step": 12
},
{
"epoch": 0.010385460355502297,
"grad_norm": 2.150736904578608,
"learning_rate": 1.0317460317460317e-06,
"loss": 0.2141,
"step": 13
},
{
"epoch": 0.011184341921310166,
"grad_norm": 2.1243728083407585,
"learning_rate": 1.111111111111111e-06,
"loss": 0.2189,
"step": 14
},
{
"epoch": 0.011983223487118035,
"grad_norm": 2.1170944400671665,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.2051,
"step": 15
},
{
"epoch": 0.012782105052925903,
"grad_norm": 1.9460350133992896,
"learning_rate": 1.26984126984127e-06,
"loss": 0.2019,
"step": 16
},
{
"epoch": 0.013580986618733772,
"grad_norm": 2.25981348368823,
"learning_rate": 1.3492063492063493e-06,
"loss": 0.203,
"step": 17
},
{
"epoch": 0.014379868184541641,
"grad_norm": 2.0595030317939615,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.1913,
"step": 18
},
{
"epoch": 0.01517874975034951,
"grad_norm": 1.801627566947741,
"learning_rate": 1.507936507936508e-06,
"loss": 0.1794,
"step": 19
},
{
"epoch": 0.01597763131615738,
"grad_norm": 1.6285244682529882,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.1919,
"step": 20
},
{
"epoch": 0.016776512881965248,
"grad_norm": 1.3096573076000306,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.188,
"step": 21
},
{
"epoch": 0.01757539444777312,
"grad_norm": 1.3145326993964916,
"learning_rate": 1.746031746031746e-06,
"loss": 0.1798,
"step": 22
},
{
"epoch": 0.018374276013580985,
"grad_norm": 1.4510146563932291,
"learning_rate": 1.8253968253968254e-06,
"loss": 0.1679,
"step": 23
},
{
"epoch": 0.019173157579388856,
"grad_norm": 1.5676004476506082,
"learning_rate": 1.904761904761905e-06,
"loss": 0.1689,
"step": 24
},
{
"epoch": 0.019972039145196723,
"grad_norm": 1.3717967151760737,
"learning_rate": 1.984126984126984e-06,
"loss": 0.166,
"step": 25
},
{
"epoch": 0.020770920711004594,
"grad_norm": 1.2215402830411317,
"learning_rate": 2.0634920634920634e-06,
"loss": 0.1536,
"step": 26
},
{
"epoch": 0.02156980227681246,
"grad_norm": 1.0936849815044172,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.1468,
"step": 27
},
{
"epoch": 0.02236868384262033,
"grad_norm": 1.0938814113616944,
"learning_rate": 2.222222222222222e-06,
"loss": 0.1515,
"step": 28
},
{
"epoch": 0.023167565408428202,
"grad_norm": 1.062505185068681,
"learning_rate": 2.301587301587302e-06,
"loss": 0.1402,
"step": 29
},
{
"epoch": 0.02396644697423607,
"grad_norm": 0.9972504126476375,
"learning_rate": 2.380952380952381e-06,
"loss": 0.1444,
"step": 30
},
{
"epoch": 0.02476532854004394,
"grad_norm": 1.1079316413916769,
"learning_rate": 2.4603174603174605e-06,
"loss": 0.1551,
"step": 31
},
{
"epoch": 0.025564210105851807,
"grad_norm": 1.1116202589649768,
"learning_rate": 2.53968253968254e-06,
"loss": 0.1523,
"step": 32
},
{
"epoch": 0.026363091671659677,
"grad_norm": 1.0526520024318986,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.1416,
"step": 33
},
{
"epoch": 0.027161973237467545,
"grad_norm": 1.1059950145911168,
"learning_rate": 2.6984126984126986e-06,
"loss": 0.1345,
"step": 34
},
{
"epoch": 0.027960854803275415,
"grad_norm": 1.066947971257434,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.14,
"step": 35
},
{
"epoch": 0.028759736369083282,
"grad_norm": 1.1529644796723812,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.1318,
"step": 36
},
{
"epoch": 0.029558617934891153,
"grad_norm": 1.1073580581614721,
"learning_rate": 2.936507936507937e-06,
"loss": 0.1322,
"step": 37
},
{
"epoch": 0.03035749950069902,
"grad_norm": 1.1121577182014704,
"learning_rate": 3.015873015873016e-06,
"loss": 0.1414,
"step": 38
},
{
"epoch": 0.03115638106650689,
"grad_norm": 1.0598917305665783,
"learning_rate": 3.0952380952380957e-06,
"loss": 0.1329,
"step": 39
},
{
"epoch": 0.03195526263231476,
"grad_norm": 1.134894863039634,
"learning_rate": 3.1746031746031746e-06,
"loss": 0.1517,
"step": 40
},
{
"epoch": 0.03275414419812263,
"grad_norm": 1.001409942797884,
"learning_rate": 3.2539682539682544e-06,
"loss": 0.1392,
"step": 41
},
{
"epoch": 0.033553025763930495,
"grad_norm": 0.9891396742941516,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.14,
"step": 42
},
{
"epoch": 0.03435190732973837,
"grad_norm": 1.1266509544929115,
"learning_rate": 3.412698412698413e-06,
"loss": 0.1389,
"step": 43
},
{
"epoch": 0.03515078889554624,
"grad_norm": 0.9628290753555916,
"learning_rate": 3.492063492063492e-06,
"loss": 0.1385,
"step": 44
},
{
"epoch": 0.035949670461354104,
"grad_norm": 0.9720867605907642,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.1262,
"step": 45
},
{
"epoch": 0.03674855202716197,
"grad_norm": 1.0868944036474724,
"learning_rate": 3.6507936507936507e-06,
"loss": 0.1388,
"step": 46
},
{
"epoch": 0.037547433592969845,
"grad_norm": 1.3146063770283822,
"learning_rate": 3.7301587301587305e-06,
"loss": 0.1285,
"step": 47
},
{
"epoch": 0.03834631515877771,
"grad_norm": 1.1651760656127519,
"learning_rate": 3.80952380952381e-06,
"loss": 0.1382,
"step": 48
},
{
"epoch": 0.03914519672458558,
"grad_norm": 0.9773806832036162,
"learning_rate": 3.88888888888889e-06,
"loss": 0.1184,
"step": 49
},
{
"epoch": 0.039944078290393446,
"grad_norm": 0.9830874096474851,
"learning_rate": 3.968253968253968e-06,
"loss": 0.1234,
"step": 50
},
{
"epoch": 0.04074295985620132,
"grad_norm": 1.3541297874851739,
"learning_rate": 4.047619047619048e-06,
"loss": 0.1308,
"step": 51
},
{
"epoch": 0.04154184142200919,
"grad_norm": 0.970571662092641,
"learning_rate": 4.126984126984127e-06,
"loss": 0.1377,
"step": 52
},
{
"epoch": 0.042340722987817055,
"grad_norm": 1.2143547903735357,
"learning_rate": 4.206349206349207e-06,
"loss": 0.1237,
"step": 53
},
{
"epoch": 0.04313960455362492,
"grad_norm": 1.0098454532215537,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.1231,
"step": 54
},
{
"epoch": 0.043938486119432796,
"grad_norm": 1.1067360434608624,
"learning_rate": 4.365079365079366e-06,
"loss": 0.137,
"step": 55
},
{
"epoch": 0.04473736768524066,
"grad_norm": 1.0456018151170878,
"learning_rate": 4.444444444444444e-06,
"loss": 0.1328,
"step": 56
},
{
"epoch": 0.04553624925104853,
"grad_norm": 0.9886332631374318,
"learning_rate": 4.523809523809524e-06,
"loss": 0.104,
"step": 57
},
{
"epoch": 0.046335130816856404,
"grad_norm": 1.0663992036951566,
"learning_rate": 4.603174603174604e-06,
"loss": 0.1241,
"step": 58
},
{
"epoch": 0.04713401238266427,
"grad_norm": 1.2133014739756007,
"learning_rate": 4.682539682539683e-06,
"loss": 0.1246,
"step": 59
},
{
"epoch": 0.04793289394847214,
"grad_norm": 1.130294064670252,
"learning_rate": 4.761904761904762e-06,
"loss": 0.1217,
"step": 60
},
{
"epoch": 0.048731775514280005,
"grad_norm": 1.087512054416334,
"learning_rate": 4.841269841269842e-06,
"loss": 0.1352,
"step": 61
},
{
"epoch": 0.04953065708008788,
"grad_norm": 1.186584513621857,
"learning_rate": 4.920634920634921e-06,
"loss": 0.1266,
"step": 62
},
{
"epoch": 0.05032953864589575,
"grad_norm": 0.9656151090360315,
"learning_rate": 5e-06,
"loss": 0.1282,
"step": 63
},
{
"epoch": 0.051128420211703614,
"grad_norm": 1.0445397820159212,
"learning_rate": 4.9999912586879515e-06,
"loss": 0.1205,
"step": 64
},
{
"epoch": 0.05192730177751148,
"grad_norm": 1.0626412035822264,
"learning_rate": 4.999965034812934e-06,
"loss": 0.119,
"step": 65
},
{
"epoch": 0.052726183343319355,
"grad_norm": 1.0884763391021568,
"learning_rate": 4.999921328558333e-06,
"loss": 0.122,
"step": 66
},
{
"epoch": 0.05352506490912722,
"grad_norm": 1.1255928841636056,
"learning_rate": 4.999860140229788e-06,
"loss": 0.1317,
"step": 67
},
{
"epoch": 0.05432394647493509,
"grad_norm": 1.021100809804264,
"learning_rate": 4.9997814702551914e-06,
"loss": 0.1196,
"step": 68
},
{
"epoch": 0.05512282804074296,
"grad_norm": 0.9267329405198692,
"learning_rate": 4.999685319184688e-06,
"loss": 0.1328,
"step": 69
},
{
"epoch": 0.05592170960655083,
"grad_norm": 1.0294211721447675,
"learning_rate": 4.9995716876906654e-06,
"loss": 0.1161,
"step": 70
},
{
"epoch": 0.0567205911723587,
"grad_norm": 1.0626502091277237,
"learning_rate": 4.999440576567755e-06,
"loss": 0.1346,
"step": 71
},
{
"epoch": 0.057519472738166565,
"grad_norm": 0.9387601593971938,
"learning_rate": 4.999291986732823e-06,
"loss": 0.1253,
"step": 72
},
{
"epoch": 0.05831835430397444,
"grad_norm": 0.9664307515868064,
"learning_rate": 4.999125919224966e-06,
"loss": 0.118,
"step": 73
},
{
"epoch": 0.059117235869782306,
"grad_norm": 0.9320980438153565,
"learning_rate": 4.998942375205502e-06,
"loss": 0.1181,
"step": 74
},
{
"epoch": 0.05991611743559017,
"grad_norm": 1.087597810065219,
"learning_rate": 4.998741355957963e-06,
"loss": 0.1123,
"step": 75
},
{
"epoch": 0.06071499900139804,
"grad_norm": 0.9940393858545528,
"learning_rate": 4.998522862888088e-06,
"loss": 0.1274,
"step": 76
},
{
"epoch": 0.061513880567205914,
"grad_norm": 0.9573078672504775,
"learning_rate": 4.998286897523808e-06,
"loss": 0.1207,
"step": 77
},
{
"epoch": 0.06231276213301378,
"grad_norm": 1.0642550055625968,
"learning_rate": 4.998033461515242e-06,
"loss": 0.1286,
"step": 78
},
{
"epoch": 0.06311164369882165,
"grad_norm": 1.051177584232888,
"learning_rate": 4.99776255663468e-06,
"loss": 0.1243,
"step": 79
},
{
"epoch": 0.06391052526462952,
"grad_norm": 0.954997292990204,
"learning_rate": 4.997474184776573e-06,
"loss": 0.1128,
"step": 80
},
{
"epoch": 0.06470940683043738,
"grad_norm": 0.8950215033759747,
"learning_rate": 4.997168347957521e-06,
"loss": 0.1059,
"step": 81
},
{
"epoch": 0.06550828839624526,
"grad_norm": 1.0278639983929891,
"learning_rate": 4.996845048316253e-06,
"loss": 0.129,
"step": 82
},
{
"epoch": 0.06630716996205313,
"grad_norm": 0.9125269501555735,
"learning_rate": 4.996504288113624e-06,
"loss": 0.1213,
"step": 83
},
{
"epoch": 0.06710605152786099,
"grad_norm": 1.154599556719037,
"learning_rate": 4.996146069732583e-06,
"loss": 0.1153,
"step": 84
},
{
"epoch": 0.06790493309366886,
"grad_norm": 0.9583233046394478,
"learning_rate": 4.995770395678171e-06,
"loss": 0.1303,
"step": 85
},
{
"epoch": 0.06870381465947674,
"grad_norm": 0.956624411481031,
"learning_rate": 4.995377268577495e-06,
"loss": 0.1128,
"step": 86
},
{
"epoch": 0.0695026962252846,
"grad_norm": 0.9303535332486583,
"learning_rate": 4.994966691179712e-06,
"loss": 0.1078,
"step": 87
},
{
"epoch": 0.07030157779109247,
"grad_norm": 0.8445483716745527,
"learning_rate": 4.994538666356009e-06,
"loss": 0.1139,
"step": 88
},
{
"epoch": 0.07110045935690033,
"grad_norm": 1.1838181005074246,
"learning_rate": 4.994093197099587e-06,
"loss": 0.1266,
"step": 89
},
{
"epoch": 0.07189934092270821,
"grad_norm": 0.9212525379918269,
"learning_rate": 4.993630286525634e-06,
"loss": 0.1105,
"step": 90
},
{
"epoch": 0.07269822248851608,
"grad_norm": 0.9557168849403959,
"learning_rate": 4.993149937871306e-06,
"loss": 0.1114,
"step": 91
},
{
"epoch": 0.07349710405432394,
"grad_norm": 0.9335531131462851,
"learning_rate": 4.992652154495706e-06,
"loss": 0.1187,
"step": 92
},
{
"epoch": 0.07429598562013182,
"grad_norm": 1.0684490615988076,
"learning_rate": 4.992136939879857e-06,
"loss": 0.1207,
"step": 93
},
{
"epoch": 0.07509486718593969,
"grad_norm": 0.917221466030395,
"learning_rate": 4.9916042976266795e-06,
"loss": 0.1089,
"step": 94
},
{
"epoch": 0.07589374875174755,
"grad_norm": 0.9102037919118495,
"learning_rate": 4.991054231460969e-06,
"loss": 0.1173,
"step": 95
},
{
"epoch": 0.07669263031755542,
"grad_norm": 0.9957426271614526,
"learning_rate": 4.990486745229364e-06,
"loss": 0.1282,
"step": 96
},
{
"epoch": 0.0774915118833633,
"grad_norm": 1.0340400584386213,
"learning_rate": 4.989901842900326e-06,
"loss": 0.1224,
"step": 97
},
{
"epoch": 0.07829039344917116,
"grad_norm": 0.8694305718065952,
"learning_rate": 4.989299528564103e-06,
"loss": 0.1115,
"step": 98
},
{
"epoch": 0.07908927501497903,
"grad_norm": 1.0607942211887487,
"learning_rate": 4.988679806432712e-06,
"loss": 0.1098,
"step": 99
},
{
"epoch": 0.07988815658078689,
"grad_norm": 0.9482051164342659,
"learning_rate": 4.9880426808398986e-06,
"loss": 0.1089,
"step": 100
},
{
"epoch": 0.08068703814659477,
"grad_norm": 1.1060592404442136,
"learning_rate": 4.987388156241115e-06,
"loss": 0.1071,
"step": 101
},
{
"epoch": 0.08148591971240264,
"grad_norm": 1.0630018016859446,
"learning_rate": 4.986716237213484e-06,
"loss": 0.13,
"step": 102
},
{
"epoch": 0.0822848012782105,
"grad_norm": 0.980659330827137,
"learning_rate": 4.986026928455767e-06,
"loss": 0.1163,
"step": 103
},
{
"epoch": 0.08308368284401838,
"grad_norm": 1.0796295507919234,
"learning_rate": 4.985320234788337e-06,
"loss": 0.1234,
"step": 104
},
{
"epoch": 0.08388256440982625,
"grad_norm": 1.0235556044811625,
"learning_rate": 4.9845961611531356e-06,
"loss": 0.1082,
"step": 105
},
{
"epoch": 0.08468144597563411,
"grad_norm": 1.0228391219961424,
"learning_rate": 4.983854712613647e-06,
"loss": 0.1168,
"step": 106
},
{
"epoch": 0.08548032754144198,
"grad_norm": 0.9963769291850916,
"learning_rate": 4.983095894354858e-06,
"loss": 0.0995,
"step": 107
},
{
"epoch": 0.08627920910724984,
"grad_norm": 0.9258045063883333,
"learning_rate": 4.982319711683221e-06,
"loss": 0.1138,
"step": 108
},
{
"epoch": 0.08707809067305772,
"grad_norm": 0.9711841814401072,
"learning_rate": 4.981526170026621e-06,
"loss": 0.1168,
"step": 109
},
{
"epoch": 0.08787697223886559,
"grad_norm": 0.8855803559568926,
"learning_rate": 4.980715274934334e-06,
"loss": 0.1087,
"step": 110
},
{
"epoch": 0.08867585380467345,
"grad_norm": 0.8370807746302283,
"learning_rate": 4.9798870320769884e-06,
"loss": 0.1144,
"step": 111
},
{
"epoch": 0.08947473537048133,
"grad_norm": 0.9128597902530756,
"learning_rate": 4.97904144724653e-06,
"loss": 0.1148,
"step": 112
},
{
"epoch": 0.0902736169362892,
"grad_norm": 0.8855921084987907,
"learning_rate": 4.978178526356173e-06,
"loss": 0.115,
"step": 113
},
{
"epoch": 0.09107249850209706,
"grad_norm": 0.8484966494474456,
"learning_rate": 4.977298275440368e-06,
"loss": 0.1131,
"step": 114
},
{
"epoch": 0.09187138006790493,
"grad_norm": 0.9057924487387431,
"learning_rate": 4.976400700654752e-06,
"loss": 0.1192,
"step": 115
},
{
"epoch": 0.09267026163371281,
"grad_norm": 0.8833176940743735,
"learning_rate": 4.975485808276111e-06,
"loss": 0.1134,
"step": 116
},
{
"epoch": 0.09346914319952067,
"grad_norm": 0.906265232831976,
"learning_rate": 4.974553604702332e-06,
"loss": 0.1141,
"step": 117
},
{
"epoch": 0.09426802476532854,
"grad_norm": 1.167029695596976,
"learning_rate": 4.973604096452361e-06,
"loss": 0.1093,
"step": 118
},
{
"epoch": 0.0950669063311364,
"grad_norm": 0.7956711447411758,
"learning_rate": 4.972637290166158e-06,
"loss": 0.1039,
"step": 119
},
{
"epoch": 0.09586578789694428,
"grad_norm": 0.8384697255234442,
"learning_rate": 4.971653192604645e-06,
"loss": 0.1034,
"step": 120
},
{
"epoch": 0.09666466946275215,
"grad_norm": 0.8244072448799775,
"learning_rate": 4.970651810649666e-06,
"loss": 0.1154,
"step": 121
},
{
"epoch": 0.09746355102856001,
"grad_norm": 0.8661919813344635,
"learning_rate": 4.969633151303934e-06,
"loss": 0.1096,
"step": 122
},
{
"epoch": 0.09826243259436789,
"grad_norm": 0.8808417828216416,
"learning_rate": 4.968597221690986e-06,
"loss": 0.1151,
"step": 123
},
{
"epoch": 0.09906131416017576,
"grad_norm": 0.7526710590860509,
"learning_rate": 4.967544029055128e-06,
"loss": 0.1102,
"step": 124
},
{
"epoch": 0.09986019572598362,
"grad_norm": 0.9100920468208363,
"learning_rate": 4.9664735807613895e-06,
"loss": 0.1135,
"step": 125
},
{
"epoch": 0.1006590772917915,
"grad_norm": 0.8566800660889555,
"learning_rate": 4.965385884295467e-06,
"loss": 0.1124,
"step": 126
},
{
"epoch": 0.10145795885759937,
"grad_norm": 0.8854006584420304,
"learning_rate": 4.964280947263677e-06,
"loss": 0.0986,
"step": 127
},
{
"epoch": 0.10225684042340723,
"grad_norm": 0.969878610508023,
"learning_rate": 4.963158777392898e-06,
"loss": 0.0983,
"step": 128
},
{
"epoch": 0.1030557219892151,
"grad_norm": 0.9892660403306187,
"learning_rate": 4.962019382530521e-06,
"loss": 0.1108,
"step": 129
},
{
"epoch": 0.10385460355502296,
"grad_norm": 0.9753877213466595,
"learning_rate": 4.960862770644389e-06,
"loss": 0.1115,
"step": 130
},
{
"epoch": 0.10465348512083084,
"grad_norm": 0.9033107702224492,
"learning_rate": 4.959688949822748e-06,
"loss": 0.1117,
"step": 131
},
{
"epoch": 0.10545236668663871,
"grad_norm": 0.965532325966032,
"learning_rate": 4.9584979282741856e-06,
"loss": 0.1039,
"step": 132
},
{
"epoch": 0.10625124825244657,
"grad_norm": 0.9844322564004251,
"learning_rate": 4.957289714327572e-06,
"loss": 0.1093,
"step": 133
},
{
"epoch": 0.10705012981825444,
"grad_norm": 0.9619990158546806,
"learning_rate": 4.95606431643201e-06,
"loss": 0.116,
"step": 134
},
{
"epoch": 0.10784901138406232,
"grad_norm": 0.8325965340613319,
"learning_rate": 4.9548217431567665e-06,
"loss": 0.1161,
"step": 135
},
{
"epoch": 0.10864789294987018,
"grad_norm": 1.0194058532875092,
"learning_rate": 4.953562003191219e-06,
"loss": 0.1149,
"step": 136
},
{
"epoch": 0.10944677451567805,
"grad_norm": 0.9170383197008514,
"learning_rate": 4.952285105344792e-06,
"loss": 0.1059,
"step": 137
},
{
"epoch": 0.11024565608148593,
"grad_norm": 0.9189470478972314,
"learning_rate": 4.950991058546893e-06,
"loss": 0.1004,
"step": 138
},
{
"epoch": 0.11104453764729379,
"grad_norm": 1.0687382655451492,
"learning_rate": 4.949679871846857e-06,
"loss": 0.1145,
"step": 139
},
{
"epoch": 0.11184341921310166,
"grad_norm": 0.9134055983302238,
"learning_rate": 4.948351554413879e-06,
"loss": 0.1061,
"step": 140
},
{
"epoch": 0.11264230077890952,
"grad_norm": 1.0565717743207121,
"learning_rate": 4.947006115536947e-06,
"loss": 0.1066,
"step": 141
},
{
"epoch": 0.1134411823447174,
"grad_norm": 0.8523742921266169,
"learning_rate": 4.945643564624782e-06,
"loss": 0.1068,
"step": 142
},
{
"epoch": 0.11424006391052527,
"grad_norm": 0.9427600241720556,
"learning_rate": 4.944263911205772e-06,
"loss": 0.1088,
"step": 143
},
{
"epoch": 0.11503894547633313,
"grad_norm": 0.9823498202200215,
"learning_rate": 4.942867164927899e-06,
"loss": 0.1128,
"step": 144
},
{
"epoch": 0.115837827042141,
"grad_norm": 0.8481811655422338,
"learning_rate": 4.941453335558682e-06,
"loss": 0.1054,
"step": 145
},
{
"epoch": 0.11663670860794888,
"grad_norm": 0.9044089222762782,
"learning_rate": 4.940022432985096e-06,
"loss": 0.11,
"step": 146
},
{
"epoch": 0.11743559017375674,
"grad_norm": 0.860131950830847,
"learning_rate": 4.938574467213519e-06,
"loss": 0.1061,
"step": 147
},
{
"epoch": 0.11823447173956461,
"grad_norm": 0.9581110488146714,
"learning_rate": 4.937109448369643e-06,
"loss": 0.1127,
"step": 148
},
{
"epoch": 0.11903335330537247,
"grad_norm": 0.8720482965396903,
"learning_rate": 4.935627386698418e-06,
"loss": 0.1085,
"step": 149
},
{
"epoch": 0.11983223487118035,
"grad_norm": 0.8112483294643192,
"learning_rate": 4.934128292563978e-06,
"loss": 0.1041,
"step": 150
},
{
"epoch": 0.12063111643698822,
"grad_norm": 0.9203686211432968,
"learning_rate": 4.93261217644956e-06,
"loss": 0.1036,
"step": 151
},
{
"epoch": 0.12142999800279608,
"grad_norm": 0.7759012119797701,
"learning_rate": 4.93107904895744e-06,
"loss": 0.095,
"step": 152
},
{
"epoch": 0.12222887956860395,
"grad_norm": 0.9139977963522751,
"learning_rate": 4.9295289208088545e-06,
"loss": 0.1053,
"step": 153
},
{
"epoch": 0.12302776113441183,
"grad_norm": 0.8627349081243817,
"learning_rate": 4.927961802843927e-06,
"loss": 0.1117,
"step": 154
},
{
"epoch": 0.12382664270021969,
"grad_norm": 0.928872031641752,
"learning_rate": 4.92637770602159e-06,
"loss": 0.1184,
"step": 155
},
{
"epoch": 0.12462552426602756,
"grad_norm": 0.8292246195088976,
"learning_rate": 4.924776641419513e-06,
"loss": 0.1058,
"step": 156
},
{
"epoch": 0.12542440583183542,
"grad_norm": 0.9477438857662375,
"learning_rate": 4.92315862023402e-06,
"loss": 0.1034,
"step": 157
},
{
"epoch": 0.1262232873976433,
"grad_norm": 0.8702969195611917,
"learning_rate": 4.921523653780012e-06,
"loss": 0.1116,
"step": 158
},
{
"epoch": 0.12702216896345117,
"grad_norm": 0.9493498442525029,
"learning_rate": 4.919871753490892e-06,
"loss": 0.1102,
"step": 159
},
{
"epoch": 0.12782105052925904,
"grad_norm": 0.9116753131736711,
"learning_rate": 4.9182029309184785e-06,
"loss": 0.1132,
"step": 160
},
{
"epoch": 0.12861993209506692,
"grad_norm": 0.7582793065158208,
"learning_rate": 4.916517197732933e-06,
"loss": 0.1064,
"step": 161
},
{
"epoch": 0.12941881366087477,
"grad_norm": 0.9778914130367525,
"learning_rate": 4.914814565722671e-06,
"loss": 0.1118,
"step": 162
},
{
"epoch": 0.13021769522668264,
"grad_norm": 0.9419365180087802,
"learning_rate": 4.913095046794282e-06,
"loss": 0.1038,
"step": 163
},
{
"epoch": 0.1310165767924905,
"grad_norm": 0.9612330406862204,
"learning_rate": 4.911358652972448e-06,
"loss": 0.1071,
"step": 164
},
{
"epoch": 0.1318154583582984,
"grad_norm": 0.9231553642667918,
"learning_rate": 4.9096053963998555e-06,
"loss": 0.1054,
"step": 165
},
{
"epoch": 0.13261433992410626,
"grad_norm": 0.884065574169948,
"learning_rate": 4.907835289337116e-06,
"loss": 0.103,
"step": 166
},
{
"epoch": 0.1334132214899141,
"grad_norm": 0.9476846210224978,
"learning_rate": 4.906048344162677e-06,
"loss": 0.1099,
"step": 167
},
{
"epoch": 0.13421210305572198,
"grad_norm": 0.8958610299998433,
"learning_rate": 4.904244573372733e-06,
"loss": 0.1091,
"step": 168
},
{
"epoch": 0.13501098462152986,
"grad_norm": 0.9020407628170796,
"learning_rate": 4.902423989581143e-06,
"loss": 0.1116,
"step": 169
},
{
"epoch": 0.13580986618733773,
"grad_norm": 0.9238934392642306,
"learning_rate": 4.900586605519341e-06,
"loss": 0.0988,
"step": 170
},
{
"epoch": 0.1366087477531456,
"grad_norm": 0.9207175470533663,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.1109,
"step": 171
},
{
"epoch": 0.13740762931895348,
"grad_norm": 0.8920474499664821,
"learning_rate": 4.896861488098165e-06,
"loss": 0.1135,
"step": 172
},
{
"epoch": 0.13820651088476132,
"grad_norm": 0.9691154046483549,
"learning_rate": 4.894973780788722e-06,
"loss": 0.1128,
"step": 173
},
{
"epoch": 0.1390053924505692,
"grad_norm": 0.902524602492558,
"learning_rate": 4.893069325308747e-06,
"loss": 0.1223,
"step": 174
},
{
"epoch": 0.13980427401637707,
"grad_norm": 0.8576273271722297,
"learning_rate": 4.89114813497619e-06,
"loss": 0.1053,
"step": 175
},
{
"epoch": 0.14060315558218495,
"grad_norm": 0.9412918203296141,
"learning_rate": 4.889210223226032e-06,
"loss": 0.1088,
"step": 176
},
{
"epoch": 0.14140203714799282,
"grad_norm": 0.8254367045725899,
"learning_rate": 4.8872556036101845e-06,
"loss": 0.1136,
"step": 177
},
{
"epoch": 0.14220091871380067,
"grad_norm": 0.813317696779795,
"learning_rate": 4.885284289797402e-06,
"loss": 0.0968,
"step": 178
},
{
"epoch": 0.14299980027960854,
"grad_norm": 0.8893584230465316,
"learning_rate": 4.883296295573176e-06,
"loss": 0.101,
"step": 179
},
{
"epoch": 0.14379868184541642,
"grad_norm": 0.7566705356571157,
"learning_rate": 4.881291634839652e-06,
"loss": 0.0894,
"step": 180
},
{
"epoch": 0.1445975634112243,
"grad_norm": 0.8599708405924054,
"learning_rate": 4.8792703216155205e-06,
"loss": 0.1125,
"step": 181
},
{
"epoch": 0.14539644497703216,
"grad_norm": 0.8456967021924519,
"learning_rate": 4.877232370035926e-06,
"loss": 0.1049,
"step": 182
},
{
"epoch": 0.14619532654284004,
"grad_norm": 0.815478274809968,
"learning_rate": 4.875177794352364e-06,
"loss": 0.1073,
"step": 183
},
{
"epoch": 0.14699420810864788,
"grad_norm": 0.7878779155800018,
"learning_rate": 4.873106608932585e-06,
"loss": 0.0978,
"step": 184
},
{
"epoch": 0.14779308967445576,
"grad_norm": 0.8810812572826364,
"learning_rate": 4.871018828260492e-06,
"loss": 0.1102,
"step": 185
},
{
"epoch": 0.14859197124026363,
"grad_norm": 0.806104646763574,
"learning_rate": 4.868914466936038e-06,
"loss": 0.1222,
"step": 186
},
{
"epoch": 0.1493908528060715,
"grad_norm": 0.8151326052732307,
"learning_rate": 4.866793539675127e-06,
"loss": 0.1071,
"step": 187
},
{
"epoch": 0.15018973437187938,
"grad_norm": 0.8000214408032337,
"learning_rate": 4.864656061309507e-06,
"loss": 0.0997,
"step": 188
},
{
"epoch": 0.15098861593768723,
"grad_norm": 0.7896324428536934,
"learning_rate": 4.862502046786671e-06,
"loss": 0.1062,
"step": 189
},
{
"epoch": 0.1517874975034951,
"grad_norm": 0.7552560858678763,
"learning_rate": 4.860331511169752e-06,
"loss": 0.0976,
"step": 190
},
{
"epoch": 0.15258637906930297,
"grad_norm": 0.790809221885304,
"learning_rate": 4.858144469637409e-06,
"loss": 0.1133,
"step": 191
},
{
"epoch": 0.15338526063511085,
"grad_norm": 0.7624019298282761,
"learning_rate": 4.8559409374837356e-06,
"loss": 0.0887,
"step": 192
},
{
"epoch": 0.15418414220091872,
"grad_norm": 0.8522514916028995,
"learning_rate": 4.853720930118139e-06,
"loss": 0.1192,
"step": 193
},
{
"epoch": 0.1549830237667266,
"grad_norm": 0.8482297033111496,
"learning_rate": 4.851484463065243e-06,
"loss": 0.1128,
"step": 194
},
{
"epoch": 0.15578190533253444,
"grad_norm": 0.9895092259087344,
"learning_rate": 4.849231551964771e-06,
"loss": 0.1021,
"step": 195
},
{
"epoch": 0.15658078689834232,
"grad_norm": 0.8597378998737626,
"learning_rate": 4.846962212571443e-06,
"loss": 0.11,
"step": 196
},
{
"epoch": 0.1573796684641502,
"grad_norm": 0.8192511349717183,
"learning_rate": 4.844676460754862e-06,
"loss": 0.0992,
"step": 197
},
{
"epoch": 0.15817855002995806,
"grad_norm": 0.8928261617816722,
"learning_rate": 4.842374312499405e-06,
"loss": 0.1018,
"step": 198
},
{
"epoch": 0.15897743159576594,
"grad_norm": 0.7985841136149491,
"learning_rate": 4.840055783904106e-06,
"loss": 0.1022,
"step": 199
},
{
"epoch": 0.15977631316157379,
"grad_norm": 0.789467705600129,
"learning_rate": 4.837720891182553e-06,
"loss": 0.0946,
"step": 200
},
{
"epoch": 0.16057519472738166,
"grad_norm": 0.9446894087276032,
"learning_rate": 4.835369650662767e-06,
"loss": 0.1251,
"step": 201
},
{
"epoch": 0.16137407629318953,
"grad_norm": 0.7973312335808964,
"learning_rate": 4.833002078787089e-06,
"loss": 0.1061,
"step": 202
},
{
"epoch": 0.1621729578589974,
"grad_norm": 0.7600431803278112,
"learning_rate": 4.830618192112065e-06,
"loss": 0.0966,
"step": 203
},
{
"epoch": 0.16297183942480528,
"grad_norm": 0.7950707857967954,
"learning_rate": 4.828218007308335e-06,
"loss": 0.0946,
"step": 204
},
{
"epoch": 0.16377072099061313,
"grad_norm": 0.9725967990292367,
"learning_rate": 4.825801541160509e-06,
"loss": 0.1172,
"step": 205
},
{
"epoch": 0.164569602556421,
"grad_norm": 0.8521609257869547,
"learning_rate": 4.823368810567056e-06,
"loss": 0.1124,
"step": 206
},
{
"epoch": 0.16536848412222888,
"grad_norm": 0.8352032191193668,
"learning_rate": 4.8209198325401815e-06,
"loss": 0.1014,
"step": 207
},
{
"epoch": 0.16616736568803675,
"grad_norm": 0.817037921684215,
"learning_rate": 4.818454624205711e-06,
"loss": 0.0925,
"step": 208
},
{
"epoch": 0.16696624725384462,
"grad_norm": 0.8714751075848507,
"learning_rate": 4.815973202802966e-06,
"loss": 0.1069,
"step": 209
},
{
"epoch": 0.1677651288196525,
"grad_norm": 0.7980826304152385,
"learning_rate": 4.813475585684653e-06,
"loss": 0.1006,
"step": 210
},
{
"epoch": 0.16856401038546034,
"grad_norm": 0.8251849516010518,
"learning_rate": 4.810961790316731e-06,
"loss": 0.106,
"step": 211
},
{
"epoch": 0.16936289195126822,
"grad_norm": 0.8599512267451508,
"learning_rate": 4.808431834278294e-06,
"loss": 0.0983,
"step": 212
},
{
"epoch": 0.1701617735170761,
"grad_norm": 0.8871414735762658,
"learning_rate": 4.805885735261454e-06,
"loss": 0.1038,
"step": 213
},
{
"epoch": 0.17096065508288397,
"grad_norm": 0.8473694336902333,
"learning_rate": 4.8033235110712055e-06,
"loss": 0.0985,
"step": 214
},
{
"epoch": 0.17175953664869184,
"grad_norm": 0.7784578078223237,
"learning_rate": 4.800745179625308e-06,
"loss": 0.0954,
"step": 215
},
{
"epoch": 0.1725584182144997,
"grad_norm": 0.8439903426690519,
"learning_rate": 4.798150758954164e-06,
"loss": 0.0946,
"step": 216
},
{
"epoch": 0.17335729978030756,
"grad_norm": 0.7683932085699543,
"learning_rate": 4.7955402672006855e-06,
"loss": 0.0991,
"step": 217
},
{
"epoch": 0.17415618134611544,
"grad_norm": 0.8210534029782739,
"learning_rate": 4.79291372262017e-06,
"loss": 0.1169,
"step": 218
},
{
"epoch": 0.1749550629119233,
"grad_norm": 0.8469001008689524,
"learning_rate": 4.790271143580174e-06,
"loss": 0.1043,
"step": 219
},
{
"epoch": 0.17575394447773118,
"grad_norm": 0.8221750551755307,
"learning_rate": 4.787612548560385e-06,
"loss": 0.111,
"step": 220
},
{
"epoch": 0.17655282604353906,
"grad_norm": 0.7931373818167854,
"learning_rate": 4.78493795615249e-06,
"loss": 0.1102,
"step": 221
},
{
"epoch": 0.1773517076093469,
"grad_norm": 0.7197354675625041,
"learning_rate": 4.7822473850600444e-06,
"loss": 0.0976,
"step": 222
},
{
"epoch": 0.17815058917515478,
"grad_norm": 0.7872396703665671,
"learning_rate": 4.779540854098348e-06,
"loss": 0.1009,
"step": 223
},
{
"epoch": 0.17894947074096265,
"grad_norm": 0.8668934472760813,
"learning_rate": 4.776818382194305e-06,
"loss": 0.0999,
"step": 224
},
{
"epoch": 0.17974835230677053,
"grad_norm": 0.8481767317765282,
"learning_rate": 4.7740799883862966e-06,
"loss": 0.1037,
"step": 225
},
{
"epoch": 0.1805472338725784,
"grad_norm": 0.8178437972168623,
"learning_rate": 4.771325691824046e-06,
"loss": 0.0978,
"step": 226
},
{
"epoch": 0.18134611543838625,
"grad_norm": 0.8164863929375823,
"learning_rate": 4.768555511768486e-06,
"loss": 0.1022,
"step": 227
},
{
"epoch": 0.18214499700419412,
"grad_norm": 0.8689175267561307,
"learning_rate": 4.765769467591626e-06,
"loss": 0.0982,
"step": 228
},
{
"epoch": 0.182943878570002,
"grad_norm": 0.8402988320046502,
"learning_rate": 4.762967578776406e-06,
"loss": 0.0881,
"step": 229
},
{
"epoch": 0.18374276013580987,
"grad_norm": 0.7116078445916488,
"learning_rate": 4.760149864916579e-06,
"loss": 0.0948,
"step": 230
},
{
"epoch": 0.18454164170161774,
"grad_norm": 0.814050015517037,
"learning_rate": 4.757316345716554e-06,
"loss": 0.0937,
"step": 231
},
{
"epoch": 0.18534052326742562,
"grad_norm": 0.7662089706655602,
"learning_rate": 4.754467040991272e-06,
"loss": 0.1019,
"step": 232
},
{
"epoch": 0.18613940483323346,
"grad_norm": 0.8826194920747791,
"learning_rate": 4.751601970666064e-06,
"loss": 0.1083,
"step": 233
},
{
"epoch": 0.18693828639904134,
"grad_norm": 0.7980561660257126,
"learning_rate": 4.748721154776508e-06,
"loss": 0.1124,
"step": 234
},
{
"epoch": 0.1877371679648492,
"grad_norm": 0.7882742889167881,
"learning_rate": 4.745824613468293e-06,
"loss": 0.088,
"step": 235
},
{
"epoch": 0.18853604953065708,
"grad_norm": 0.7820268690800588,
"learning_rate": 4.742912366997076e-06,
"loss": 0.0871,
"step": 236
},
{
"epoch": 0.18933493109646496,
"grad_norm": 0.8615828011540341,
"learning_rate": 4.73998443572834e-06,
"loss": 0.1042,
"step": 237
},
{
"epoch": 0.1901338126622728,
"grad_norm": 0.7435755225972785,
"learning_rate": 4.737040840137255e-06,
"loss": 0.0957,
"step": 238
},
{
"epoch": 0.19093269422808068,
"grad_norm": 0.7918088118130509,
"learning_rate": 4.734081600808531e-06,
"loss": 0.1002,
"step": 239
},
{
"epoch": 0.19173157579388855,
"grad_norm": 0.8364127760580201,
"learning_rate": 4.731106738436275e-06,
"loss": 0.0896,
"step": 240
},
{
"epoch": 0.19253045735969643,
"grad_norm": 0.7869544838131202,
"learning_rate": 4.728116273823848e-06,
"loss": 0.1011,
"step": 241
},
{
"epoch": 0.1933293389255043,
"grad_norm": 0.780676449932054,
"learning_rate": 4.725110227883716e-06,
"loss": 0.0897,
"step": 242
},
{
"epoch": 0.19412822049131218,
"grad_norm": 0.8631262185605082,
"learning_rate": 4.7220886216373095e-06,
"loss": 0.1003,
"step": 243
},
{
"epoch": 0.19492710205712002,
"grad_norm": 0.8807375012349913,
"learning_rate": 4.7190514762148685e-06,
"loss": 0.0998,
"step": 244
},
{
"epoch": 0.1957259836229279,
"grad_norm": 0.7564793946843111,
"learning_rate": 4.715998812855305e-06,
"loss": 0.1009,
"step": 245
},
{
"epoch": 0.19652486518873577,
"grad_norm": 0.7851241620875284,
"learning_rate": 4.7129306529060415e-06,
"loss": 0.0934,
"step": 246
},
{
"epoch": 0.19732374675454364,
"grad_norm": 0.83739262621917,
"learning_rate": 4.709847017822876e-06,
"loss": 0.0895,
"step": 247
},
{
"epoch": 0.19812262832035152,
"grad_norm": 0.7640947304805092,
"learning_rate": 4.706747929169821e-06,
"loss": 0.1096,
"step": 248
},
{
"epoch": 0.19892150988615936,
"grad_norm": 0.7811995113015094,
"learning_rate": 4.703633408618955e-06,
"loss": 0.1063,
"step": 249
},
{
"epoch": 0.19972039145196724,
"grad_norm": 0.8544109266568962,
"learning_rate": 4.700503477950278e-06,
"loss": 0.1071,
"step": 250
},
{
"epoch": 0.2005192730177751,
"grad_norm": 0.8431749832390882,
"learning_rate": 4.697358159051549e-06,
"loss": 0.1047,
"step": 251
},
{
"epoch": 0.201318154583583,
"grad_norm": 0.8570451142462542,
"learning_rate": 4.694197473918139e-06,
"loss": 0.1088,
"step": 252
},
{
"epoch": 0.20211703614939086,
"grad_norm": 0.7877487371067441,
"learning_rate": 4.691021444652877e-06,
"loss": 0.0954,
"step": 253
},
{
"epoch": 0.20291591771519873,
"grad_norm": 0.9045557407382668,
"learning_rate": 4.687830093465893e-06,
"loss": 0.1027,
"step": 254
},
{
"epoch": 0.20371479928100658,
"grad_norm": 0.7246585315986585,
"learning_rate": 4.684623442674463e-06,
"loss": 0.0973,
"step": 255
},
{
"epoch": 0.20451368084681446,
"grad_norm": 0.818430244234053,
"learning_rate": 4.681401514702856e-06,
"loss": 0.1059,
"step": 256
},
{
"epoch": 0.20531256241262233,
"grad_norm": 0.7497021560526902,
"learning_rate": 4.678164332082175e-06,
"loss": 0.0938,
"step": 257
},
{
"epoch": 0.2061114439784302,
"grad_norm": 0.8957393415270142,
"learning_rate": 4.674911917450198e-06,
"loss": 0.1061,
"step": 258
},
{
"epoch": 0.20691032554423808,
"grad_norm": 0.7645317265386067,
"learning_rate": 4.671644293551222e-06,
"loss": 0.0903,
"step": 259
},
{
"epoch": 0.20770920711004592,
"grad_norm": 0.7682244794090962,
"learning_rate": 4.668361483235903e-06,
"loss": 0.085,
"step": 260
},
{
"epoch": 0.2085080886758538,
"grad_norm": 0.854685496484154,
"learning_rate": 4.665063509461098e-06,
"loss": 0.0994,
"step": 261
},
{
"epoch": 0.20930697024166167,
"grad_norm": 0.8815096120890409,
"learning_rate": 4.661750395289698e-06,
"loss": 0.1058,
"step": 262
},
{
"epoch": 0.21010585180746955,
"grad_norm": 0.794089561085315,
"learning_rate": 4.6584221638904775e-06,
"loss": 0.1016,
"step": 263
},
{
"epoch": 0.21090473337327742,
"grad_norm": 0.8758797896184406,
"learning_rate": 4.655078838537924e-06,
"loss": 0.0972,
"step": 264
},
{
"epoch": 0.2117036149390853,
"grad_norm": 0.7898730124221975,
"learning_rate": 4.651720442612076e-06,
"loss": 0.0918,
"step": 265
},
{
"epoch": 0.21250249650489314,
"grad_norm": 0.7317067873877707,
"learning_rate": 4.648346999598364e-06,
"loss": 0.0913,
"step": 266
},
{
"epoch": 0.21330137807070101,
"grad_norm": 0.7843972077721959,
"learning_rate": 4.644958533087443e-06,
"loss": 0.0951,
"step": 267
},
{
"epoch": 0.2141002596365089,
"grad_norm": 0.792184243401163,
"learning_rate": 4.641555066775027e-06,
"loss": 0.0972,
"step": 268
},
{
"epoch": 0.21489914120231676,
"grad_norm": 0.7582413630818394,
"learning_rate": 4.638136624461723e-06,
"loss": 0.0932,
"step": 269
},
{
"epoch": 0.21569802276812464,
"grad_norm": 0.7816406773037096,
"learning_rate": 4.634703230052871e-06,
"loss": 0.1042,
"step": 270
},
{
"epoch": 0.21649690433393248,
"grad_norm": 0.7596966546637388,
"learning_rate": 4.631254907558366e-06,
"loss": 0.0971,
"step": 271
},
{
"epoch": 0.21729578589974036,
"grad_norm": 0.783304310420589,
"learning_rate": 4.627791681092499e-06,
"loss": 0.095,
"step": 272
},
{
"epoch": 0.21809466746554823,
"grad_norm": 0.8086579883341541,
"learning_rate": 4.624313574873787e-06,
"loss": 0.1129,
"step": 273
},
{
"epoch": 0.2188935490313561,
"grad_norm": 0.6962761290814109,
"learning_rate": 4.620820613224796e-06,
"loss": 0.089,
"step": 274
},
{
"epoch": 0.21969243059716398,
"grad_norm": 0.7288448006122578,
"learning_rate": 4.617312820571981e-06,
"loss": 0.0882,
"step": 275
},
{
"epoch": 0.22049131216297185,
"grad_norm": 0.8048648789646363,
"learning_rate": 4.613790221445511e-06,
"loss": 0.1025,
"step": 276
},
{
"epoch": 0.2212901937287797,
"grad_norm": 0.7929094130785004,
"learning_rate": 4.610252840479097e-06,
"loss": 0.0989,
"step": 277
},
{
"epoch": 0.22208907529458757,
"grad_norm": 0.8168348398218105,
"learning_rate": 4.606700702409818e-06,
"loss": 0.1048,
"step": 278
},
{
"epoch": 0.22288795686039545,
"grad_norm": 0.9063263874082697,
"learning_rate": 4.603133832077953e-06,
"loss": 0.1019,
"step": 279
},
{
"epoch": 0.22368683842620332,
"grad_norm": 0.8822514604154854,
"learning_rate": 4.599552254426804e-06,
"loss": 0.0869,
"step": 280
},
{
"epoch": 0.2244857199920112,
"grad_norm": 0.6911719229844316,
"learning_rate": 4.595955994502519e-06,
"loss": 0.0943,
"step": 281
},
{
"epoch": 0.22528460155781904,
"grad_norm": 0.9183376642968846,
"learning_rate": 4.592345077453925e-06,
"loss": 0.0993,
"step": 282
},
{
"epoch": 0.22608348312362692,
"grad_norm": 0.7954093378579109,
"learning_rate": 4.588719528532342e-06,
"loss": 0.0989,
"step": 283
},
{
"epoch": 0.2268823646894348,
"grad_norm": 0.8198658669695411,
"learning_rate": 4.5850793730914135e-06,
"loss": 0.0909,
"step": 284
},
{
"epoch": 0.22768124625524266,
"grad_norm": 0.7592112175879184,
"learning_rate": 4.5814246365869285e-06,
"loss": 0.1019,
"step": 285
},
{
"epoch": 0.22848012782105054,
"grad_norm": 0.8383090068025297,
"learning_rate": 4.577755344576641e-06,
"loss": 0.1096,
"step": 286
},
{
"epoch": 0.22927900938685838,
"grad_norm": 0.7719532995528691,
"learning_rate": 4.5740715227200904e-06,
"loss": 0.0919,
"step": 287
},
{
"epoch": 0.23007789095266626,
"grad_norm": 0.6949182347348394,
"learning_rate": 4.570373196778427e-06,
"loss": 0.0833,
"step": 288
},
{
"epoch": 0.23087677251847413,
"grad_norm": 0.8542806997632925,
"learning_rate": 4.566660392614229e-06,
"loss": 0.0942,
"step": 289
},
{
"epoch": 0.231675654084282,
"grad_norm": 0.7942245149648606,
"learning_rate": 4.562933136191317e-06,
"loss": 0.1038,
"step": 290
},
{
"epoch": 0.23247453565008988,
"grad_norm": 0.741118108734099,
"learning_rate": 4.559191453574582e-06,
"loss": 0.0996,
"step": 291
},
{
"epoch": 0.23327341721589775,
"grad_norm": 0.7401819189973926,
"learning_rate": 4.555435370929797e-06,
"loss": 0.0881,
"step": 292
},
{
"epoch": 0.2340722987817056,
"grad_norm": 0.7225340215993067,
"learning_rate": 4.551664914523433e-06,
"loss": 0.0889,
"step": 293
},
{
"epoch": 0.23487118034751348,
"grad_norm": 0.7771401029247064,
"learning_rate": 4.54788011072248e-06,
"loss": 0.1059,
"step": 294
},
{
"epoch": 0.23567006191332135,
"grad_norm": 0.7629163339227021,
"learning_rate": 4.5440809859942585e-06,
"loss": 0.103,
"step": 295
},
{
"epoch": 0.23646894347912922,
"grad_norm": 0.7448031192353857,
"learning_rate": 4.5402675669062345e-06,
"loss": 0.0995,
"step": 296
},
{
"epoch": 0.2372678250449371,
"grad_norm": 0.7283315899507647,
"learning_rate": 4.53643988012584e-06,
"loss": 0.1025,
"step": 297
},
{
"epoch": 0.23806670661074494,
"grad_norm": 0.9027319022911194,
"learning_rate": 4.532597952420276e-06,
"loss": 0.1066,
"step": 298
},
{
"epoch": 0.23886558817655282,
"grad_norm": 0.7785808633007979,
"learning_rate": 4.5287418106563355e-06,
"loss": 0.0973,
"step": 299
},
{
"epoch": 0.2396644697423607,
"grad_norm": 0.9390527397483459,
"learning_rate": 4.52487148180021e-06,
"loss": 0.0863,
"step": 300
},
{
"epoch": 0.24046335130816857,
"grad_norm": 0.865775604026073,
"learning_rate": 4.5209869929172975e-06,
"loss": 0.1084,
"step": 301
},
{
"epoch": 0.24126223287397644,
"grad_norm": 0.8403236400106889,
"learning_rate": 4.5170883711720245e-06,
"loss": 0.0904,
"step": 302
},
{
"epoch": 0.24206111443978431,
"grad_norm": 0.8064673147923992,
"learning_rate": 4.513175643827647e-06,
"loss": 0.0941,
"step": 303
},
{
"epoch": 0.24285999600559216,
"grad_norm": 0.8013189960100633,
"learning_rate": 4.50924883824606e-06,
"loss": 0.0992,
"step": 304
},
{
"epoch": 0.24365887757140003,
"grad_norm": 0.7728593022833018,
"learning_rate": 4.50530798188761e-06,
"loss": 0.0893,
"step": 305
},
{
"epoch": 0.2444577591372079,
"grad_norm": 0.9052442733312893,
"learning_rate": 4.501353102310901e-06,
"loss": 0.0928,
"step": 306
},
{
"epoch": 0.24525664070301578,
"grad_norm": 0.6668531899886609,
"learning_rate": 4.497384227172603e-06,
"loss": 0.0877,
"step": 307
},
{
"epoch": 0.24605552226882366,
"grad_norm": 0.871779640726829,
"learning_rate": 4.493401384227257e-06,
"loss": 0.0968,
"step": 308
},
{
"epoch": 0.2468544038346315,
"grad_norm": 0.8664305902715963,
"learning_rate": 4.489404601327081e-06,
"loss": 0.0941,
"step": 309
},
{
"epoch": 0.24765328540043938,
"grad_norm": 0.753124311591743,
"learning_rate": 4.485393906421776e-06,
"loss": 0.0867,
"step": 310
},
{
"epoch": 0.24845216696624725,
"grad_norm": 0.8414426164462598,
"learning_rate": 4.48136932755833e-06,
"loss": 0.0917,
"step": 311
},
{
"epoch": 0.24925104853205513,
"grad_norm": 0.8167935257894663,
"learning_rate": 4.477330892880824e-06,
"loss": 0.0899,
"step": 312
},
{
"epoch": 0.25004993009786297,
"grad_norm": 0.8240834348323666,
"learning_rate": 4.47327863063023e-06,
"loss": 0.0997,
"step": 313
},
{
"epoch": 0.25084881166367085,
"grad_norm": 0.8148444956345996,
"learning_rate": 4.469212569144222e-06,
"loss": 0.0929,
"step": 314
},
{
"epoch": 0.2516476932294787,
"grad_norm": 0.812151270928214,
"learning_rate": 4.4651327368569695e-06,
"loss": 0.1031,
"step": 315
},
{
"epoch": 0.2524465747952866,
"grad_norm": 0.7918672951252329,
"learning_rate": 4.46103916229894e-06,
"loss": 0.0916,
"step": 316
},
{
"epoch": 0.25324545636109447,
"grad_norm": 0.7824718515480527,
"learning_rate": 4.456931874096705e-06,
"loss": 0.0865,
"step": 317
},
{
"epoch": 0.25404433792690234,
"grad_norm": 0.7520103031931814,
"learning_rate": 4.452810900972734e-06,
"loss": 0.093,
"step": 318
},
{
"epoch": 0.2548432194927102,
"grad_norm": 0.8388616884327461,
"learning_rate": 4.448676271745198e-06,
"loss": 0.0865,
"step": 319
},
{
"epoch": 0.2556421010585181,
"grad_norm": 0.7264911003280431,
"learning_rate": 4.444528015327763e-06,
"loss": 0.0894,
"step": 320
},
{
"epoch": 0.25644098262432596,
"grad_norm": 0.8182746828018328,
"learning_rate": 4.440366160729393e-06,
"loss": 0.1001,
"step": 321
},
{
"epoch": 0.25723986419013384,
"grad_norm": 0.8487906160638949,
"learning_rate": 4.436190737054142e-06,
"loss": 0.1168,
"step": 322
},
{
"epoch": 0.25803874575594166,
"grad_norm": 0.9326635695818013,
"learning_rate": 4.432001773500958e-06,
"loss": 0.1056,
"step": 323
},
{
"epoch": 0.25883762732174953,
"grad_norm": 0.7153701333351724,
"learning_rate": 4.42779929936347e-06,
"loss": 0.0954,
"step": 324
},
{
"epoch": 0.2596365088875574,
"grad_norm": 0.8066963737724318,
"learning_rate": 4.423583344029786e-06,
"loss": 0.0931,
"step": 325
},
{
"epoch": 0.2604353904533653,
"grad_norm": 0.7908989414413082,
"learning_rate": 4.419353936982293e-06,
"loss": 0.0982,
"step": 326
},
{
"epoch": 0.26123427201917315,
"grad_norm": 0.846279083139174,
"learning_rate": 4.415111107797445e-06,
"loss": 0.1006,
"step": 327
},
{
"epoch": 0.262033153584981,
"grad_norm": 1.0343812566729962,
"learning_rate": 4.410854886145556e-06,
"loss": 0.105,
"step": 328
},
{
"epoch": 0.2628320351507889,
"grad_norm": 0.7771271488054676,
"learning_rate": 4.406585301790595e-06,
"loss": 0.0929,
"step": 329
},
{
"epoch": 0.2636309167165968,
"grad_norm": 0.9484884466652146,
"learning_rate": 4.402302384589979e-06,
"loss": 0.096,
"step": 330
},
{
"epoch": 0.26442979828240465,
"grad_norm": 0.9666766702925966,
"learning_rate": 4.398006164494358e-06,
"loss": 0.1055,
"step": 331
},
{
"epoch": 0.2652286798482125,
"grad_norm": 0.7057771202211578,
"learning_rate": 4.393696671547415e-06,
"loss": 0.0904,
"step": 332
},
{
"epoch": 0.2660275614140204,
"grad_norm": 0.9400899783662074,
"learning_rate": 4.3893739358856465e-06,
"loss": 0.0878,
"step": 333
},
{
"epoch": 0.2668264429798282,
"grad_norm": 0.9030024491318343,
"learning_rate": 4.385037987738158e-06,
"loss": 0.1005,
"step": 334
},
{
"epoch": 0.2676253245456361,
"grad_norm": 0.762520292407699,
"learning_rate": 4.38068885742645e-06,
"loss": 0.0922,
"step": 335
},
{
"epoch": 0.26842420611144396,
"grad_norm": 1.010542949334313,
"learning_rate": 4.376326575364206e-06,
"loss": 0.1021,
"step": 336
},
{
"epoch": 0.26922308767725184,
"grad_norm": 0.8081986385372553,
"learning_rate": 4.371951172057082e-06,
"loss": 0.1031,
"step": 337
},
{
"epoch": 0.2700219692430597,
"grad_norm": 0.8022669170988722,
"learning_rate": 4.367562678102491e-06,
"loss": 0.1012,
"step": 338
},
{
"epoch": 0.2708208508088676,
"grad_norm": 0.9241574900249333,
"learning_rate": 4.363161124189387e-06,
"loss": 0.0907,
"step": 339
},
{
"epoch": 0.27161973237467546,
"grad_norm": 0.7707255024516305,
"learning_rate": 4.358746541098057e-06,
"loss": 0.095,
"step": 340
},
{
"epoch": 0.27241861394048333,
"grad_norm": 0.9222172849209264,
"learning_rate": 4.354318959699899e-06,
"loss": 0.0878,
"step": 341
},
{
"epoch": 0.2732174955062912,
"grad_norm": 0.7886919216124627,
"learning_rate": 4.34987841095721e-06,
"loss": 0.0894,
"step": 342
},
{
"epoch": 0.2740163770720991,
"grad_norm": 0.727790265160795,
"learning_rate": 4.3454249259229665e-06,
"loss": 0.0921,
"step": 343
},
{
"epoch": 0.27481525863790696,
"grad_norm": 0.8093415051048494,
"learning_rate": 4.340958535740612e-06,
"loss": 0.0889,
"step": 344
},
{
"epoch": 0.2756141402037148,
"grad_norm": 0.8827233794289518,
"learning_rate": 4.336479271643833e-06,
"loss": 0.092,
"step": 345
},
{
"epoch": 0.27641302176952265,
"grad_norm": 0.7468481018597232,
"learning_rate": 4.3319871649563474e-06,
"loss": 0.0957,
"step": 346
},
{
"epoch": 0.2772119033353305,
"grad_norm": 0.8611203322886116,
"learning_rate": 4.32748224709168e-06,
"loss": 0.102,
"step": 347
},
{
"epoch": 0.2780107849011384,
"grad_norm": 0.7823320107034754,
"learning_rate": 4.322964549552943e-06,
"loss": 0.0955,
"step": 348
},
{
"epoch": 0.27880966646694627,
"grad_norm": 0.8059112729344878,
"learning_rate": 4.318434103932622e-06,
"loss": 0.0882,
"step": 349
},
{
"epoch": 0.27960854803275415,
"grad_norm": 0.8138007618828386,
"learning_rate": 4.313890941912347e-06,
"loss": 0.0834,
"step": 350
},
{
"epoch": 0.280407429598562,
"grad_norm": 0.7404431489736772,
"learning_rate": 4.309335095262675e-06,
"loss": 0.0949,
"step": 351
},
{
"epoch": 0.2812063111643699,
"grad_norm": 0.7470188231140139,
"learning_rate": 4.30476659584287e-06,
"loss": 0.0842,
"step": 352
},
{
"epoch": 0.28200519273017777,
"grad_norm": 0.797025807552636,
"learning_rate": 4.3001854756006724e-06,
"loss": 0.0834,
"step": 353
},
{
"epoch": 0.28280407429598564,
"grad_norm": 0.814130943144683,
"learning_rate": 4.295591766572086e-06,
"loss": 0.0932,
"step": 354
},
{
"epoch": 0.2836029558617935,
"grad_norm": 0.8189778490428435,
"learning_rate": 4.290985500881143e-06,
"loss": 0.1058,
"step": 355
},
{
"epoch": 0.28440183742760133,
"grad_norm": 0.8241913671649066,
"learning_rate": 4.286366710739691e-06,
"loss": 0.0942,
"step": 356
},
{
"epoch": 0.2852007189934092,
"grad_norm": 0.8965816503974172,
"learning_rate": 4.281735428447158e-06,
"loss": 0.0832,
"step": 357
},
{
"epoch": 0.2859996005592171,
"grad_norm": 0.8157444856364388,
"learning_rate": 4.2770916863903295e-06,
"loss": 0.0908,
"step": 358
},
{
"epoch": 0.28679848212502496,
"grad_norm": 0.7854869982626375,
"learning_rate": 4.272435517043125e-06,
"loss": 0.1096,
"step": 359
},
{
"epoch": 0.28759736369083283,
"grad_norm": 0.8720888550038643,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0888,
"step": 360
},
{
"epoch": 0.2883962452566407,
"grad_norm": 0.8053883429022618,
"learning_rate": 4.263086026807561e-06,
"loss": 0.0968,
"step": 361
},
{
"epoch": 0.2891951268224486,
"grad_norm": 0.8248678346640207,
"learning_rate": 4.258392771300649e-06,
"loss": 0.0958,
"step": 362
},
{
"epoch": 0.28999400838825645,
"grad_norm": 0.8242800118706491,
"learning_rate": 4.253687219265803e-06,
"loss": 0.0785,
"step": 363
},
{
"epoch": 0.2907928899540643,
"grad_norm": 0.8003059421660266,
"learning_rate": 4.248969403609182e-06,
"loss": 0.096,
"step": 364
},
{
"epoch": 0.2915917715198722,
"grad_norm": 0.8463871811311655,
"learning_rate": 4.244239357322705e-06,
"loss": 0.106,
"step": 365
},
{
"epoch": 0.2923906530856801,
"grad_norm": 0.8461024953606568,
"learning_rate": 4.239497113483819e-06,
"loss": 0.102,
"step": 366
},
{
"epoch": 0.2931895346514879,
"grad_norm": 0.7888813716191685,
"learning_rate": 4.2347427052552725e-06,
"loss": 0.0965,
"step": 367
},
{
"epoch": 0.29398841621729577,
"grad_norm": 0.8119369304603392,
"learning_rate": 4.2299761658848775e-06,
"loss": 0.0988,
"step": 368
},
{
"epoch": 0.29478729778310364,
"grad_norm": 1.014324893212535,
"learning_rate": 4.2251975287052804e-06,
"loss": 0.1018,
"step": 369
},
{
"epoch": 0.2955861793489115,
"grad_norm": 0.8064204728119285,
"learning_rate": 4.220406827133728e-06,
"loss": 0.106,
"step": 370
},
{
"epoch": 0.2963850609147194,
"grad_norm": 0.822626417850115,
"learning_rate": 4.215604094671835e-06,
"loss": 0.0901,
"step": 371
},
{
"epoch": 0.29718394248052726,
"grad_norm": 0.9253154637853676,
"learning_rate": 4.2107893649053465e-06,
"loss": 0.1098,
"step": 372
},
{
"epoch": 0.29798282404633514,
"grad_norm": 0.7253683875908528,
"learning_rate": 4.205962671503907e-06,
"loss": 0.105,
"step": 373
},
{
"epoch": 0.298781705612143,
"grad_norm": 0.8070549842742699,
"learning_rate": 4.201124048220825e-06,
"loss": 0.0902,
"step": 374
},
{
"epoch": 0.2995805871779509,
"grad_norm": 0.8993069957177886,
"learning_rate": 4.196273528892831e-06,
"loss": 0.1005,
"step": 375
},
{
"epoch": 0.30037946874375876,
"grad_norm": 0.7182651047515527,
"learning_rate": 4.191411147439849e-06,
"loss": 0.0905,
"step": 376
},
{
"epoch": 0.30117835030956663,
"grad_norm": 0.8275259104357634,
"learning_rate": 4.186536937864752e-06,
"loss": 0.0876,
"step": 377
},
{
"epoch": 0.30197723187537445,
"grad_norm": 0.7585687841737675,
"learning_rate": 4.181650934253132e-06,
"loss": 0.0926,
"step": 378
},
{
"epoch": 0.3027761134411823,
"grad_norm": 0.7808709996475448,
"learning_rate": 4.176753170773053e-06,
"loss": 0.0974,
"step": 379
},
{
"epoch": 0.3035749950069902,
"grad_norm": 0.8210683387788029,
"learning_rate": 4.171843681674818e-06,
"loss": 0.0999,
"step": 380
},
{
"epoch": 0.3043738765727981,
"grad_norm": 0.7953882184740051,
"learning_rate": 4.16692250129073e-06,
"loss": 0.0836,
"step": 381
},
{
"epoch": 0.30517275813860595,
"grad_norm": 0.7020914530946579,
"learning_rate": 4.161989664034844e-06,
"loss": 0.0822,
"step": 382
},
{
"epoch": 0.3059716397044138,
"grad_norm": 0.820358474319794,
"learning_rate": 4.157045204402741e-06,
"loss": 0.0901,
"step": 383
},
{
"epoch": 0.3067705212702217,
"grad_norm": 0.7488124118932843,
"learning_rate": 4.152089156971268e-06,
"loss": 0.0898,
"step": 384
},
{
"epoch": 0.30756940283602957,
"grad_norm": 0.7513953195291163,
"learning_rate": 4.1471215563983125e-06,
"loss": 0.1036,
"step": 385
},
{
"epoch": 0.30836828440183744,
"grad_norm": 0.8572423002912835,
"learning_rate": 4.142142437422552e-06,
"loss": 0.101,
"step": 386
},
{
"epoch": 0.3091671659676453,
"grad_norm": 0.7548601875565724,
"learning_rate": 4.137151834863213e-06,
"loss": 0.0966,
"step": 387
},
{
"epoch": 0.3099660475334532,
"grad_norm": 0.7870823465308348,
"learning_rate": 4.132149783619826e-06,
"loss": 0.093,
"step": 388
},
{
"epoch": 0.310764929099261,
"grad_norm": 0.6611014210675613,
"learning_rate": 4.127136318671984e-06,
"loss": 0.0854,
"step": 389
},
{
"epoch": 0.3115638106650689,
"grad_norm": 0.7054296499042809,
"learning_rate": 4.122111475079097e-06,
"loss": 0.0922,
"step": 390
},
{
"epoch": 0.31236269223087676,
"grad_norm": 0.8038584158970129,
"learning_rate": 4.117075287980144e-06,
"loss": 0.0859,
"step": 391
},
{
"epoch": 0.31316157379668463,
"grad_norm": 0.7871099701396741,
"learning_rate": 4.112027792593433e-06,
"loss": 0.0889,
"step": 392
},
{
"epoch": 0.3139604553624925,
"grad_norm": 0.6799036215445816,
"learning_rate": 4.106969024216348e-06,
"loss": 0.0803,
"step": 393
},
{
"epoch": 0.3147593369283004,
"grad_norm": 0.8062246682119243,
"learning_rate": 4.101899018225111e-06,
"loss": 0.0911,
"step": 394
},
{
"epoch": 0.31555821849410826,
"grad_norm": 0.7615051736680087,
"learning_rate": 4.096817810074521e-06,
"loss": 0.0976,
"step": 395
},
{
"epoch": 0.31635710005991613,
"grad_norm": 0.7754952681063141,
"learning_rate": 4.091725435297721e-06,
"loss": 0.0915,
"step": 396
},
{
"epoch": 0.317155981625724,
"grad_norm": 0.8407847771110065,
"learning_rate": 4.086621929505941e-06,
"loss": 0.0931,
"step": 397
},
{
"epoch": 0.3179548631915319,
"grad_norm": 0.8100383074921174,
"learning_rate": 4.0815073283882495e-06,
"loss": 0.0834,
"step": 398
},
{
"epoch": 0.31875374475733975,
"grad_norm": 0.7969746133003707,
"learning_rate": 4.076381667711306e-06,
"loss": 0.0874,
"step": 399
},
{
"epoch": 0.31955262632314757,
"grad_norm": 0.8152506864022063,
"learning_rate": 4.0712449833191115e-06,
"loss": 0.0893,
"step": 400
},
{
"epoch": 0.32035150788895544,
"grad_norm": 0.8233851359298267,
"learning_rate": 4.066097311132754e-06,
"loss": 0.0838,
"step": 401
},
{
"epoch": 0.3211503894547633,
"grad_norm": 0.8539151154626614,
"learning_rate": 4.060938687150159e-06,
"loss": 0.0989,
"step": 402
},
{
"epoch": 0.3219492710205712,
"grad_norm": 0.6995749387992817,
"learning_rate": 4.055769147445842e-06,
"loss": 0.0861,
"step": 403
},
{
"epoch": 0.32274815258637907,
"grad_norm": 0.8859127143940589,
"learning_rate": 4.0505887281706505e-06,
"loss": 0.1025,
"step": 404
},
{
"epoch": 0.32354703415218694,
"grad_norm": 0.694119492128483,
"learning_rate": 4.045397465551513e-06,
"loss": 0.0808,
"step": 405
},
{
"epoch": 0.3243459157179948,
"grad_norm": 0.7492015043916129,
"learning_rate": 4.040195395891187e-06,
"loss": 0.0873,
"step": 406
},
{
"epoch": 0.3251447972838027,
"grad_norm": 0.7380399378402056,
"learning_rate": 4.034982555568005e-06,
"loss": 0.0874,
"step": 407
},
{
"epoch": 0.32594367884961056,
"grad_norm": 0.7576142318886974,
"learning_rate": 4.029758981035617e-06,
"loss": 0.0887,
"step": 408
},
{
"epoch": 0.32674256041541844,
"grad_norm": 0.6820402330521123,
"learning_rate": 4.024524708822739e-06,
"loss": 0.0876,
"step": 409
},
{
"epoch": 0.32754144198122626,
"grad_norm": 0.8158511029737862,
"learning_rate": 4.019279775532896e-06,
"loss": 0.0912,
"step": 410
},
{
"epoch": 0.32834032354703413,
"grad_norm": 0.7272739087436181,
"learning_rate": 4.014024217844167e-06,
"loss": 0.0889,
"step": 411
},
{
"epoch": 0.329139205112842,
"grad_norm": 0.712392288383156,
"learning_rate": 4.008758072508929e-06,
"loss": 0.0885,
"step": 412
},
{
"epoch": 0.3299380866786499,
"grad_norm": 0.7332274428055798,
"learning_rate": 4.0034813763535965e-06,
"loss": 0.0952,
"step": 413
},
{
"epoch": 0.33073696824445775,
"grad_norm": 0.8308294351934334,
"learning_rate": 3.9981941662783675e-06,
"loss": 0.0943,
"step": 414
},
{
"epoch": 0.3315358498102656,
"grad_norm": 0.7987444077880893,
"learning_rate": 3.992896479256966e-06,
"loss": 0.0905,
"step": 415
},
{
"epoch": 0.3323347313760735,
"grad_norm": 0.752508121022912,
"learning_rate": 3.987588352336379e-06,
"loss": 0.0882,
"step": 416
},
{
"epoch": 0.3331336129418814,
"grad_norm": 0.7419767282856443,
"learning_rate": 3.982269822636602e-06,
"loss": 0.0812,
"step": 417
},
{
"epoch": 0.33393249450768925,
"grad_norm": 0.6909171898250309,
"learning_rate": 3.976940927350377e-06,
"loss": 0.0837,
"step": 418
},
{
"epoch": 0.3347313760734971,
"grad_norm": 0.7683212481615431,
"learning_rate": 3.971601703742932e-06,
"loss": 0.0873,
"step": 419
},
{
"epoch": 0.335530257639305,
"grad_norm": 0.6937329591034055,
"learning_rate": 3.966252189151726e-06,
"loss": 0.0794,
"step": 420
},
{
"epoch": 0.3363291392051128,
"grad_norm": 0.7515329965394459,
"learning_rate": 3.960892420986177e-06,
"loss": 0.0842,
"step": 421
},
{
"epoch": 0.3371280207709207,
"grad_norm": 0.726559526657185,
"learning_rate": 3.955522436727412e-06,
"loss": 0.097,
"step": 422
},
{
"epoch": 0.33792690233672856,
"grad_norm": 0.7327126701823631,
"learning_rate": 3.950142273927996e-06,
"loss": 0.0785,
"step": 423
},
{
"epoch": 0.33872578390253644,
"grad_norm": 0.7546091429104848,
"learning_rate": 3.944751970211675e-06,
"loss": 0.0777,
"step": 424
},
{
"epoch": 0.3395246654683443,
"grad_norm": 0.6886480296858932,
"learning_rate": 3.93935156327311e-06,
"loss": 0.0863,
"step": 425
},
{
"epoch": 0.3403235470341522,
"grad_norm": 0.7426135389310096,
"learning_rate": 3.933941090877615e-06,
"loss": 0.0916,
"step": 426
},
{
"epoch": 0.34112242859996006,
"grad_norm": 0.766034385340495,
"learning_rate": 3.928520590860894e-06,
"loss": 0.0936,
"step": 427
},
{
"epoch": 0.34192131016576793,
"grad_norm": 0.6947732694520958,
"learning_rate": 3.9230901011287695e-06,
"loss": 0.0843,
"step": 428
},
{
"epoch": 0.3427201917315758,
"grad_norm": 0.7502696533350824,
"learning_rate": 3.917649659656927e-06,
"loss": 0.0894,
"step": 429
},
{
"epoch": 0.3435190732973837,
"grad_norm": 0.6987339211471317,
"learning_rate": 3.912199304490645e-06,
"loss": 0.0841,
"step": 430
},
{
"epoch": 0.34431795486319156,
"grad_norm": 0.7484094471603621,
"learning_rate": 3.906739073744526e-06,
"loss": 0.0867,
"step": 431
},
{
"epoch": 0.3451168364289994,
"grad_norm": 0.8136583365831624,
"learning_rate": 3.901269005602235e-06,
"loss": 0.09,
"step": 432
},
{
"epoch": 0.34591571799480725,
"grad_norm": 0.6891207387873332,
"learning_rate": 3.895789138316231e-06,
"loss": 0.0757,
"step": 433
},
{
"epoch": 0.3467145995606151,
"grad_norm": 0.7235748175530092,
"learning_rate": 3.8902995102074985e-06,
"loss": 0.0829,
"step": 434
},
{
"epoch": 0.347513481126423,
"grad_norm": 0.7886249459447628,
"learning_rate": 3.8848001596652765e-06,
"loss": 0.0834,
"step": 435
},
{
"epoch": 0.34831236269223087,
"grad_norm": 0.7917287800334782,
"learning_rate": 3.879291125146798e-06,
"loss": 0.0863,
"step": 436
},
{
"epoch": 0.34911124425803874,
"grad_norm": 0.7593521963308881,
"learning_rate": 3.8737724451770155e-06,
"loss": 0.0855,
"step": 437
},
{
"epoch": 0.3499101258238466,
"grad_norm": 0.675290830162991,
"learning_rate": 3.868244158348331e-06,
"loss": 0.0705,
"step": 438
},
{
"epoch": 0.3507090073896545,
"grad_norm": 0.7483252051084053,
"learning_rate": 3.862706303320329e-06,
"loss": 0.0879,
"step": 439
},
{
"epoch": 0.35150788895546237,
"grad_norm": 0.7298703252341586,
"learning_rate": 3.857158918819506e-06,
"loss": 0.0797,
"step": 440
},
{
"epoch": 0.35230677052127024,
"grad_norm": 0.8238714701549442,
"learning_rate": 3.8516020436389945e-06,
"loss": 0.0941,
"step": 441
},
{
"epoch": 0.3531056520870781,
"grad_norm": 0.7126221155492483,
"learning_rate": 3.8460357166383e-06,
"loss": 0.0851,
"step": 442
},
{
"epoch": 0.35390453365288593,
"grad_norm": 0.8197766203707731,
"learning_rate": 3.840459976743024e-06,
"loss": 0.0957,
"step": 443
},
{
"epoch": 0.3547034152186938,
"grad_norm": 0.7846087618909572,
"learning_rate": 3.834874862944591e-06,
"loss": 0.099,
"step": 444
},
{
"epoch": 0.3555022967845017,
"grad_norm": 0.716374377569519,
"learning_rate": 3.82928041429998e-06,
"loss": 0.0968,
"step": 445
},
{
"epoch": 0.35630117835030956,
"grad_norm": 0.8390389818745712,
"learning_rate": 3.823676669931448e-06,
"loss": 0.089,
"step": 446
},
{
"epoch": 0.35710005991611743,
"grad_norm": 0.747973364886596,
"learning_rate": 3.8180636690262565e-06,
"loss": 0.0958,
"step": 447
},
{
"epoch": 0.3578989414819253,
"grad_norm": 0.6950670448192524,
"learning_rate": 3.8124414508364005e-06,
"loss": 0.0802,
"step": 448
},
{
"epoch": 0.3586978230477332,
"grad_norm": 0.7708194029959454,
"learning_rate": 3.8068100546783315e-06,
"loss": 0.0795,
"step": 449
},
{
"epoch": 0.35949670461354105,
"grad_norm": 0.7493448729691877,
"learning_rate": 3.801169519932681e-06,
"loss": 0.0884,
"step": 450
},
{
"epoch": 0.3602955861793489,
"grad_norm": 0.8085585194492837,
"learning_rate": 3.7955198860439892e-06,
"loss": 0.0981,
"step": 451
},
{
"epoch": 0.3610944677451568,
"grad_norm": 0.7586954756100585,
"learning_rate": 3.789861192520426e-06,
"loss": 0.096,
"step": 452
},
{
"epoch": 0.3618933493109647,
"grad_norm": 0.7843231944695468,
"learning_rate": 3.7841934789335167e-06,
"loss": 0.0896,
"step": 453
},
{
"epoch": 0.3626922308767725,
"grad_norm": 0.7476912899547512,
"learning_rate": 3.778516784917863e-06,
"loss": 0.0787,
"step": 454
},
{
"epoch": 0.36349111244258037,
"grad_norm": 0.7189383293665426,
"learning_rate": 3.772831150170868e-06,
"loss": 0.0879,
"step": 455
},
{
"epoch": 0.36428999400838824,
"grad_norm": 0.7132741109475154,
"learning_rate": 3.767136614452458e-06,
"loss": 0.0865,
"step": 456
},
{
"epoch": 0.3650888755741961,
"grad_norm": 0.7493526039741611,
"learning_rate": 3.761433217584803e-06,
"loss": 0.0869,
"step": 457
},
{
"epoch": 0.365887757140004,
"grad_norm": 0.8337392813152291,
"learning_rate": 3.7557209994520428e-06,
"loss": 0.0908,
"step": 458
},
{
"epoch": 0.36668663870581186,
"grad_norm": 0.7810036583492975,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0785,
"step": 459
},
{
"epoch": 0.36748552027161974,
"grad_norm": 0.8293572960190608,
"learning_rate": 3.7442702592359094e-06,
"loss": 0.0855,
"step": 460
},
{
"epoch": 0.3682844018374276,
"grad_norm": 0.7663051871847771,
"learning_rate": 3.7385318172281314e-06,
"loss": 0.0821,
"step": 461
},
{
"epoch": 0.3690832834032355,
"grad_norm": 0.6698128445071535,
"learning_rate": 3.732784714105876e-06,
"loss": 0.0814,
"step": 462
},
{
"epoch": 0.36988216496904336,
"grad_norm": 0.741297757096265,
"learning_rate": 3.727028990058921e-06,
"loss": 0.0788,
"step": 463
},
{
"epoch": 0.37068104653485123,
"grad_norm": 0.8326064497418644,
"learning_rate": 3.7212646853373304e-06,
"loss": 0.0913,
"step": 464
},
{
"epoch": 0.37147992810065905,
"grad_norm": 0.7519122240289404,
"learning_rate": 3.715491840251172e-06,
"loss": 0.0923,
"step": 465
},
{
"epoch": 0.3722788096664669,
"grad_norm": 0.8495937228845698,
"learning_rate": 3.70971049517024e-06,
"loss": 0.0991,
"step": 466
},
{
"epoch": 0.3730776912322748,
"grad_norm": 0.7980569435136234,
"learning_rate": 3.7039206905237663e-06,
"loss": 0.0846,
"step": 467
},
{
"epoch": 0.3738765727980827,
"grad_norm": 0.82253196121602,
"learning_rate": 3.6981224668001427e-06,
"loss": 0.0907,
"step": 468
},
{
"epoch": 0.37467545436389055,
"grad_norm": 0.7813567054207068,
"learning_rate": 3.692315864546635e-06,
"loss": 0.0954,
"step": 469
},
{
"epoch": 0.3754743359296984,
"grad_norm": 0.6900644306541571,
"learning_rate": 3.6865009243691015e-06,
"loss": 0.0852,
"step": 470
},
{
"epoch": 0.3762732174955063,
"grad_norm": 0.8286216266975185,
"learning_rate": 3.6806776869317074e-06,
"loss": 0.0971,
"step": 471
},
{
"epoch": 0.37707209906131417,
"grad_norm": 0.8411427447979212,
"learning_rate": 3.6748461929566405e-06,
"loss": 0.0976,
"step": 472
},
{
"epoch": 0.37787098062712204,
"grad_norm": 0.7097804298197564,
"learning_rate": 3.6690064832238287e-06,
"loss": 0.084,
"step": 473
},
{
"epoch": 0.3786698621929299,
"grad_norm": 0.6997311886122694,
"learning_rate": 3.663158598570652e-06,
"loss": 0.0879,
"step": 474
},
{
"epoch": 0.3794687437587378,
"grad_norm": 0.7892931222413458,
"learning_rate": 3.6573025798916566e-06,
"loss": 0.0858,
"step": 475
},
{
"epoch": 0.3802676253245456,
"grad_norm": 0.7881856449766588,
"learning_rate": 3.6514384681382736e-06,
"loss": 0.0952,
"step": 476
},
{
"epoch": 0.3810665068903535,
"grad_norm": 0.7870138639216214,
"learning_rate": 3.6455663043185264e-06,
"loss": 0.1006,
"step": 477
},
{
"epoch": 0.38186538845616136,
"grad_norm": 0.7825807381829545,
"learning_rate": 3.639686129496749e-06,
"loss": 0.0831,
"step": 478
},
{
"epoch": 0.38266427002196923,
"grad_norm": 0.8412645195997293,
"learning_rate": 3.6337979847932948e-06,
"loss": 0.0878,
"step": 479
},
{
"epoch": 0.3834631515877771,
"grad_norm": 0.7619513322856116,
"learning_rate": 3.627901911384252e-06,
"loss": 0.0857,
"step": 480
},
{
"epoch": 0.384262033153585,
"grad_norm": 0.920197415542654,
"learning_rate": 3.621997950501156e-06,
"loss": 0.0923,
"step": 481
},
{
"epoch": 0.38506091471939285,
"grad_norm": 0.9089054249688453,
"learning_rate": 3.616086143430697e-06,
"loss": 0.0831,
"step": 482
},
{
"epoch": 0.38585979628520073,
"grad_norm": 0.8882827945787267,
"learning_rate": 3.6101665315144357e-06,
"loss": 0.0879,
"step": 483
},
{
"epoch": 0.3866586778510086,
"grad_norm": 0.853626379217836,
"learning_rate": 3.604239156148512e-06,
"loss": 0.0856,
"step": 484
},
{
"epoch": 0.3874575594168165,
"grad_norm": 0.7468567313834733,
"learning_rate": 3.598304058783357e-06,
"loss": 0.0919,
"step": 485
},
{
"epoch": 0.38825644098262435,
"grad_norm": 0.8236774995238476,
"learning_rate": 3.5923612809233987e-06,
"loss": 0.0958,
"step": 486
},
{
"epoch": 0.38905532254843217,
"grad_norm": 0.8150930399340047,
"learning_rate": 3.5864108641267815e-06,
"loss": 0.089,
"step": 487
},
{
"epoch": 0.38985420411424004,
"grad_norm": 0.7645579281892109,
"learning_rate": 3.580452850005061e-06,
"loss": 0.1047,
"step": 488
},
{
"epoch": 0.3906530856800479,
"grad_norm": 0.8299900496448547,
"learning_rate": 3.5744872802229296e-06,
"loss": 0.088,
"step": 489
},
{
"epoch": 0.3914519672458558,
"grad_norm": 0.8345475593778248,
"learning_rate": 3.56851419649791e-06,
"loss": 0.0878,
"step": 490
},
{
"epoch": 0.39225084881166367,
"grad_norm": 0.7331030137600181,
"learning_rate": 3.5625336406000752e-06,
"loss": 0.0873,
"step": 491
},
{
"epoch": 0.39304973037747154,
"grad_norm": 0.751621872360305,
"learning_rate": 3.556545654351749e-06,
"loss": 0.0885,
"step": 492
},
{
"epoch": 0.3938486119432794,
"grad_norm": 1.0573113079653964,
"learning_rate": 3.5505502796272157e-06,
"loss": 0.0967,
"step": 493
},
{
"epoch": 0.3946474935090873,
"grad_norm": 0.8576970399627939,
"learning_rate": 3.5445475583524293e-06,
"loss": 0.0929,
"step": 494
},
{
"epoch": 0.39544637507489516,
"grad_norm": 0.7726071680089382,
"learning_rate": 3.5385375325047167e-06,
"loss": 0.0861,
"step": 495
},
{
"epoch": 0.39624525664070304,
"grad_norm": 0.8365490675498872,
"learning_rate": 3.5325202441124875e-06,
"loss": 0.085,
"step": 496
},
{
"epoch": 0.3970441382065109,
"grad_norm": 0.8195417566783639,
"learning_rate": 3.5264957352549378e-06,
"loss": 0.0978,
"step": 497
},
{
"epoch": 0.39784301977231873,
"grad_norm": 0.787890923867209,
"learning_rate": 3.520464048061758e-06,
"loss": 0.0958,
"step": 498
},
{
"epoch": 0.3986419013381266,
"grad_norm": 0.7294064863532496,
"learning_rate": 3.514425224712835e-06,
"loss": 0.0768,
"step": 499
},
{
"epoch": 0.3994407829039345,
"grad_norm": 0.8964503937639478,
"learning_rate": 3.5083793074379607e-06,
"loss": 0.0779,
"step": 500
},
{
"epoch": 0.3994407829039345,
"eval_loss": 0.08617319911718369,
"eval_runtime": 16.1068,
"eval_samples_per_second": 50.289,
"eval_steps_per_second": 6.333,
"step": 500
},
{
"epoch": 0.40023966446974235,
"grad_norm": 0.699956224503503,
"learning_rate": 3.5023263385165346e-06,
"loss": 0.0782,
"step": 501
},
{
"epoch": 0.4010385460355502,
"grad_norm": 0.8000522503429608,
"learning_rate": 3.496266360277269e-06,
"loss": 0.0823,
"step": 502
},
{
"epoch": 0.4018374276013581,
"grad_norm": 0.8310072491457343,
"learning_rate": 3.4901994150978926e-06,
"loss": 0.087,
"step": 503
},
{
"epoch": 0.402636309167166,
"grad_norm": 0.7698651148649212,
"learning_rate": 3.484125545404854e-06,
"loss": 0.0776,
"step": 504
},
{
"epoch": 0.40343519073297385,
"grad_norm": 0.6925316210594149,
"learning_rate": 3.478044793673025e-06,
"loss": 0.071,
"step": 505
},
{
"epoch": 0.4042340722987817,
"grad_norm": 0.8646957127737731,
"learning_rate": 3.4719572024254057e-06,
"loss": 0.0854,
"step": 506
},
{
"epoch": 0.4050329538645896,
"grad_norm": 0.8538481314053985,
"learning_rate": 3.4658628142328215e-06,
"loss": 0.0876,
"step": 507
},
{
"epoch": 0.40583183543039747,
"grad_norm": 0.6801192727986217,
"learning_rate": 3.4597616717136344e-06,
"loss": 0.0871,
"step": 508
},
{
"epoch": 0.4066307169962053,
"grad_norm": 0.7685526683593954,
"learning_rate": 3.453653817533435e-06,
"loss": 0.0786,
"step": 509
},
{
"epoch": 0.40742959856201316,
"grad_norm": 0.898757778816308,
"learning_rate": 3.4475392944047514e-06,
"loss": 0.0957,
"step": 510
},
{
"epoch": 0.40822848012782104,
"grad_norm": 0.6611162518031449,
"learning_rate": 3.4414181450867466e-06,
"loss": 0.073,
"step": 511
},
{
"epoch": 0.4090273616936289,
"grad_norm": 0.727245696789342,
"learning_rate": 3.435290412384924e-06,
"loss": 0.0893,
"step": 512
},
{
"epoch": 0.4098262432594368,
"grad_norm": 0.8581774504750863,
"learning_rate": 3.429156139150819e-06,
"loss": 0.0961,
"step": 513
},
{
"epoch": 0.41062512482524466,
"grad_norm": 0.8327009556188208,
"learning_rate": 3.4230153682817112e-06,
"loss": 0.0868,
"step": 514
},
{
"epoch": 0.41142400639105253,
"grad_norm": 0.7252986562598082,
"learning_rate": 3.416868142720316e-06,
"loss": 0.0885,
"step": 515
},
{
"epoch": 0.4122228879568604,
"grad_norm": 0.7778675249904831,
"learning_rate": 3.410714505454486e-06,
"loss": 0.0856,
"step": 516
},
{
"epoch": 0.4130217695226683,
"grad_norm": 0.9287656547088576,
"learning_rate": 3.4045544995169126e-06,
"loss": 0.0904,
"step": 517
},
{
"epoch": 0.41382065108847615,
"grad_norm": 0.6975552514483802,
"learning_rate": 3.398388167984823e-06,
"loss": 0.0883,
"step": 518
},
{
"epoch": 0.41461953265428403,
"grad_norm": 0.7566168004464808,
"learning_rate": 3.39221555397968e-06,
"loss": 0.0876,
"step": 519
},
{
"epoch": 0.41541841422009185,
"grad_norm": 0.8230391216164356,
"learning_rate": 3.386036700666879e-06,
"loss": 0.0781,
"step": 520
},
{
"epoch": 0.4162172957858997,
"grad_norm": 0.825174450398779,
"learning_rate": 3.379851651255449e-06,
"loss": 0.0899,
"step": 521
},
{
"epoch": 0.4170161773517076,
"grad_norm": 0.7049547674213612,
"learning_rate": 3.3736604489977465e-06,
"loss": 0.0813,
"step": 522
},
{
"epoch": 0.41781505891751547,
"grad_norm": 0.7945037941643481,
"learning_rate": 3.3674631371891564e-06,
"loss": 0.0721,
"step": 523
},
{
"epoch": 0.41861394048332334,
"grad_norm": 0.728321463459383,
"learning_rate": 3.361259759167788e-06,
"loss": 0.0957,
"step": 524
},
{
"epoch": 0.4194128220491312,
"grad_norm": 0.7250715129014192,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.0727,
"step": 525
},
{
"epoch": 0.4202117036149391,
"grad_norm": 0.7777154233383422,
"learning_rate": 3.348834978050957e-06,
"loss": 0.0848,
"step": 526
},
{
"epoch": 0.42101058518074697,
"grad_norm": 0.7030006532784772,
"learning_rate": 3.3426136618426045e-06,
"loss": 0.0806,
"step": 527
},
{
"epoch": 0.42180946674655484,
"grad_norm": 0.7258714063056894,
"learning_rate": 3.3363864531950884e-06,
"loss": 0.0798,
"step": 528
},
{
"epoch": 0.4226083483123627,
"grad_norm": 0.7687609634365744,
"learning_rate": 3.3301533956555886e-06,
"loss": 0.0951,
"step": 529
},
{
"epoch": 0.4234072298781706,
"grad_norm": 0.7146960561827889,
"learning_rate": 3.323914532812184e-06,
"loss": 0.0828,
"step": 530
},
{
"epoch": 0.4242061114439784,
"grad_norm": 0.8026753326015152,
"learning_rate": 3.3176699082935546e-06,
"loss": 0.0876,
"step": 531
},
{
"epoch": 0.4250049930097863,
"grad_norm": 0.7107423975805593,
"learning_rate": 3.311419565768667e-06,
"loss": 0.0753,
"step": 532
},
{
"epoch": 0.42580387457559415,
"grad_norm": 0.7112921804432786,
"learning_rate": 3.3051635489464793e-06,
"loss": 0.083,
"step": 533
},
{
"epoch": 0.42660275614140203,
"grad_norm": 0.7737378542138326,
"learning_rate": 3.2989019015756253e-06,
"loss": 0.0932,
"step": 534
},
{
"epoch": 0.4274016377072099,
"grad_norm": 0.7077901777401819,
"learning_rate": 3.2926346674441173e-06,
"loss": 0.0785,
"step": 535
},
{
"epoch": 0.4282005192730178,
"grad_norm": 0.7856747325005221,
"learning_rate": 3.2863618903790346e-06,
"loss": 0.095,
"step": 536
},
{
"epoch": 0.42899940083882565,
"grad_norm": 0.6681798933516729,
"learning_rate": 3.280083614246218e-06,
"loss": 0.077,
"step": 537
},
{
"epoch": 0.4297982824046335,
"grad_norm": 0.7425470556630313,
"learning_rate": 3.2737998829499645e-06,
"loss": 0.0843,
"step": 538
},
{
"epoch": 0.4305971639704414,
"grad_norm": 0.7984293642051749,
"learning_rate": 3.2675107404327195e-06,
"loss": 0.0949,
"step": 539
},
{
"epoch": 0.4313960455362493,
"grad_norm": 0.8364406080946829,
"learning_rate": 3.261216230674768e-06,
"loss": 0.0914,
"step": 540
},
{
"epoch": 0.43219492710205715,
"grad_norm": 0.7017424910882903,
"learning_rate": 3.2549163976939292e-06,
"loss": 0.0841,
"step": 541
},
{
"epoch": 0.43299380866786497,
"grad_norm": 0.6971449842375312,
"learning_rate": 3.2486112855452485e-06,
"loss": 0.0797,
"step": 542
},
{
"epoch": 0.43379269023367284,
"grad_norm": 0.7830009514828633,
"learning_rate": 3.2423009383206876e-06,
"loss": 0.0933,
"step": 543
},
{
"epoch": 0.4345915717994807,
"grad_norm": 0.7647997750857403,
"learning_rate": 3.2359854001488178e-06,
"loss": 0.083,
"step": 544
},
{
"epoch": 0.4353904533652886,
"grad_norm": 0.6895639349115328,
"learning_rate": 3.2296647151945116e-06,
"loss": 0.0767,
"step": 545
},
{
"epoch": 0.43618933493109646,
"grad_norm": 0.7409218395782419,
"learning_rate": 3.2233389276586325e-06,
"loss": 0.0802,
"step": 546
},
{
"epoch": 0.43698821649690434,
"grad_norm": 0.7255590501259186,
"learning_rate": 3.217008081777726e-06,
"loss": 0.0824,
"step": 547
},
{
"epoch": 0.4377870980627122,
"grad_norm": 0.7111795815121589,
"learning_rate": 3.2106722218237124e-06,
"loss": 0.0822,
"step": 548
},
{
"epoch": 0.4385859796285201,
"grad_norm": 0.7793608395589445,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.0883,
"step": 549
},
{
"epoch": 0.43938486119432796,
"grad_norm": 0.7229437869187035,
"learning_rate": 3.19798563695905e-06,
"loss": 0.0791,
"step": 550
},
{
"epoch": 0.44018374276013583,
"grad_norm": 0.7477705839069457,
"learning_rate": 3.191635000766318e-06,
"loss": 0.0851,
"step": 551
},
{
"epoch": 0.4409826243259437,
"grad_norm": 0.8331293508094471,
"learning_rate": 3.1852795279356946e-06,
"loss": 0.1016,
"step": 552
},
{
"epoch": 0.4417815058917515,
"grad_norm": 0.7681413958594786,
"learning_rate": 3.1789192629113147e-06,
"loss": 0.0875,
"step": 553
},
{
"epoch": 0.4425803874575594,
"grad_norm": 0.7256511341858173,
"learning_rate": 3.1725542501708302e-06,
"loss": 0.0835,
"step": 554
},
{
"epoch": 0.4433792690233673,
"grad_norm": 0.6987031139534794,
"learning_rate": 3.1661845342250874e-06,
"loss": 0.0832,
"step": 555
},
{
"epoch": 0.44417815058917515,
"grad_norm": 0.6784474789881185,
"learning_rate": 3.159810159617829e-06,
"loss": 0.0783,
"step": 556
},
{
"epoch": 0.444977032154983,
"grad_norm": 0.7254284540533688,
"learning_rate": 3.153431170925373e-06,
"loss": 0.0894,
"step": 557
},
{
"epoch": 0.4457759137207909,
"grad_norm": 0.787126427068804,
"learning_rate": 3.147047612756302e-06,
"loss": 0.0896,
"step": 558
},
{
"epoch": 0.44657479528659877,
"grad_norm": 0.7412461593538884,
"learning_rate": 3.1406595297511568e-06,
"loss": 0.0843,
"step": 559
},
{
"epoch": 0.44737367685240664,
"grad_norm": 0.7092573353441867,
"learning_rate": 3.1342669665821183e-06,
"loss": 0.087,
"step": 560
},
{
"epoch": 0.4481725584182145,
"grad_norm": 0.7691676535522879,
"learning_rate": 3.127869967952698e-06,
"loss": 0.0955,
"step": 561
},
{
"epoch": 0.4489714399840224,
"grad_norm": 0.6881651194944849,
"learning_rate": 3.1214685785974253e-06,
"loss": 0.079,
"step": 562
},
{
"epoch": 0.44977032154983027,
"grad_norm": 0.7008388621611868,
"learning_rate": 3.115062843281534e-06,
"loss": 0.0884,
"step": 563
},
{
"epoch": 0.4505692031156381,
"grad_norm": 0.6735736908138019,
"learning_rate": 3.108652806800648e-06,
"loss": 0.0706,
"step": 564
},
{
"epoch": 0.45136808468144596,
"grad_norm": 0.7315110483129635,
"learning_rate": 3.102238513980471e-06,
"loss": 0.0904,
"step": 565
},
{
"epoch": 0.45216696624725383,
"grad_norm": 0.7103583696801924,
"learning_rate": 3.095820009676471e-06,
"loss": 0.0846,
"step": 566
},
{
"epoch": 0.4529658478130617,
"grad_norm": 0.7891761809600559,
"learning_rate": 3.089397338773569e-06,
"loss": 0.0878,
"step": 567
},
{
"epoch": 0.4537647293788696,
"grad_norm": 0.7306565027752118,
"learning_rate": 3.0829705461858183e-06,
"loss": 0.0912,
"step": 568
},
{
"epoch": 0.45456361094467745,
"grad_norm": 0.6606967844352585,
"learning_rate": 3.0765396768561005e-06,
"loss": 0.0825,
"step": 569
},
{
"epoch": 0.45536249251048533,
"grad_norm": 0.7038331346600363,
"learning_rate": 3.0701047757558046e-06,
"loss": 0.0789,
"step": 570
},
{
"epoch": 0.4561613740762932,
"grad_norm": 0.7487353386872159,
"learning_rate": 3.0636658878845116e-06,
"loss": 0.0764,
"step": 571
},
{
"epoch": 0.4569602556421011,
"grad_norm": 0.8082694931973815,
"learning_rate": 3.0572230582696844e-06,
"loss": 0.0902,
"step": 572
},
{
"epoch": 0.45775913720790895,
"grad_norm": 0.6660374363774814,
"learning_rate": 3.050776331966352e-06,
"loss": 0.0666,
"step": 573
},
{
"epoch": 0.45855801877371677,
"grad_norm": 0.7753707402424571,
"learning_rate": 3.0443257540567896e-06,
"loss": 0.0831,
"step": 574
},
{
"epoch": 0.45935690033952464,
"grad_norm": 0.7653911797345625,
"learning_rate": 3.03787136965021e-06,
"loss": 0.0811,
"step": 575
},
{
"epoch": 0.4601557819053325,
"grad_norm": 0.7319992183728828,
"learning_rate": 3.0314132238824416e-06,
"loss": 0.0846,
"step": 576
},
{
"epoch": 0.4609546634711404,
"grad_norm": 0.7956127905075492,
"learning_rate": 3.0249513619156213e-06,
"loss": 0.0893,
"step": 577
},
{
"epoch": 0.46175354503694827,
"grad_norm": 0.7781976763250263,
"learning_rate": 3.018485828937868e-06,
"loss": 0.081,
"step": 578
},
{
"epoch": 0.46255242660275614,
"grad_norm": 0.7686793219032871,
"learning_rate": 3.012016670162977e-06,
"loss": 0.0821,
"step": 579
},
{
"epoch": 0.463351308168564,
"grad_norm": 0.698093145167194,
"learning_rate": 3.0055439308300954e-06,
"loss": 0.0865,
"step": 580
},
{
"epoch": 0.4641501897343719,
"grad_norm": 0.7476901736827252,
"learning_rate": 2.9990676562034105e-06,
"loss": 0.0815,
"step": 581
},
{
"epoch": 0.46494907130017976,
"grad_norm": 0.8062220941858489,
"learning_rate": 2.992587891571833e-06,
"loss": 0.0884,
"step": 582
},
{
"epoch": 0.46574795286598764,
"grad_norm": 0.743755942738479,
"learning_rate": 2.9861046822486774e-06,
"loss": 0.0796,
"step": 583
},
{
"epoch": 0.4665468344317955,
"grad_norm": 0.7501077059752908,
"learning_rate": 2.9796180735713505e-06,
"loss": 0.0856,
"step": 584
},
{
"epoch": 0.46734571599760333,
"grad_norm": 0.7429044140887799,
"learning_rate": 2.973128110901026e-06,
"loss": 0.0869,
"step": 585
},
{
"epoch": 0.4681445975634112,
"grad_norm": 0.7755885874852674,
"learning_rate": 2.9666348396223367e-06,
"loss": 0.0959,
"step": 586
},
{
"epoch": 0.4689434791292191,
"grad_norm": 0.6801456079207552,
"learning_rate": 2.960138305143051e-06,
"loss": 0.0753,
"step": 587
},
{
"epoch": 0.46974236069502695,
"grad_norm": 0.6757854650456917,
"learning_rate": 2.9536385528937566e-06,
"loss": 0.0715,
"step": 588
},
{
"epoch": 0.4705412422608348,
"grad_norm": 0.6902680146186407,
"learning_rate": 2.9471356283275444e-06,
"loss": 0.0751,
"step": 589
},
{
"epoch": 0.4713401238266427,
"grad_norm": 0.7534035340349238,
"learning_rate": 2.9406295769196868e-06,
"loss": 0.0886,
"step": 590
},
{
"epoch": 0.4721390053924506,
"grad_norm": 0.7774300987221902,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.083,
"step": 591
},
{
"epoch": 0.47293788695825845,
"grad_norm": 0.6527035296264435,
"learning_rate": 2.92760827558915e-06,
"loss": 0.0702,
"step": 592
},
{
"epoch": 0.4737367685240663,
"grad_norm": 0.783007220913397,
"learning_rate": 2.9210931167250766e-06,
"loss": 0.0833,
"step": 593
},
{
"epoch": 0.4745356500898742,
"grad_norm": 0.7250650434550187,
"learning_rate": 2.9145750131359356e-06,
"loss": 0.0887,
"step": 594
},
{
"epoch": 0.47533453165568207,
"grad_norm": 0.769904889862114,
"learning_rate": 2.9080540104031487e-06,
"loss": 0.0839,
"step": 595
},
{
"epoch": 0.4761334132214899,
"grad_norm": 0.7189156153496629,
"learning_rate": 2.901530154128412e-06,
"loss": 0.0844,
"step": 596
},
{
"epoch": 0.47693229478729776,
"grad_norm": 0.7161425909317404,
"learning_rate": 2.895003489933375e-06,
"loss": 0.0723,
"step": 597
},
{
"epoch": 0.47773117635310564,
"grad_norm": 0.708947660281964,
"learning_rate": 2.888474063459326e-06,
"loss": 0.081,
"step": 598
},
{
"epoch": 0.4785300579189135,
"grad_norm": 0.7377958722078087,
"learning_rate": 2.881941920366868e-06,
"loss": 0.0818,
"step": 599
},
{
"epoch": 0.4793289394847214,
"grad_norm": 0.6998439723966013,
"learning_rate": 2.875407106335601e-06,
"loss": 0.0908,
"step": 600
},
{
"epoch": 0.48012782105052926,
"grad_norm": 0.7234747480385919,
"learning_rate": 2.8688696670638057e-06,
"loss": 0.0893,
"step": 601
},
{
"epoch": 0.48092670261633713,
"grad_norm": 0.6802799089422156,
"learning_rate": 2.862329648268117e-06,
"loss": 0.0741,
"step": 602
},
{
"epoch": 0.481725584182145,
"grad_norm": 0.7529275025549153,
"learning_rate": 2.8557870956832135e-06,
"loss": 0.0889,
"step": 603
},
{
"epoch": 0.4825244657479529,
"grad_norm": 0.7184604651026569,
"learning_rate": 2.849242055061488e-06,
"loss": 0.0867,
"step": 604
},
{
"epoch": 0.48332334731376075,
"grad_norm": 0.6581820285749856,
"learning_rate": 2.842694572172737e-06,
"loss": 0.0741,
"step": 605
},
{
"epoch": 0.48412222887956863,
"grad_norm": 0.7922487604497372,
"learning_rate": 2.8361446928038298e-06,
"loss": 0.0852,
"step": 606
},
{
"epoch": 0.48492111044537645,
"grad_norm": 0.7531752613506526,
"learning_rate": 2.829592462758401e-06,
"loss": 0.0777,
"step": 607
},
{
"epoch": 0.4857199920111843,
"grad_norm": 0.8326963935548872,
"learning_rate": 2.8230379278565197e-06,
"loss": 0.0805,
"step": 608
},
{
"epoch": 0.4865188735769922,
"grad_norm": 0.8237145760032943,
"learning_rate": 2.8164811339343736e-06,
"loss": 0.0856,
"step": 609
},
{
"epoch": 0.48731775514280007,
"grad_norm": 0.7065915753904118,
"learning_rate": 2.809922126843948e-06,
"loss": 0.0853,
"step": 610
},
{
"epoch": 0.48811663670860794,
"grad_norm": 0.736514710504635,
"learning_rate": 2.803360952452705e-06,
"loss": 0.0723,
"step": 611
},
{
"epoch": 0.4889155182744158,
"grad_norm": 0.7872810616902329,
"learning_rate": 2.796797656643263e-06,
"loss": 0.0821,
"step": 612
},
{
"epoch": 0.4897143998402237,
"grad_norm": 0.7936471445453519,
"learning_rate": 2.7902322853130758e-06,
"loss": 0.0796,
"step": 613
},
{
"epoch": 0.49051328140603156,
"grad_norm": 0.7465503003939695,
"learning_rate": 2.7836648843741105e-06,
"loss": 0.0918,
"step": 614
},
{
"epoch": 0.49131216297183944,
"grad_norm": 0.7549873274087162,
"learning_rate": 2.7770954997525277e-06,
"loss": 0.0791,
"step": 615
},
{
"epoch": 0.4921110445376473,
"grad_norm": 0.7330346635199269,
"learning_rate": 2.7705241773883607e-06,
"loss": 0.0775,
"step": 616
},
{
"epoch": 0.4929099261034552,
"grad_norm": 0.7262078474613537,
"learning_rate": 2.763950963235193e-06,
"loss": 0.0787,
"step": 617
},
{
"epoch": 0.493708807669263,
"grad_norm": 0.6433172738294531,
"learning_rate": 2.7573759032598367e-06,
"loss": 0.0736,
"step": 618
},
{
"epoch": 0.4945076892350709,
"grad_norm": 0.7007070811492951,
"learning_rate": 2.7507990434420127e-06,
"loss": 0.0845,
"step": 619
},
{
"epoch": 0.49530657080087875,
"grad_norm": 0.7649670953818944,
"learning_rate": 2.7442204297740295e-06,
"loss": 0.0851,
"step": 620
},
{
"epoch": 0.49610545236668663,
"grad_norm": 0.7312713852144791,
"learning_rate": 2.7376401082604563e-06,
"loss": 0.0846,
"step": 621
},
{
"epoch": 0.4969043339324945,
"grad_norm": 0.7290141779950609,
"learning_rate": 2.731058124917812e-06,
"loss": 0.0808,
"step": 622
},
{
"epoch": 0.4977032154983024,
"grad_norm": 0.704482891496262,
"learning_rate": 2.7244745257742293e-06,
"loss": 0.073,
"step": 623
},
{
"epoch": 0.49850209706411025,
"grad_norm": 0.8613003123017297,
"learning_rate": 2.717889356869146e-06,
"loss": 0.0931,
"step": 624
},
{
"epoch": 0.4993009786299181,
"grad_norm": 0.6828247716612095,
"learning_rate": 2.7113026642529733e-06,
"loss": 0.0844,
"step": 625
},
{
"epoch": 0.5000998601957259,
"grad_norm": 0.6921552059802246,
"learning_rate": 2.704714493986782e-06,
"loss": 0.0759,
"step": 626
},
{
"epoch": 0.5008987417615338,
"grad_norm": 0.7382737232554466,
"learning_rate": 2.6981248921419713e-06,
"loss": 0.0834,
"step": 627
},
{
"epoch": 0.5016976233273417,
"grad_norm": 0.7643514787883916,
"learning_rate": 2.6915339047999563e-06,
"loss": 0.0808,
"step": 628
},
{
"epoch": 0.5024965048931496,
"grad_norm": 0.8171733325574314,
"learning_rate": 2.684941578051836e-06,
"loss": 0.0744,
"step": 629
},
{
"epoch": 0.5032953864589574,
"grad_norm": 0.6993831939859319,
"learning_rate": 2.678347957998081e-06,
"loss": 0.0776,
"step": 630
},
{
"epoch": 0.5040942680247653,
"grad_norm": 0.7118175146789436,
"learning_rate": 2.6717530907482027e-06,
"loss": 0.0733,
"step": 631
},
{
"epoch": 0.5048931495905732,
"grad_norm": 0.7288191153869615,
"learning_rate": 2.6651570224204355e-06,
"loss": 0.0815,
"step": 632
},
{
"epoch": 0.5056920311563811,
"grad_norm": 0.713314556630966,
"learning_rate": 2.6585597991414115e-06,
"loss": 0.076,
"step": 633
},
{
"epoch": 0.5064909127221889,
"grad_norm": 0.7568168430439096,
"learning_rate": 2.651961467045842e-06,
"loss": 0.0912,
"step": 634
},
{
"epoch": 0.5072897942879968,
"grad_norm": 0.7245241109183076,
"learning_rate": 2.6453620722761897e-06,
"loss": 0.0737,
"step": 635
},
{
"epoch": 0.5080886758538047,
"grad_norm": 0.7262894883310934,
"learning_rate": 2.6387616609823506e-06,
"loss": 0.0761,
"step": 636
},
{
"epoch": 0.5088875574196126,
"grad_norm": 0.6999036722622294,
"learning_rate": 2.6321602793213287e-06,
"loss": 0.0741,
"step": 637
},
{
"epoch": 0.5096864389854204,
"grad_norm": 0.6976916656678082,
"learning_rate": 2.625557973456913e-06,
"loss": 0.0667,
"step": 638
},
{
"epoch": 0.5104853205512283,
"grad_norm": 0.6808266325469274,
"learning_rate": 2.6189547895593565e-06,
"loss": 0.0703,
"step": 639
},
{
"epoch": 0.5112842021170362,
"grad_norm": 0.7465421794886118,
"learning_rate": 2.6123507738050513e-06,
"loss": 0.0758,
"step": 640
},
{
"epoch": 0.512083083682844,
"grad_norm": 0.6816895308085987,
"learning_rate": 2.6057459723762078e-06,
"loss": 0.0834,
"step": 641
},
{
"epoch": 0.5128819652486519,
"grad_norm": 0.7286471926630991,
"learning_rate": 2.599140431460531e-06,
"loss": 0.0792,
"step": 642
},
{
"epoch": 0.5136808468144598,
"grad_norm": 0.7546333240801898,
"learning_rate": 2.5925341972508954e-06,
"loss": 0.0796,
"step": 643
},
{
"epoch": 0.5144797283802677,
"grad_norm": 0.7683040441468793,
"learning_rate": 2.5859273159450247e-06,
"loss": 0.0736,
"step": 644
},
{
"epoch": 0.5152786099460755,
"grad_norm": 0.7653812473026139,
"learning_rate": 2.57931983374517e-06,
"loss": 0.0802,
"step": 645
},
{
"epoch": 0.5160774915118833,
"grad_norm": 0.7601939158747375,
"learning_rate": 2.572711796857779e-06,
"loss": 0.072,
"step": 646
},
{
"epoch": 0.5168763730776912,
"grad_norm": 0.7195969610874816,
"learning_rate": 2.566103251493184e-06,
"loss": 0.0859,
"step": 647
},
{
"epoch": 0.5176752546434991,
"grad_norm": 0.7884083401959953,
"learning_rate": 2.5594942438652685e-06,
"loss": 0.0761,
"step": 648
},
{
"epoch": 0.5184741362093069,
"grad_norm": 0.6291975339991476,
"learning_rate": 2.5528848201911543e-06,
"loss": 0.0739,
"step": 649
},
{
"epoch": 0.5192730177751148,
"grad_norm": 0.6813656053055375,
"learning_rate": 2.5462750266908667e-06,
"loss": 0.0712,
"step": 650
},
{
"epoch": 0.5200718993409227,
"grad_norm": 0.710394947419851,
"learning_rate": 2.53966490958702e-06,
"loss": 0.0831,
"step": 651
},
{
"epoch": 0.5208707809067306,
"grad_norm": 0.7481196303136334,
"learning_rate": 2.5330545151044923e-06,
"loss": 0.0774,
"step": 652
},
{
"epoch": 0.5216696624725384,
"grad_norm": 0.7096377432791056,
"learning_rate": 2.5264438894700994e-06,
"loss": 0.0811,
"step": 653
},
{
"epoch": 0.5224685440383463,
"grad_norm": 0.820591327390357,
"learning_rate": 2.5198330789122743e-06,
"loss": 0.0844,
"step": 654
},
{
"epoch": 0.5232674256041542,
"grad_norm": 0.7737582631833925,
"learning_rate": 2.5132221296607446e-06,
"loss": 0.0777,
"step": 655
},
{
"epoch": 0.524066307169962,
"grad_norm": 0.7251927679308194,
"learning_rate": 2.5066110879462056e-06,
"loss": 0.0813,
"step": 656
},
{
"epoch": 0.5248651887357699,
"grad_norm": 0.7687650559033102,
"learning_rate": 2.5e-06,
"loss": 0.0756,
"step": 657
},
{
"epoch": 0.5256640703015778,
"grad_norm": 0.6769216177565233,
"learning_rate": 2.493388912053795e-06,
"loss": 0.0688,
"step": 658
},
{
"epoch": 0.5264629518673857,
"grad_norm": 0.7872330702674987,
"learning_rate": 2.486777870339256e-06,
"loss": 0.0823,
"step": 659
},
{
"epoch": 0.5272618334331935,
"grad_norm": 0.6870673717044596,
"learning_rate": 2.4801669210877265e-06,
"loss": 0.0768,
"step": 660
},
{
"epoch": 0.5280607149990014,
"grad_norm": 0.7150216139915514,
"learning_rate": 2.4735561105299014e-06,
"loss": 0.0797,
"step": 661
},
{
"epoch": 0.5288595965648093,
"grad_norm": 0.6773205786007382,
"learning_rate": 2.466945484895509e-06,
"loss": 0.074,
"step": 662
},
{
"epoch": 0.5296584781306172,
"grad_norm": 0.735174296317646,
"learning_rate": 2.4603350904129802e-06,
"loss": 0.0793,
"step": 663
},
{
"epoch": 0.530457359696425,
"grad_norm": 0.7042052228414556,
"learning_rate": 2.453724973309134e-06,
"loss": 0.0752,
"step": 664
},
{
"epoch": 0.5312562412622329,
"grad_norm": 0.7035863201389916,
"learning_rate": 2.4471151798088465e-06,
"loss": 0.0792,
"step": 665
},
{
"epoch": 0.5320551228280408,
"grad_norm": 0.6908163217373349,
"learning_rate": 2.440505756134732e-06,
"loss": 0.0834,
"step": 666
},
{
"epoch": 0.5328540043938486,
"grad_norm": 0.6998637701979498,
"learning_rate": 2.433896748506817e-06,
"loss": 0.076,
"step": 667
},
{
"epoch": 0.5336528859596564,
"grad_norm": 0.6806996354305866,
"learning_rate": 2.4272882031422216e-06,
"loss": 0.0737,
"step": 668
},
{
"epoch": 0.5344517675254643,
"grad_norm": 0.7690510059537475,
"learning_rate": 2.4206801662548314e-06,
"loss": 0.082,
"step": 669
},
{
"epoch": 0.5352506490912722,
"grad_norm": 0.7280524380434941,
"learning_rate": 2.4140726840549757e-06,
"loss": 0.0775,
"step": 670
},
{
"epoch": 0.53604953065708,
"grad_norm": 0.7265244259182214,
"learning_rate": 2.407465802749105e-06,
"loss": 0.0872,
"step": 671
},
{
"epoch": 0.5368484122228879,
"grad_norm": 0.7274938352887984,
"learning_rate": 2.4008595685394694e-06,
"loss": 0.0734,
"step": 672
},
{
"epoch": 0.5376472937886958,
"grad_norm": 0.715787901834074,
"learning_rate": 2.3942540276237926e-06,
"loss": 0.0752,
"step": 673
},
{
"epoch": 0.5384461753545037,
"grad_norm": 0.7044859562275388,
"learning_rate": 2.38764922619495e-06,
"loss": 0.0757,
"step": 674
},
{
"epoch": 0.5392450569203115,
"grad_norm": 0.7252744856596492,
"learning_rate": 2.3810452104406444e-06,
"loss": 0.0867,
"step": 675
},
{
"epoch": 0.5400439384861194,
"grad_norm": 0.7614651844151401,
"learning_rate": 2.3744420265430877e-06,
"loss": 0.0764,
"step": 676
},
{
"epoch": 0.5408428200519273,
"grad_norm": 0.6892639381460605,
"learning_rate": 2.3678397206786717e-06,
"loss": 0.0803,
"step": 677
},
{
"epoch": 0.5416417016177352,
"grad_norm": 0.7211223831734932,
"learning_rate": 2.3612383390176503e-06,
"loss": 0.0731,
"step": 678
},
{
"epoch": 0.542440583183543,
"grad_norm": 0.7664217487678445,
"learning_rate": 2.3546379277238107e-06,
"loss": 0.0846,
"step": 679
},
{
"epoch": 0.5432394647493509,
"grad_norm": 0.7047009823702027,
"learning_rate": 2.3480385329541587e-06,
"loss": 0.0799,
"step": 680
},
{
"epoch": 0.5440383463151588,
"grad_norm": 0.6821519058187036,
"learning_rate": 2.341440200858589e-06,
"loss": 0.0836,
"step": 681
},
{
"epoch": 0.5448372278809667,
"grad_norm": 0.7121462307791131,
"learning_rate": 2.334842977579565e-06,
"loss": 0.0769,
"step": 682
},
{
"epoch": 0.5456361094467745,
"grad_norm": 0.6546815768203035,
"learning_rate": 2.3282469092517977e-06,
"loss": 0.0735,
"step": 683
},
{
"epoch": 0.5464349910125824,
"grad_norm": 0.6652984324823661,
"learning_rate": 2.3216520420019194e-06,
"loss": 0.0631,
"step": 684
},
{
"epoch": 0.5472338725783903,
"grad_norm": 0.6826190647506108,
"learning_rate": 2.3150584219481644e-06,
"loss": 0.0772,
"step": 685
},
{
"epoch": 0.5480327541441982,
"grad_norm": 0.7082080750495795,
"learning_rate": 2.3084660952000446e-06,
"loss": 0.083,
"step": 686
},
{
"epoch": 0.548831635710006,
"grad_norm": 0.6804378348146185,
"learning_rate": 2.3018751078580287e-06,
"loss": 0.0764,
"step": 687
},
{
"epoch": 0.5496305172758139,
"grad_norm": 0.6949030543943242,
"learning_rate": 2.2952855060132192e-06,
"loss": 0.0736,
"step": 688
},
{
"epoch": 0.5504293988416217,
"grad_norm": 0.767211454774038,
"learning_rate": 2.288697335747027e-06,
"loss": 0.0782,
"step": 689
},
{
"epoch": 0.5512282804074295,
"grad_norm": 0.6859601195405323,
"learning_rate": 2.2821106431308546e-06,
"loss": 0.0674,
"step": 690
},
{
"epoch": 0.5520271619732374,
"grad_norm": 0.857452027423427,
"learning_rate": 2.275525474225771e-06,
"loss": 0.0913,
"step": 691
},
{
"epoch": 0.5528260435390453,
"grad_norm": 0.7934337014749047,
"learning_rate": 2.2689418750821893e-06,
"loss": 0.0801,
"step": 692
},
{
"epoch": 0.5536249251048532,
"grad_norm": 0.7855467335399277,
"learning_rate": 2.262359891739544e-06,
"loss": 0.0827,
"step": 693
},
{
"epoch": 0.554423806670661,
"grad_norm": 0.7328485240790014,
"learning_rate": 2.2557795702259717e-06,
"loss": 0.0757,
"step": 694
},
{
"epoch": 0.5552226882364689,
"grad_norm": 0.7502799227901668,
"learning_rate": 2.2492009565579877e-06,
"loss": 0.0798,
"step": 695
},
{
"epoch": 0.5560215698022768,
"grad_norm": 0.6942830722487661,
"learning_rate": 2.242624096740164e-06,
"loss": 0.0774,
"step": 696
},
{
"epoch": 0.5568204513680847,
"grad_norm": 0.6930784219010419,
"learning_rate": 2.2360490367648083e-06,
"loss": 0.0782,
"step": 697
},
{
"epoch": 0.5576193329338925,
"grad_norm": 0.7163999480603681,
"learning_rate": 2.2294758226116397e-06,
"loss": 0.0798,
"step": 698
},
{
"epoch": 0.5584182144997004,
"grad_norm": 0.734750686150335,
"learning_rate": 2.2229045002474727e-06,
"loss": 0.0796,
"step": 699
},
{
"epoch": 0.5592170960655083,
"grad_norm": 0.766514584513075,
"learning_rate": 2.21633511562589e-06,
"loss": 0.0798,
"step": 700
},
{
"epoch": 0.5600159776313162,
"grad_norm": 0.68052355751497,
"learning_rate": 2.2097677146869242e-06,
"loss": 0.0775,
"step": 701
},
{
"epoch": 0.560814859197124,
"grad_norm": 0.7544498594904364,
"learning_rate": 2.2032023433567377e-06,
"loss": 0.0842,
"step": 702
},
{
"epoch": 0.5616137407629319,
"grad_norm": 0.7664316105448243,
"learning_rate": 2.1966390475472957e-06,
"loss": 0.0776,
"step": 703
},
{
"epoch": 0.5624126223287398,
"grad_norm": 0.7004685245151888,
"learning_rate": 2.190077873156053e-06,
"loss": 0.0736,
"step": 704
},
{
"epoch": 0.5632115038945477,
"grad_norm": 0.6859491595173901,
"learning_rate": 2.183518866065627e-06,
"loss": 0.0715,
"step": 705
},
{
"epoch": 0.5640103854603555,
"grad_norm": 0.7118232103440043,
"learning_rate": 2.1769620721434816e-06,
"loss": 0.0708,
"step": 706
},
{
"epoch": 0.5648092670261634,
"grad_norm": 0.748576065988089,
"learning_rate": 2.1704075372415993e-06,
"loss": 0.0722,
"step": 707
},
{
"epoch": 0.5656081485919713,
"grad_norm": 0.8434443127714014,
"learning_rate": 2.1638553071961706e-06,
"loss": 0.0786,
"step": 708
},
{
"epoch": 0.5664070301577792,
"grad_norm": 0.7026564190808876,
"learning_rate": 2.157305427827264e-06,
"loss": 0.0736,
"step": 709
},
{
"epoch": 0.567205911723587,
"grad_norm": 0.7639438616377601,
"learning_rate": 2.1507579449385122e-06,
"loss": 0.0838,
"step": 710
},
{
"epoch": 0.5680047932893948,
"grad_norm": 0.817227335374872,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.0814,
"step": 711
},
{
"epoch": 0.5688036748552027,
"grad_norm": 0.8785656644132279,
"learning_rate": 2.1376703517318835e-06,
"loss": 0.0798,
"step": 712
},
{
"epoch": 0.5696025564210105,
"grad_norm": 0.7552630348609908,
"learning_rate": 2.131130332936195e-06,
"loss": 0.0807,
"step": 713
},
{
"epoch": 0.5704014379868184,
"grad_norm": 0.7272052290720102,
"learning_rate": 2.124592893664399e-06,
"loss": 0.0716,
"step": 714
},
{
"epoch": 0.5712003195526263,
"grad_norm": 0.7859800162478867,
"learning_rate": 2.1180580796331327e-06,
"loss": 0.0731,
"step": 715
},
{
"epoch": 0.5719992011184342,
"grad_norm": 0.7161265041455283,
"learning_rate": 2.1115259365406748e-06,
"loss": 0.0693,
"step": 716
},
{
"epoch": 0.572798082684242,
"grad_norm": 0.7447936254123377,
"learning_rate": 2.1049965100666255e-06,
"loss": 0.0715,
"step": 717
},
{
"epoch": 0.5735969642500499,
"grad_norm": 0.6911997124068688,
"learning_rate": 2.098469845871589e-06,
"loss": 0.0765,
"step": 718
},
{
"epoch": 0.5743958458158578,
"grad_norm": 0.7134320614605544,
"learning_rate": 2.0919459895968517e-06,
"loss": 0.0728,
"step": 719
},
{
"epoch": 0.5751947273816657,
"grad_norm": 0.7691509712775914,
"learning_rate": 2.0854249868640653e-06,
"loss": 0.0756,
"step": 720
},
{
"epoch": 0.5759936089474735,
"grad_norm": 0.8006755711473525,
"learning_rate": 2.0789068832749242e-06,
"loss": 0.0785,
"step": 721
},
{
"epoch": 0.5767924905132814,
"grad_norm": 0.697757987372097,
"learning_rate": 2.0723917244108503e-06,
"loss": 0.0786,
"step": 722
},
{
"epoch": 0.5775913720790893,
"grad_norm": 0.7270865955123159,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.0712,
"step": 723
},
{
"epoch": 0.5783902536448972,
"grad_norm": 0.7385054287885843,
"learning_rate": 2.059370423080313e-06,
"loss": 0.0721,
"step": 724
},
{
"epoch": 0.579189135210705,
"grad_norm": 0.7038344544129048,
"learning_rate": 2.0528643716724572e-06,
"loss": 0.0732,
"step": 725
},
{
"epoch": 0.5799880167765129,
"grad_norm": 0.6790065697614551,
"learning_rate": 2.046361447106244e-06,
"loss": 0.0765,
"step": 726
},
{
"epoch": 0.5807868983423208,
"grad_norm": 0.7507492177887934,
"learning_rate": 2.0398616948569495e-06,
"loss": 0.0823,
"step": 727
},
{
"epoch": 0.5815857799081287,
"grad_norm": 0.7816194240267051,
"learning_rate": 2.0333651603776633e-06,
"loss": 0.0818,
"step": 728
},
{
"epoch": 0.5823846614739365,
"grad_norm": 0.6962266578054711,
"learning_rate": 2.0268718890989754e-06,
"loss": 0.0737,
"step": 729
},
{
"epoch": 0.5831835430397444,
"grad_norm": 0.6988143771822519,
"learning_rate": 2.0203819264286512e-06,
"loss": 0.0691,
"step": 730
},
{
"epoch": 0.5839824246055523,
"grad_norm": 0.7030353986937218,
"learning_rate": 2.013895317751323e-06,
"loss": 0.0785,
"step": 731
},
{
"epoch": 0.5847813061713601,
"grad_norm": 0.6909853071674142,
"learning_rate": 2.007412108428168e-06,
"loss": 0.0675,
"step": 732
},
{
"epoch": 0.5855801877371679,
"grad_norm": 0.8305603761683259,
"learning_rate": 2.00093234379659e-06,
"loss": 0.0928,
"step": 733
},
{
"epoch": 0.5863790693029758,
"grad_norm": 0.7351496762893693,
"learning_rate": 1.994456069169906e-06,
"loss": 0.086,
"step": 734
},
{
"epoch": 0.5871779508687837,
"grad_norm": 0.7255003573385873,
"learning_rate": 1.987983329837024e-06,
"loss": 0.0721,
"step": 735
},
{
"epoch": 0.5879768324345915,
"grad_norm": 0.7135520451688296,
"learning_rate": 1.9815141710621323e-06,
"loss": 0.072,
"step": 736
},
{
"epoch": 0.5887757140003994,
"grad_norm": 0.6674729499297173,
"learning_rate": 1.975048638084379e-06,
"loss": 0.0667,
"step": 737
},
{
"epoch": 0.5895745955662073,
"grad_norm": 0.7056694883980723,
"learning_rate": 1.9685867761175584e-06,
"loss": 0.0734,
"step": 738
},
{
"epoch": 0.5903734771320152,
"grad_norm": 0.7122399768693654,
"learning_rate": 1.9621286303497917e-06,
"loss": 0.08,
"step": 739
},
{
"epoch": 0.591172358697823,
"grad_norm": 0.7093962111319365,
"learning_rate": 1.9556742459432117e-06,
"loss": 0.0719,
"step": 740
},
{
"epoch": 0.5919712402636309,
"grad_norm": 0.6852115591601023,
"learning_rate": 1.9492236680336486e-06,
"loss": 0.0705,
"step": 741
},
{
"epoch": 0.5927701218294388,
"grad_norm": 0.6325866257954883,
"learning_rate": 1.9427769417303156e-06,
"loss": 0.0643,
"step": 742
},
{
"epoch": 0.5935690033952467,
"grad_norm": 0.7204075648734956,
"learning_rate": 1.9363341121154896e-06,
"loss": 0.0757,
"step": 743
},
{
"epoch": 0.5943678849610545,
"grad_norm": 0.8149667568559875,
"learning_rate": 1.929895224244197e-06,
"loss": 0.0773,
"step": 744
},
{
"epoch": 0.5951667665268624,
"grad_norm": 0.8545046907554561,
"learning_rate": 1.9234603231439e-06,
"loss": 0.081,
"step": 745
},
{
"epoch": 0.5959656480926703,
"grad_norm": 0.6657089521252548,
"learning_rate": 1.9170294538141825e-06,
"loss": 0.0711,
"step": 746
},
{
"epoch": 0.5967645296584781,
"grad_norm": 0.6978460546960862,
"learning_rate": 1.9106026612264316e-06,
"loss": 0.0762,
"step": 747
},
{
"epoch": 0.597563411224286,
"grad_norm": 0.7011638489461521,
"learning_rate": 1.9041799903235297e-06,
"loss": 0.0675,
"step": 748
},
{
"epoch": 0.5983622927900939,
"grad_norm": 0.6939104019221446,
"learning_rate": 1.8977614860195297e-06,
"loss": 0.0711,
"step": 749
},
{
"epoch": 0.5991611743559018,
"grad_norm": 0.6869972949250588,
"learning_rate": 1.891347193199353e-06,
"loss": 0.0727,
"step": 750
},
{
"epoch": 0.5999600559217096,
"grad_norm": 0.680739504999969,
"learning_rate": 1.8849371567184665e-06,
"loss": 0.0674,
"step": 751
},
{
"epoch": 0.6007589374875175,
"grad_norm": 0.6915313764434776,
"learning_rate": 1.8785314214025747e-06,
"loss": 0.0671,
"step": 752
},
{
"epoch": 0.6015578190533254,
"grad_norm": 0.7851580548586923,
"learning_rate": 1.8721300320473023e-06,
"loss": 0.0849,
"step": 753
},
{
"epoch": 0.6023567006191333,
"grad_norm": 0.8202601700256574,
"learning_rate": 1.8657330334178825e-06,
"loss": 0.0842,
"step": 754
},
{
"epoch": 0.603155582184941,
"grad_norm": 0.7717659960029647,
"learning_rate": 1.8593404702488439e-06,
"loss": 0.0817,
"step": 755
},
{
"epoch": 0.6039544637507489,
"grad_norm": 0.679886045166462,
"learning_rate": 1.852952387243698e-06,
"loss": 0.0712,
"step": 756
},
{
"epoch": 0.6047533453165568,
"grad_norm": 0.7584532497859835,
"learning_rate": 1.8465688290746282e-06,
"loss": 0.072,
"step": 757
},
{
"epoch": 0.6055522268823647,
"grad_norm": 0.8239865399284856,
"learning_rate": 1.8401898403821713e-06,
"loss": 0.0722,
"step": 758
},
{
"epoch": 0.6063511084481725,
"grad_norm": 0.7143969459985934,
"learning_rate": 1.833815465774913e-06,
"loss": 0.0781,
"step": 759
},
{
"epoch": 0.6071499900139804,
"grad_norm": 0.73089086118113,
"learning_rate": 1.8274457498291708e-06,
"loss": 0.0701,
"step": 760
},
{
"epoch": 0.6079488715797883,
"grad_norm": 0.7070920900790865,
"learning_rate": 1.8210807370886851e-06,
"loss": 0.0763,
"step": 761
},
{
"epoch": 0.6087477531455961,
"grad_norm": 0.744531738394602,
"learning_rate": 1.8147204720643066e-06,
"loss": 0.0811,
"step": 762
},
{
"epoch": 0.609546634711404,
"grad_norm": 0.7196531438853129,
"learning_rate": 1.8083649992336827e-06,
"loss": 0.074,
"step": 763
},
{
"epoch": 0.6103455162772119,
"grad_norm": 0.6935193915987503,
"learning_rate": 1.8020143630409508e-06,
"loss": 0.065,
"step": 764
},
{
"epoch": 0.6111443978430198,
"grad_norm": 0.7035531928233766,
"learning_rate": 1.7956686078964257e-06,
"loss": 0.0653,
"step": 765
},
{
"epoch": 0.6119432794088276,
"grad_norm": 0.716461549347938,
"learning_rate": 1.7893277781762874e-06,
"loss": 0.0763,
"step": 766
},
{
"epoch": 0.6127421609746355,
"grad_norm": 0.7124602838364704,
"learning_rate": 1.7829919182222752e-06,
"loss": 0.0695,
"step": 767
},
{
"epoch": 0.6135410425404434,
"grad_norm": 0.6532144524523916,
"learning_rate": 1.7766610723413686e-06,
"loss": 0.0709,
"step": 768
},
{
"epoch": 0.6143399241062513,
"grad_norm": 0.7262090328751356,
"learning_rate": 1.7703352848054888e-06,
"loss": 0.0729,
"step": 769
},
{
"epoch": 0.6151388056720591,
"grad_norm": 0.769557941797719,
"learning_rate": 1.7640145998511827e-06,
"loss": 0.0755,
"step": 770
},
{
"epoch": 0.615937687237867,
"grad_norm": 0.7645534014831973,
"learning_rate": 1.7576990616793139e-06,
"loss": 0.0864,
"step": 771
},
{
"epoch": 0.6167365688036749,
"grad_norm": 0.6820825425481823,
"learning_rate": 1.7513887144547523e-06,
"loss": 0.0774,
"step": 772
},
{
"epoch": 0.6175354503694828,
"grad_norm": 0.7137728940600104,
"learning_rate": 1.7450836023060714e-06,
"loss": 0.0776,
"step": 773
},
{
"epoch": 0.6183343319352906,
"grad_norm": 0.7428801445111053,
"learning_rate": 1.738783769325233e-06,
"loss": 0.0809,
"step": 774
},
{
"epoch": 0.6191332135010985,
"grad_norm": 0.7006322216060636,
"learning_rate": 1.7324892595672807e-06,
"loss": 0.0715,
"step": 775
},
{
"epoch": 0.6199320950669064,
"grad_norm": 0.7084563324616765,
"learning_rate": 1.726200117050036e-06,
"loss": 0.072,
"step": 776
},
{
"epoch": 0.6207309766327141,
"grad_norm": 0.6783579971505013,
"learning_rate": 1.7199163857537825e-06,
"loss": 0.0752,
"step": 777
},
{
"epoch": 0.621529858198522,
"grad_norm": 0.7378250263764751,
"learning_rate": 1.7136381096209665e-06,
"loss": 0.0782,
"step": 778
},
{
"epoch": 0.6223287397643299,
"grad_norm": 0.7384439686090273,
"learning_rate": 1.7073653325558831e-06,
"loss": 0.0752,
"step": 779
},
{
"epoch": 0.6231276213301378,
"grad_norm": 0.7462193513330629,
"learning_rate": 1.7010980984243756e-06,
"loss": 0.0695,
"step": 780
},
{
"epoch": 0.6239265028959456,
"grad_norm": 0.7130619553135901,
"learning_rate": 1.694836451053522e-06,
"loss": 0.0744,
"step": 781
},
{
"epoch": 0.6247253844617535,
"grad_norm": 0.7748719793441474,
"learning_rate": 1.6885804342313334e-06,
"loss": 0.0805,
"step": 782
},
{
"epoch": 0.6255242660275614,
"grad_norm": 0.6944961145847144,
"learning_rate": 1.6823300917064462e-06,
"loss": 0.0684,
"step": 783
},
{
"epoch": 0.6263231475933693,
"grad_norm": 0.6865884916668862,
"learning_rate": 1.6760854671878158e-06,
"loss": 0.0737,
"step": 784
},
{
"epoch": 0.6271220291591771,
"grad_norm": 0.74800317438897,
"learning_rate": 1.6698466043444122e-06,
"loss": 0.0669,
"step": 785
},
{
"epoch": 0.627920910724985,
"grad_norm": 0.7581085007620311,
"learning_rate": 1.6636135468049122e-06,
"loss": 0.0784,
"step": 786
},
{
"epoch": 0.6287197922907929,
"grad_norm": 0.7494255172335546,
"learning_rate": 1.6573863381573957e-06,
"loss": 0.0829,
"step": 787
},
{
"epoch": 0.6295186738566008,
"grad_norm": 0.6879131849166294,
"learning_rate": 1.651165021949044e-06,
"loss": 0.0744,
"step": 788
},
{
"epoch": 0.6303175554224086,
"grad_norm": 0.703349951642232,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.0759,
"step": 789
},
{
"epoch": 0.6311164369882165,
"grad_norm": 0.6929744305760658,
"learning_rate": 1.6387402408322128e-06,
"loss": 0.0731,
"step": 790
},
{
"epoch": 0.6319153185540244,
"grad_norm": 0.7454533302688348,
"learning_rate": 1.6325368628108442e-06,
"loss": 0.0758,
"step": 791
},
{
"epoch": 0.6327142001198323,
"grad_norm": 0.742041894343771,
"learning_rate": 1.6263395510022546e-06,
"loss": 0.07,
"step": 792
},
{
"epoch": 0.6335130816856401,
"grad_norm": 0.6948013730334685,
"learning_rate": 1.620148348744552e-06,
"loss": 0.0766,
"step": 793
},
{
"epoch": 0.634311963251448,
"grad_norm": 0.659586046661146,
"learning_rate": 1.613963299333122e-06,
"loss": 0.0655,
"step": 794
},
{
"epoch": 0.6351108448172559,
"grad_norm": 0.6773851759495868,
"learning_rate": 1.6077844460203207e-06,
"loss": 0.0752,
"step": 795
},
{
"epoch": 0.6359097263830638,
"grad_norm": 0.7605812887772487,
"learning_rate": 1.6016118320151775e-06,
"loss": 0.0788,
"step": 796
},
{
"epoch": 0.6367086079488716,
"grad_norm": 0.7086132798372856,
"learning_rate": 1.595445500483088e-06,
"loss": 0.0715,
"step": 797
},
{
"epoch": 0.6375074895146795,
"grad_norm": 0.7633813224754403,
"learning_rate": 1.589285494545514e-06,
"loss": 0.0795,
"step": 798
},
{
"epoch": 0.6383063710804873,
"grad_norm": 0.7913546925027637,
"learning_rate": 1.583131857279685e-06,
"loss": 0.0763,
"step": 799
},
{
"epoch": 0.6391052526462951,
"grad_norm": 0.6935716177124382,
"learning_rate": 1.5769846317182894e-06,
"loss": 0.0709,
"step": 800
},
{
"epoch": 0.639904134212103,
"grad_norm": 0.676940355586407,
"learning_rate": 1.5708438608491816e-06,
"loss": 0.0709,
"step": 801
},
{
"epoch": 0.6407030157779109,
"grad_norm": 0.7665289081799466,
"learning_rate": 1.564709587615077e-06,
"loss": 0.0708,
"step": 802
},
{
"epoch": 0.6415018973437188,
"grad_norm": 0.7660458120935258,
"learning_rate": 1.5585818549132532e-06,
"loss": 0.0683,
"step": 803
},
{
"epoch": 0.6423007789095266,
"grad_norm": 0.8084359075182547,
"learning_rate": 1.5524607055952495e-06,
"loss": 0.07,
"step": 804
},
{
"epoch": 0.6430996604753345,
"grad_norm": 0.6834537386319198,
"learning_rate": 1.546346182466566e-06,
"loss": 0.0696,
"step": 805
},
{
"epoch": 0.6438985420411424,
"grad_norm": 0.6882281367042749,
"learning_rate": 1.540238328286366e-06,
"loss": 0.0715,
"step": 806
},
{
"epoch": 0.6446974236069503,
"grad_norm": 0.7798427307031544,
"learning_rate": 1.5341371857671782e-06,
"loss": 0.0816,
"step": 807
},
{
"epoch": 0.6454963051727581,
"grad_norm": 0.7749157428603176,
"learning_rate": 1.528042797574596e-06,
"loss": 0.072,
"step": 808
},
{
"epoch": 0.646295186738566,
"grad_norm": 0.7503425799671355,
"learning_rate": 1.521955206326976e-06,
"loss": 0.0682,
"step": 809
},
{
"epoch": 0.6470940683043739,
"grad_norm": 0.7353119596263072,
"learning_rate": 1.5158744545951468e-06,
"loss": 0.074,
"step": 810
},
{
"epoch": 0.6478929498701818,
"grad_norm": 0.7031039234622585,
"learning_rate": 1.509800584902108e-06,
"loss": 0.069,
"step": 811
},
{
"epoch": 0.6486918314359896,
"grad_norm": 0.7552246369614947,
"learning_rate": 1.5037336397227315e-06,
"loss": 0.0695,
"step": 812
},
{
"epoch": 0.6494907130017975,
"grad_norm": 0.795857866513945,
"learning_rate": 1.4976736614834664e-06,
"loss": 0.07,
"step": 813
},
{
"epoch": 0.6502895945676054,
"grad_norm": 0.6939165529698337,
"learning_rate": 1.4916206925620402e-06,
"loss": 0.0698,
"step": 814
},
{
"epoch": 0.6510884761334133,
"grad_norm": 0.7274411868739028,
"learning_rate": 1.4855747752871659e-06,
"loss": 0.0667,
"step": 815
},
{
"epoch": 0.6518873576992211,
"grad_norm": 0.6874520685052125,
"learning_rate": 1.479535951938243e-06,
"loss": 0.0672,
"step": 816
},
{
"epoch": 0.652686239265029,
"grad_norm": 0.7012426759740858,
"learning_rate": 1.4735042647450622e-06,
"loss": 0.0655,
"step": 817
},
{
"epoch": 0.6534851208308369,
"grad_norm": 0.7406300636188733,
"learning_rate": 1.4674797558875134e-06,
"loss": 0.0744,
"step": 818
},
{
"epoch": 0.6542840023966447,
"grad_norm": 0.7259284513518274,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.0672,
"step": 819
},
{
"epoch": 0.6550828839624525,
"grad_norm": 0.7178372562662225,
"learning_rate": 1.4554524416475718e-06,
"loss": 0.0781,
"step": 820
},
{
"epoch": 0.6558817655282604,
"grad_norm": 0.7020878145356227,
"learning_rate": 1.4494497203727845e-06,
"loss": 0.0667,
"step": 821
},
{
"epoch": 0.6566806470940683,
"grad_norm": 0.7464350415019525,
"learning_rate": 1.443454345648252e-06,
"loss": 0.0702,
"step": 822
},
{
"epoch": 0.6574795286598761,
"grad_norm": 0.722737579725714,
"learning_rate": 1.4374663593999258e-06,
"loss": 0.0746,
"step": 823
},
{
"epoch": 0.658278410225684,
"grad_norm": 0.7740457479905273,
"learning_rate": 1.4314858035020905e-06,
"loss": 0.0733,
"step": 824
},
{
"epoch": 0.6590772917914919,
"grad_norm": 0.7125957730795915,
"learning_rate": 1.425512719777071e-06,
"loss": 0.0783,
"step": 825
},
{
"epoch": 0.6598761733572998,
"grad_norm": 0.6616100002029853,
"learning_rate": 1.4195471499949384e-06,
"loss": 0.0693,
"step": 826
},
{
"epoch": 0.6606750549231076,
"grad_norm": 0.6565339471175381,
"learning_rate": 1.4135891358732206e-06,
"loss": 0.0612,
"step": 827
},
{
"epoch": 0.6614739364889155,
"grad_norm": 0.6514137764011035,
"learning_rate": 1.4076387190766017e-06,
"loss": 0.0611,
"step": 828
},
{
"epoch": 0.6622728180547234,
"grad_norm": 0.770350092313725,
"learning_rate": 1.401695941216644e-06,
"loss": 0.0839,
"step": 829
},
{
"epoch": 0.6630716996205313,
"grad_norm": 0.6938098303809659,
"learning_rate": 1.3957608438514877e-06,
"loss": 0.069,
"step": 830
},
{
"epoch": 0.6638705811863391,
"grad_norm": 0.7265480986058926,
"learning_rate": 1.3898334684855647e-06,
"loss": 0.0749,
"step": 831
},
{
"epoch": 0.664669462752147,
"grad_norm": 0.7054094782275767,
"learning_rate": 1.3839138565693043e-06,
"loss": 0.0756,
"step": 832
},
{
"epoch": 0.6654683443179549,
"grad_norm": 0.6829270622586088,
"learning_rate": 1.3780020494988447e-06,
"loss": 0.0709,
"step": 833
},
{
"epoch": 0.6662672258837627,
"grad_norm": 0.7504241485173638,
"learning_rate": 1.372098088615749e-06,
"loss": 0.0791,
"step": 834
},
{
"epoch": 0.6670661074495706,
"grad_norm": 0.6924213329453016,
"learning_rate": 1.3662020152067063e-06,
"loss": 0.0765,
"step": 835
},
{
"epoch": 0.6678649890153785,
"grad_norm": 0.7087521677704813,
"learning_rate": 1.3603138705032526e-06,
"loss": 0.0749,
"step": 836
},
{
"epoch": 0.6686638705811864,
"grad_norm": 0.705323244510612,
"learning_rate": 1.354433695681474e-06,
"loss": 0.0752,
"step": 837
},
{
"epoch": 0.6694627521469942,
"grad_norm": 0.6616264857193134,
"learning_rate": 1.3485615318617277e-06,
"loss": 0.0672,
"step": 838
},
{
"epoch": 0.6702616337128021,
"grad_norm": 0.7512713777570923,
"learning_rate": 1.342697420108344e-06,
"loss": 0.0713,
"step": 839
},
{
"epoch": 0.67106051527861,
"grad_norm": 0.669997265805606,
"learning_rate": 1.3368414014293485e-06,
"loss": 0.0637,
"step": 840
},
{
"epoch": 0.6718593968444179,
"grad_norm": 0.7096381426024272,
"learning_rate": 1.3309935167761717e-06,
"loss": 0.0659,
"step": 841
},
{
"epoch": 0.6726582784102256,
"grad_norm": 0.6742198646615256,
"learning_rate": 1.3251538070433605e-06,
"loss": 0.0728,
"step": 842
},
{
"epoch": 0.6734571599760335,
"grad_norm": 0.6805614755628161,
"learning_rate": 1.3193223130682937e-06,
"loss": 0.0625,
"step": 843
},
{
"epoch": 0.6742560415418414,
"grad_norm": 0.748759021598399,
"learning_rate": 1.313499075630899e-06,
"loss": 0.0702,
"step": 844
},
{
"epoch": 0.6750549231076493,
"grad_norm": 0.7274840234475171,
"learning_rate": 1.3076841354533658e-06,
"loss": 0.0658,
"step": 845
},
{
"epoch": 0.6758538046734571,
"grad_norm": 0.6791766672113594,
"learning_rate": 1.301877533199859e-06,
"loss": 0.065,
"step": 846
},
{
"epoch": 0.676652686239265,
"grad_norm": 0.7002421668580668,
"learning_rate": 1.2960793094762347e-06,
"loss": 0.0679,
"step": 847
},
{
"epoch": 0.6774515678050729,
"grad_norm": 0.7623718570311951,
"learning_rate": 1.2902895048297602e-06,
"loss": 0.0751,
"step": 848
},
{
"epoch": 0.6782504493708807,
"grad_norm": 0.7237394593375889,
"learning_rate": 1.2845081597488288e-06,
"loss": 0.07,
"step": 849
},
{
"epoch": 0.6790493309366886,
"grad_norm": 0.787447112328663,
"learning_rate": 1.2787353146626715e-06,
"loss": 0.0786,
"step": 850
},
{
"epoch": 0.6798482125024965,
"grad_norm": 0.6916026537019957,
"learning_rate": 1.2729710099410802e-06,
"loss": 0.0673,
"step": 851
},
{
"epoch": 0.6806470940683044,
"grad_norm": 0.7256469081673769,
"learning_rate": 1.2672152858941244e-06,
"loss": 0.0753,
"step": 852
},
{
"epoch": 0.6814459756341122,
"grad_norm": 0.6749009978752328,
"learning_rate": 1.2614681827718697e-06,
"loss": 0.0689,
"step": 853
},
{
"epoch": 0.6822448571999201,
"grad_norm": 0.7376514885118693,
"learning_rate": 1.255729740764091e-06,
"loss": 0.0761,
"step": 854
},
{
"epoch": 0.683043738765728,
"grad_norm": 0.6933564376107897,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.0682,
"step": 855
},
{
"epoch": 0.6838426203315359,
"grad_norm": 0.6933674935192211,
"learning_rate": 1.2442790005479576e-06,
"loss": 0.0693,
"step": 856
},
{
"epoch": 0.6846415018973437,
"grad_norm": 0.7682302900116749,
"learning_rate": 1.2385667824151972e-06,
"loss": 0.0722,
"step": 857
},
{
"epoch": 0.6854403834631516,
"grad_norm": 0.6524546380366343,
"learning_rate": 1.232863385547543e-06,
"loss": 0.0677,
"step": 858
},
{
"epoch": 0.6862392650289595,
"grad_norm": 0.6905473772407033,
"learning_rate": 1.2271688498291335e-06,
"loss": 0.0755,
"step": 859
},
{
"epoch": 0.6870381465947674,
"grad_norm": 0.7156212244362496,
"learning_rate": 1.2214832150821381e-06,
"loss": 0.0721,
"step": 860
},
{
"epoch": 0.6878370281605752,
"grad_norm": 0.6922194723160341,
"learning_rate": 1.2158065210664848e-06,
"loss": 0.0705,
"step": 861
},
{
"epoch": 0.6886359097263831,
"grad_norm": 0.7115096619101899,
"learning_rate": 1.2101388074795747e-06,
"loss": 0.0775,
"step": 862
},
{
"epoch": 0.689434791292191,
"grad_norm": 0.702553187808918,
"learning_rate": 1.2044801139560112e-06,
"loss": 0.074,
"step": 863
},
{
"epoch": 0.6902336728579987,
"grad_norm": 0.7364941389984857,
"learning_rate": 1.1988304800673197e-06,
"loss": 0.0762,
"step": 864
},
{
"epoch": 0.6910325544238066,
"grad_norm": 0.7658537575082363,
"learning_rate": 1.1931899453216698e-06,
"loss": 0.0775,
"step": 865
},
{
"epoch": 0.6918314359896145,
"grad_norm": 0.6908885154530607,
"learning_rate": 1.1875585491636e-06,
"loss": 0.0678,
"step": 866
},
{
"epoch": 0.6926303175554224,
"grad_norm": 0.7042159975428317,
"learning_rate": 1.181936330973744e-06,
"loss": 0.0618,
"step": 867
},
{
"epoch": 0.6934291991212302,
"grad_norm": 0.6852437432679035,
"learning_rate": 1.1763233300685533e-06,
"loss": 0.0695,
"step": 868
},
{
"epoch": 0.6942280806870381,
"grad_norm": 0.7176643894140547,
"learning_rate": 1.1707195857000218e-06,
"loss": 0.0682,
"step": 869
},
{
"epoch": 0.695026962252846,
"grad_norm": 0.6925346322936237,
"learning_rate": 1.16512513705541e-06,
"loss": 0.0688,
"step": 870
},
{
"epoch": 0.6958258438186539,
"grad_norm": 0.7196442480030203,
"learning_rate": 1.159540023256977e-06,
"loss": 0.0805,
"step": 871
},
{
"epoch": 0.6966247253844617,
"grad_norm": 0.7037757813966624,
"learning_rate": 1.1539642833617009e-06,
"loss": 0.0669,
"step": 872
},
{
"epoch": 0.6974236069502696,
"grad_norm": 0.6920780498431013,
"learning_rate": 1.148397956361007e-06,
"loss": 0.068,
"step": 873
},
{
"epoch": 0.6982224885160775,
"grad_norm": 0.7361982589542638,
"learning_rate": 1.1428410811804955e-06,
"loss": 0.0712,
"step": 874
},
{
"epoch": 0.6990213700818854,
"grad_norm": 0.7177168044634765,
"learning_rate": 1.137293696679671e-06,
"loss": 0.0685,
"step": 875
},
{
"epoch": 0.6998202516476932,
"grad_norm": 0.6916265035922443,
"learning_rate": 1.1317558416516696e-06,
"loss": 0.066,
"step": 876
},
{
"epoch": 0.7006191332135011,
"grad_norm": 0.7698316471528456,
"learning_rate": 1.1262275548229852e-06,
"loss": 0.0794,
"step": 877
},
{
"epoch": 0.701418014779309,
"grad_norm": 0.6875784567261796,
"learning_rate": 1.120708874853203e-06,
"loss": 0.0718,
"step": 878
},
{
"epoch": 0.7022168963451169,
"grad_norm": 0.7595905093967557,
"learning_rate": 1.1151998403347245e-06,
"loss": 0.0708,
"step": 879
},
{
"epoch": 0.7030157779109247,
"grad_norm": 0.682293145613161,
"learning_rate": 1.1097004897925034e-06,
"loss": 0.0621,
"step": 880
},
{
"epoch": 0.7038146594767326,
"grad_norm": 0.7513445525180765,
"learning_rate": 1.1042108616837693e-06,
"loss": 0.0715,
"step": 881
},
{
"epoch": 0.7046135410425405,
"grad_norm": 0.6956635070524043,
"learning_rate": 1.0987309943977647e-06,
"loss": 0.0659,
"step": 882
},
{
"epoch": 0.7054124226083484,
"grad_norm": 0.7790067433851642,
"learning_rate": 1.0932609262554748e-06,
"loss": 0.0654,
"step": 883
},
{
"epoch": 0.7062113041741562,
"grad_norm": 0.6972694194690369,
"learning_rate": 1.0878006955093566e-06,
"loss": 0.0707,
"step": 884
},
{
"epoch": 0.7070101857399641,
"grad_norm": 0.8285471186520841,
"learning_rate": 1.0823503403430736e-06,
"loss": 0.0701,
"step": 885
},
{
"epoch": 0.7078090673057719,
"grad_norm": 0.6731927224763115,
"learning_rate": 1.076909898871231e-06,
"loss": 0.0611,
"step": 886
},
{
"epoch": 0.7086079488715797,
"grad_norm": 0.8386201804199398,
"learning_rate": 1.0714794091391074e-06,
"loss": 0.0672,
"step": 887
},
{
"epoch": 0.7094068304373876,
"grad_norm": 0.6763207712242757,
"learning_rate": 1.0660589091223854e-06,
"loss": 0.0669,
"step": 888
},
{
"epoch": 0.7102057120031955,
"grad_norm": 0.709984687452426,
"learning_rate": 1.0606484367268906e-06,
"loss": 0.0645,
"step": 889
},
{
"epoch": 0.7110045935690034,
"grad_norm": 0.6914376370508385,
"learning_rate": 1.0552480297883254e-06,
"loss": 0.0659,
"step": 890
},
{
"epoch": 0.7118034751348112,
"grad_norm": 0.6934501764327877,
"learning_rate": 1.049857726072005e-06,
"loss": 0.071,
"step": 891
},
{
"epoch": 0.7126023567006191,
"grad_norm": 0.6859987351243448,
"learning_rate": 1.0444775632725893e-06,
"loss": 0.0626,
"step": 892
},
{
"epoch": 0.713401238266427,
"grad_norm": 0.7094722335398237,
"learning_rate": 1.0391075790138234e-06,
"loss": 0.065,
"step": 893
},
{
"epoch": 0.7142001198322349,
"grad_norm": 0.7231364811936231,
"learning_rate": 1.0337478108482742e-06,
"loss": 0.0677,
"step": 894
},
{
"epoch": 0.7149990013980427,
"grad_norm": 0.6822025978665478,
"learning_rate": 1.0283982962570683e-06,
"loss": 0.0686,
"step": 895
},
{
"epoch": 0.7157978829638506,
"grad_norm": 0.7410985657955034,
"learning_rate": 1.0230590726496247e-06,
"loss": 0.0726,
"step": 896
},
{
"epoch": 0.7165967645296585,
"grad_norm": 0.8311400386801739,
"learning_rate": 1.0177301773633993e-06,
"loss": 0.0652,
"step": 897
},
{
"epoch": 0.7173956460954664,
"grad_norm": 0.7744302013134797,
"learning_rate": 1.0124116476636218e-06,
"loss": 0.0704,
"step": 898
},
{
"epoch": 0.7181945276612742,
"grad_norm": 0.771110146977676,
"learning_rate": 1.0071035207430352e-06,
"loss": 0.0777,
"step": 899
},
{
"epoch": 0.7189934092270821,
"grad_norm": 0.7051521238306805,
"learning_rate": 1.0018058337216327e-06,
"loss": 0.0717,
"step": 900
},
{
"epoch": 0.71979229079289,
"grad_norm": 0.69799351583451,
"learning_rate": 9.965186236464047e-07,
"loss": 0.074,
"step": 901
},
{
"epoch": 0.7205911723586979,
"grad_norm": 0.6916047989468586,
"learning_rate": 9.912419274910717e-07,
"loss": 0.0757,
"step": 902
},
{
"epoch": 0.7213900539245057,
"grad_norm": 0.7923334873265707,
"learning_rate": 9.85975782155834e-07,
"loss": 0.0673,
"step": 903
},
{
"epoch": 0.7221889354903136,
"grad_norm": 0.7968420285846334,
"learning_rate": 9.807202244671049e-07,
"loss": 0.0707,
"step": 904
},
{
"epoch": 0.7229878170561215,
"grad_norm": 0.7609740552798956,
"learning_rate": 9.754752911772616e-07,
"loss": 0.0661,
"step": 905
},
{
"epoch": 0.7237866986219293,
"grad_norm": 0.7002229457651694,
"learning_rate": 9.702410189643838e-07,
"loss": 0.0761,
"step": 906
},
{
"epoch": 0.7245855801877372,
"grad_norm": 0.6715239067203957,
"learning_rate": 9.650174444319957e-07,
"loss": 0.0643,
"step": 907
},
{
"epoch": 0.725384461753545,
"grad_norm": 0.7107880210572147,
"learning_rate": 9.598046041088127e-07,
"loss": 0.0698,
"step": 908
},
{
"epoch": 0.7261833433193529,
"grad_norm": 0.7691864270821177,
"learning_rate": 9.546025344484868e-07,
"loss": 0.0738,
"step": 909
},
{
"epoch": 0.7269822248851607,
"grad_norm": 0.8738867314756472,
"learning_rate": 9.494112718293503e-07,
"loss": 0.0765,
"step": 910
},
{
"epoch": 0.7277811064509686,
"grad_norm": 0.7710218454079526,
"learning_rate": 9.442308525541591e-07,
"loss": 0.0707,
"step": 911
},
{
"epoch": 0.7285799880167765,
"grad_norm": 0.7302992785764044,
"learning_rate": 9.390613128498419e-07,
"loss": 0.0719,
"step": 912
},
{
"epoch": 0.7293788695825844,
"grad_norm": 0.724458938096606,
"learning_rate": 9.33902688867247e-07,
"loss": 0.072,
"step": 913
},
{
"epoch": 0.7301777511483922,
"grad_norm": 0.7537364633847227,
"learning_rate": 9.287550166808892e-07,
"loss": 0.0821,
"step": 914
},
{
"epoch": 0.7309766327142001,
"grad_norm": 0.8784951133416654,
"learning_rate": 9.236183322886946e-07,
"loss": 0.0658,
"step": 915
},
{
"epoch": 0.731775514280008,
"grad_norm": 0.7486887717067566,
"learning_rate": 9.184926716117512e-07,
"loss": 0.0682,
"step": 916
},
{
"epoch": 0.7325743958458159,
"grad_norm": 0.806504603566172,
"learning_rate": 9.133780704940595e-07,
"loss": 0.081,
"step": 917
},
{
"epoch": 0.7333732774116237,
"grad_norm": 0.7241082712472302,
"learning_rate": 9.082745647022798e-07,
"loss": 0.0704,
"step": 918
},
{
"epoch": 0.7341721589774316,
"grad_norm": 0.7656468783833981,
"learning_rate": 9.031821899254797e-07,
"loss": 0.0759,
"step": 919
},
{
"epoch": 0.7349710405432395,
"grad_norm": 0.7501681101809219,
"learning_rate": 8.981009817748906e-07,
"loss": 0.064,
"step": 920
},
{
"epoch": 0.7357699221090473,
"grad_norm": 0.7665143868951155,
"learning_rate": 8.930309757836517e-07,
"loss": 0.0782,
"step": 921
},
{
"epoch": 0.7365688036748552,
"grad_norm": 0.7184986991412127,
"learning_rate": 8.879722074065681e-07,
"loss": 0.0671,
"step": 922
},
{
"epoch": 0.7373676852406631,
"grad_norm": 0.7364620852989838,
"learning_rate": 8.829247120198564e-07,
"loss": 0.0655,
"step": 923
},
{
"epoch": 0.738166566806471,
"grad_norm": 0.7004868837398479,
"learning_rate": 8.778885249209044e-07,
"loss": 0.0665,
"step": 924
},
{
"epoch": 0.7389654483722788,
"grad_norm": 0.6917945728795157,
"learning_rate": 8.728636813280164e-07,
"loss": 0.0673,
"step": 925
},
{
"epoch": 0.7397643299380867,
"grad_norm": 0.6869868967472806,
"learning_rate": 8.67850216380175e-07,
"loss": 0.0719,
"step": 926
},
{
"epoch": 0.7405632115038946,
"grad_norm": 0.6947452348575369,
"learning_rate": 8.628481651367876e-07,
"loss": 0.0695,
"step": 927
},
{
"epoch": 0.7413620930697025,
"grad_norm": 0.7189114026034129,
"learning_rate": 8.578575625774476e-07,
"loss": 0.0699,
"step": 928
},
{
"epoch": 0.7421609746355103,
"grad_norm": 0.7220306089561607,
"learning_rate": 8.528784436016879e-07,
"loss": 0.0632,
"step": 929
},
{
"epoch": 0.7429598562013181,
"grad_norm": 0.700585631371324,
"learning_rate": 8.479108430287331e-07,
"loss": 0.0661,
"step": 930
},
{
"epoch": 0.743758737767126,
"grad_norm": 0.7070648082934727,
"learning_rate": 8.4295479559726e-07,
"loss": 0.0691,
"step": 931
},
{
"epoch": 0.7445576193329339,
"grad_norm": 0.6752223726474184,
"learning_rate": 8.380103359651554e-07,
"loss": 0.0654,
"step": 932
},
{
"epoch": 0.7453565008987417,
"grad_norm": 0.7385870850797042,
"learning_rate": 8.330774987092713e-07,
"loss": 0.0665,
"step": 933
},
{
"epoch": 0.7461553824645496,
"grad_norm": 0.7249724054176636,
"learning_rate": 8.281563183251828e-07,
"loss": 0.0753,
"step": 934
},
{
"epoch": 0.7469542640303575,
"grad_norm": 0.7451824876122812,
"learning_rate": 8.23246829226948e-07,
"loss": 0.0671,
"step": 935
},
{
"epoch": 0.7477531455961653,
"grad_norm": 0.7807983611829348,
"learning_rate": 8.183490657468687e-07,
"loss": 0.0757,
"step": 936
},
{
"epoch": 0.7485520271619732,
"grad_norm": 0.7735152540739959,
"learning_rate": 8.134630621352485e-07,
"loss": 0.071,
"step": 937
},
{
"epoch": 0.7493509087277811,
"grad_norm": 0.7090083764371655,
"learning_rate": 8.085888525601526e-07,
"loss": 0.0644,
"step": 938
},
{
"epoch": 0.750149790293589,
"grad_norm": 0.6867898835739341,
"learning_rate": 8.037264711071699e-07,
"loss": 0.0647,
"step": 939
},
{
"epoch": 0.7509486718593968,
"grad_norm": 0.7196342412506768,
"learning_rate": 7.988759517791759e-07,
"loss": 0.073,
"step": 940
},
{
"epoch": 0.7517475534252047,
"grad_norm": 0.7958248682226311,
"learning_rate": 7.940373284960934e-07,
"loss": 0.0771,
"step": 941
},
{
"epoch": 0.7525464349910126,
"grad_norm": 0.6455907886178134,
"learning_rate": 7.892106350946544e-07,
"loss": 0.0566,
"step": 942
},
{
"epoch": 0.7533453165568205,
"grad_norm": 0.730260246281351,
"learning_rate": 7.843959053281663e-07,
"loss": 0.0744,
"step": 943
},
{
"epoch": 0.7541441981226283,
"grad_norm": 0.660307763513869,
"learning_rate": 7.795931728662726e-07,
"loss": 0.0661,
"step": 944
},
{
"epoch": 0.7549430796884362,
"grad_norm": 0.7058124058214891,
"learning_rate": 7.748024712947205e-07,
"loss": 0.0635,
"step": 945
},
{
"epoch": 0.7557419612542441,
"grad_norm": 0.7017538636058471,
"learning_rate": 7.700238341151228e-07,
"loss": 0.0622,
"step": 946
},
{
"epoch": 0.756540842820052,
"grad_norm": 0.7205050140658313,
"learning_rate": 7.652572947447273e-07,
"loss": 0.0707,
"step": 947
},
{
"epoch": 0.7573397243858598,
"grad_norm": 0.7416351156017427,
"learning_rate": 7.60502886516181e-07,
"loss": 0.0669,
"step": 948
},
{
"epoch": 0.7581386059516677,
"grad_norm": 0.7168506440477357,
"learning_rate": 7.557606426772962e-07,
"loss": 0.0622,
"step": 949
},
{
"epoch": 0.7589374875174756,
"grad_norm": 0.6768418626524917,
"learning_rate": 7.510305963908183e-07,
"loss": 0.0641,
"step": 950
},
{
"epoch": 0.7597363690832835,
"grad_norm": 0.606897378091034,
"learning_rate": 7.463127807341966e-07,
"loss": 0.0571,
"step": 951
},
{
"epoch": 0.7605352506490912,
"grad_norm": 0.7488996315192771,
"learning_rate": 7.416072286993511e-07,
"loss": 0.0656,
"step": 952
},
{
"epoch": 0.7613341322148991,
"grad_norm": 0.6788393377637245,
"learning_rate": 7.369139731924401e-07,
"loss": 0.0674,
"step": 953
},
{
"epoch": 0.762133013780707,
"grad_norm": 0.6565644655695145,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0639,
"step": 954
},
{
"epoch": 0.7629318953465148,
"grad_norm": 0.7490737747780284,
"learning_rate": 7.275644829568748e-07,
"loss": 0.074,
"step": 955
},
{
"epoch": 0.7637307769123227,
"grad_norm": 0.8045168221078167,
"learning_rate": 7.229083136096712e-07,
"loss": 0.069,
"step": 956
},
{
"epoch": 0.7645296584781306,
"grad_norm": 0.6683960326626098,
"learning_rate": 7.182645715528436e-07,
"loss": 0.0651,
"step": 957
},
{
"epoch": 0.7653285400439385,
"grad_norm": 0.6902827678888699,
"learning_rate": 7.136332892603095e-07,
"loss": 0.0664,
"step": 958
},
{
"epoch": 0.7661274216097463,
"grad_norm": 0.7837175885990169,
"learning_rate": 7.090144991188569e-07,
"loss": 0.0647,
"step": 959
},
{
"epoch": 0.7669263031755542,
"grad_norm": 0.712113865239181,
"learning_rate": 7.044082334279151e-07,
"loss": 0.0614,
"step": 960
},
{
"epoch": 0.7677251847413621,
"grad_norm": 0.7273442302087524,
"learning_rate": 6.998145243993284e-07,
"loss": 0.0698,
"step": 961
},
{
"epoch": 0.76852406630717,
"grad_norm": 0.7559547717566453,
"learning_rate": 6.952334041571307e-07,
"loss": 0.068,
"step": 962
},
{
"epoch": 0.7693229478729778,
"grad_norm": 0.7786708332752759,
"learning_rate": 6.906649047373246e-07,
"loss": 0.0696,
"step": 963
},
{
"epoch": 0.7701218294387857,
"grad_norm": 0.7891026104652957,
"learning_rate": 6.861090580876536e-07,
"loss": 0.0802,
"step": 964
},
{
"epoch": 0.7709207110045936,
"grad_norm": 0.7701906687064225,
"learning_rate": 6.815658960673782e-07,
"loss": 0.0673,
"step": 965
},
{
"epoch": 0.7717195925704015,
"grad_norm": 0.7634396300503498,
"learning_rate": 6.770354504470575e-07,
"loss": 0.0727,
"step": 966
},
{
"epoch": 0.7725184741362093,
"grad_norm": 0.7061753171390293,
"learning_rate": 6.72517752908321e-07,
"loss": 0.0645,
"step": 967
},
{
"epoch": 0.7733173557020172,
"grad_norm": 0.8029556468762414,
"learning_rate": 6.680128350436532e-07,
"loss": 0.0689,
"step": 968
},
{
"epoch": 0.7741162372678251,
"grad_norm": 0.6638509857091174,
"learning_rate": 6.635207283561671e-07,
"loss": 0.059,
"step": 969
},
{
"epoch": 0.774915118833633,
"grad_norm": 0.7287078746384597,
"learning_rate": 6.590414642593882e-07,
"loss": 0.0661,
"step": 970
},
{
"epoch": 0.7757140003994408,
"grad_norm": 0.722876613271591,
"learning_rate": 6.545750740770338e-07,
"loss": 0.0673,
"step": 971
},
{
"epoch": 0.7765128819652487,
"grad_norm": 0.723016808817902,
"learning_rate": 6.501215890427908e-07,
"loss": 0.0646,
"step": 972
},
{
"epoch": 0.7773117635310565,
"grad_norm": 0.7458933545086837,
"learning_rate": 6.456810403001013e-07,
"loss": 0.0654,
"step": 973
},
{
"epoch": 0.7781106450968643,
"grad_norm": 0.699991209892201,
"learning_rate": 6.412534589019429e-07,
"loss": 0.0633,
"step": 974
},
{
"epoch": 0.7789095266626722,
"grad_norm": 0.7015135117059649,
"learning_rate": 6.368388758106134e-07,
"loss": 0.0646,
"step": 975
},
{
"epoch": 0.7797084082284801,
"grad_norm": 0.6776739847527647,
"learning_rate": 6.324373218975105e-07,
"loss": 0.0621,
"step": 976
},
{
"epoch": 0.780507289794288,
"grad_norm": 0.6857624915005792,
"learning_rate": 6.280488279429186e-07,
"loss": 0.0665,
"step": 977
},
{
"epoch": 0.7813061713600958,
"grad_norm": 0.7100103784080846,
"learning_rate": 6.236734246357948e-07,
"loss": 0.0686,
"step": 978
},
{
"epoch": 0.7821050529259037,
"grad_norm": 0.6860904374907602,
"learning_rate": 6.193111425735515e-07,
"loss": 0.0664,
"step": 979
},
{
"epoch": 0.7829039344917116,
"grad_norm": 0.6653015404082253,
"learning_rate": 6.149620122618438e-07,
"loss": 0.0561,
"step": 980
},
{
"epoch": 0.7837028160575195,
"grad_norm": 0.6722184016732179,
"learning_rate": 6.106260641143547e-07,
"loss": 0.0686,
"step": 981
},
{
"epoch": 0.7845016976233273,
"grad_norm": 0.7017646955590284,
"learning_rate": 6.063033284525854e-07,
"loss": 0.0591,
"step": 982
},
{
"epoch": 0.7853005791891352,
"grad_norm": 0.6992849059506876,
"learning_rate": 6.019938355056423e-07,
"loss": 0.07,
"step": 983
},
{
"epoch": 0.7860994607549431,
"grad_norm": 0.7139128406279165,
"learning_rate": 5.976976154100214e-07,
"loss": 0.0683,
"step": 984
},
{
"epoch": 0.786898342320751,
"grad_norm": 0.6595093734404093,
"learning_rate": 5.934146982094049e-07,
"loss": 0.0609,
"step": 985
},
{
"epoch": 0.7876972238865588,
"grad_norm": 0.7832364853064704,
"learning_rate": 5.89145113854444e-07,
"loss": 0.0734,
"step": 986
},
{
"epoch": 0.7884961054523667,
"grad_norm": 0.6913376062451335,
"learning_rate": 5.848888922025553e-07,
"loss": 0.0665,
"step": 987
},
{
"epoch": 0.7892949870181746,
"grad_norm": 0.716823109643616,
"learning_rate": 5.806460630177066e-07,
"loss": 0.0612,
"step": 988
},
{
"epoch": 0.7900938685839825,
"grad_norm": 0.7083180087166591,
"learning_rate": 5.764166559702145e-07,
"loss": 0.0652,
"step": 989
},
{
"epoch": 0.7908927501497903,
"grad_norm": 0.711844832993962,
"learning_rate": 5.72200700636531e-07,
"loss": 0.07,
"step": 990
},
{
"epoch": 0.7916916317155982,
"grad_norm": 0.6612044131815857,
"learning_rate": 5.679982264990425e-07,
"loss": 0.0611,
"step": 991
},
{
"epoch": 0.7924905132814061,
"grad_norm": 0.6476141706558939,
"learning_rate": 5.638092629458577e-07,
"loss": 0.0631,
"step": 992
},
{
"epoch": 0.793289394847214,
"grad_norm": 0.7118404722072632,
"learning_rate": 5.596338392706077e-07,
"loss": 0.0756,
"step": 993
},
{
"epoch": 0.7940882764130218,
"grad_norm": 0.7168675764933466,
"learning_rate": 5.554719846722379e-07,
"loss": 0.0668,
"step": 994
},
{
"epoch": 0.7948871579788296,
"grad_norm": 0.72644232924385,
"learning_rate": 5.513237282548034e-07,
"loss": 0.0705,
"step": 995
},
{
"epoch": 0.7956860395446375,
"grad_norm": 0.7068188562442445,
"learning_rate": 5.471890990272666e-07,
"loss": 0.065,
"step": 996
},
{
"epoch": 0.7964849211104453,
"grad_norm": 0.6931576222204329,
"learning_rate": 5.430681259032958e-07,
"loss": 0.0648,
"step": 997
},
{
"epoch": 0.7972838026762532,
"grad_norm": 0.6720964929440794,
"learning_rate": 5.389608377010608e-07,
"loss": 0.0589,
"step": 998
},
{
"epoch": 0.7980826842420611,
"grad_norm": 0.6425174311401469,
"learning_rate": 5.348672631430319e-07,
"loss": 0.0566,
"step": 999
},
{
"epoch": 0.798881565807869,
"grad_norm": 0.7460943865422714,
"learning_rate": 5.307874308557778e-07,
"loss": 0.0734,
"step": 1000
},
{
"epoch": 0.798881565807869,
"eval_loss": 0.0669684186577797,
"eval_runtime": 16.0834,
"eval_samples_per_second": 50.362,
"eval_steps_per_second": 6.342,
"step": 1000
},
{
"epoch": 0.7996804473736768,
"grad_norm": 0.7220279454829013,
"learning_rate": 5.267213693697696e-07,
"loss": 0.0731,
"step": 1001
},
{
"epoch": 0.8004793289394847,
"grad_norm": 0.7666160582662389,
"learning_rate": 5.226691071191773e-07,
"loss": 0.0788,
"step": 1002
},
{
"epoch": 0.8012782105052926,
"grad_norm": 0.7134104534854722,
"learning_rate": 5.186306724416714e-07,
"loss": 0.0701,
"step": 1003
},
{
"epoch": 0.8020770920711005,
"grad_norm": 0.6726951640382407,
"learning_rate": 5.146060935782254e-07,
"loss": 0.0681,
"step": 1004
},
{
"epoch": 0.8028759736369083,
"grad_norm": 0.6771021941917784,
"learning_rate": 5.105953986729196e-07,
"loss": 0.0607,
"step": 1005
},
{
"epoch": 0.8036748552027162,
"grad_norm": 0.6801062661764998,
"learning_rate": 5.065986157727434e-07,
"loss": 0.0599,
"step": 1006
},
{
"epoch": 0.8044737367685241,
"grad_norm": 0.6720162232249413,
"learning_rate": 5.026157728273967e-07,
"loss": 0.0643,
"step": 1007
},
{
"epoch": 0.805272618334332,
"grad_norm": 0.6921585034116611,
"learning_rate": 4.986468976890993e-07,
"loss": 0.0619,
"step": 1008
},
{
"epoch": 0.8060714999001398,
"grad_norm": 0.7276889117024663,
"learning_rate": 4.946920181123904e-07,
"loss": 0.0622,
"step": 1009
},
{
"epoch": 0.8068703814659477,
"grad_norm": 0.694598262753411,
"learning_rate": 4.90751161753941e-07,
"loss": 0.0645,
"step": 1010
},
{
"epoch": 0.8076692630317556,
"grad_norm": 0.6632330746362429,
"learning_rate": 4.868243561723535e-07,
"loss": 0.0621,
"step": 1011
},
{
"epoch": 0.8084681445975634,
"grad_norm": 0.6879543731629699,
"learning_rate": 4.82911628827975e-07,
"loss": 0.0631,
"step": 1012
},
{
"epoch": 0.8092670261633713,
"grad_norm": 0.7095362884934977,
"learning_rate": 4.790130070827029e-07,
"loss": 0.0785,
"step": 1013
},
{
"epoch": 0.8100659077291792,
"grad_norm": 0.6697569093165596,
"learning_rate": 4.7512851819979196e-07,
"loss": 0.0623,
"step": 1014
},
{
"epoch": 0.8108647892949871,
"grad_norm": 0.7039158460244624,
"learning_rate": 4.712581893436646e-07,
"loss": 0.0634,
"step": 1015
},
{
"epoch": 0.8116636708607949,
"grad_norm": 0.7098347386238263,
"learning_rate": 4.674020475797239e-07,
"loss": 0.0657,
"step": 1016
},
{
"epoch": 0.8124625524266027,
"grad_norm": 0.7165736366942179,
"learning_rate": 4.635601198741607e-07,
"loss": 0.0673,
"step": 1017
},
{
"epoch": 0.8132614339924106,
"grad_norm": 0.7064304631677444,
"learning_rate": 4.597324330937661e-07,
"loss": 0.068,
"step": 1018
},
{
"epoch": 0.8140603155582185,
"grad_norm": 0.6877118664218234,
"learning_rate": 4.559190140057429e-07,
"loss": 0.0687,
"step": 1019
},
{
"epoch": 0.8148591971240263,
"grad_norm": 0.6944336714307168,
"learning_rate": 4.5211988927752026e-07,
"loss": 0.0638,
"step": 1020
},
{
"epoch": 0.8156580786898342,
"grad_norm": 0.6928572650124039,
"learning_rate": 4.483350854765672e-07,
"loss": 0.0653,
"step": 1021
},
{
"epoch": 0.8164569602556421,
"grad_norm": 0.7089184972604112,
"learning_rate": 4.445646290702038e-07,
"loss": 0.0691,
"step": 1022
},
{
"epoch": 0.81725584182145,
"grad_norm": 0.7001438532239804,
"learning_rate": 4.4080854642541833e-07,
"loss": 0.0663,
"step": 1023
},
{
"epoch": 0.8180547233872578,
"grad_norm": 0.6747722142421564,
"learning_rate": 4.3706686380868336e-07,
"loss": 0.0643,
"step": 1024
},
{
"epoch": 0.8188536049530657,
"grad_norm": 0.6712519301600179,
"learning_rate": 4.3333960738577236e-07,
"loss": 0.0673,
"step": 1025
},
{
"epoch": 0.8196524865188736,
"grad_norm": 0.7052187515868353,
"learning_rate": 4.2962680322157335e-07,
"loss": 0.0654,
"step": 1026
},
{
"epoch": 0.8204513680846814,
"grad_norm": 0.7134673191364518,
"learning_rate": 4.259284772799099e-07,
"loss": 0.067,
"step": 1027
},
{
"epoch": 0.8212502496504893,
"grad_norm": 0.7595854332327621,
"learning_rate": 4.222446554233597e-07,
"loss": 0.0766,
"step": 1028
},
{
"epoch": 0.8220491312162972,
"grad_norm": 0.6996209252612581,
"learning_rate": 4.185753634130718e-07,
"loss": 0.0666,
"step": 1029
},
{
"epoch": 0.8228480127821051,
"grad_norm": 0.7310601605335255,
"learning_rate": 4.1492062690858673e-07,
"loss": 0.0621,
"step": 1030
},
{
"epoch": 0.8236468943479129,
"grad_norm": 0.7071417923105128,
"learning_rate": 4.1128047146765936e-07,
"loss": 0.0588,
"step": 1031
},
{
"epoch": 0.8244457759137208,
"grad_norm": 0.6815499218511121,
"learning_rate": 4.076549225460757e-07,
"loss": 0.0632,
"step": 1032
},
{
"epoch": 0.8252446574795287,
"grad_norm": 0.7333725564876801,
"learning_rate": 4.040440054974815e-07,
"loss": 0.0741,
"step": 1033
},
{
"epoch": 0.8260435390453366,
"grad_norm": 0.6665570482826972,
"learning_rate": 4.004477455731967e-07,
"loss": 0.0654,
"step": 1034
},
{
"epoch": 0.8268424206111444,
"grad_norm": 0.7246490725287029,
"learning_rate": 3.9686616792204677e-07,
"loss": 0.0714,
"step": 1035
},
{
"epoch": 0.8276413021769523,
"grad_norm": 0.7130119435912701,
"learning_rate": 3.932992975901823e-07,
"loss": 0.0647,
"step": 1036
},
{
"epoch": 0.8284401837427602,
"grad_norm": 0.6612979017180838,
"learning_rate": 3.89747159520904e-07,
"loss": 0.0622,
"step": 1037
},
{
"epoch": 0.8292390653085681,
"grad_norm": 0.7209814178475339,
"learning_rate": 3.8620977855448937e-07,
"loss": 0.0682,
"step": 1038
},
{
"epoch": 0.8300379468743758,
"grad_norm": 0.665248664635542,
"learning_rate": 3.8268717942801933e-07,
"loss": 0.0613,
"step": 1039
},
{
"epoch": 0.8308368284401837,
"grad_norm": 0.7005544280482982,
"learning_rate": 3.791793867752053e-07,
"loss": 0.0619,
"step": 1040
},
{
"epoch": 0.8316357100059916,
"grad_norm": 0.7061284542991632,
"learning_rate": 3.756864251262143e-07,
"loss": 0.062,
"step": 1041
},
{
"epoch": 0.8324345915717994,
"grad_norm": 0.7190713395533443,
"learning_rate": 3.722083189075007e-07,
"loss": 0.0673,
"step": 1042
},
{
"epoch": 0.8332334731376073,
"grad_norm": 0.7365262841063743,
"learning_rate": 3.6874509244163414e-07,
"loss": 0.071,
"step": 1043
},
{
"epoch": 0.8340323547034152,
"grad_norm": 0.6987401896536586,
"learning_rate": 3.652967699471299e-07,
"loss": 0.0678,
"step": 1044
},
{
"epoch": 0.8348312362692231,
"grad_norm": 0.6853720241515239,
"learning_rate": 3.6186337553827747e-07,
"loss": 0.0638,
"step": 1045
},
{
"epoch": 0.8356301178350309,
"grad_norm": 0.7344667190271366,
"learning_rate": 3.5844493322497425e-07,
"loss": 0.0738,
"step": 1046
},
{
"epoch": 0.8364289994008388,
"grad_norm": 0.6920294795948277,
"learning_rate": 3.5504146691255736e-07,
"loss": 0.0596,
"step": 1047
},
{
"epoch": 0.8372278809666467,
"grad_norm": 0.6869256839862279,
"learning_rate": 3.5165300040163606e-07,
"loss": 0.0601,
"step": 1048
},
{
"epoch": 0.8380267625324546,
"grad_norm": 0.757378304566195,
"learning_rate": 3.482795573879241e-07,
"loss": 0.0642,
"step": 1049
},
{
"epoch": 0.8388256440982624,
"grad_norm": 0.6437095281756511,
"learning_rate": 3.4492116146207677e-07,
"loss": 0.0563,
"step": 1050
},
{
"epoch": 0.8396245256640703,
"grad_norm": 0.6795702774739193,
"learning_rate": 3.4157783610952263e-07,
"loss": 0.0613,
"step": 1051
},
{
"epoch": 0.8404234072298782,
"grad_norm": 0.7352355084174876,
"learning_rate": 3.382496047103026e-07,
"loss": 0.0684,
"step": 1052
},
{
"epoch": 0.8412222887956861,
"grad_norm": 0.7290947348513277,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.0697,
"step": 1053
},
{
"epoch": 0.8420211703614939,
"grad_norm": 0.698112148548848,
"learning_rate": 3.316385167640976e-07,
"loss": 0.0628,
"step": 1054
},
{
"epoch": 0.8428200519273018,
"grad_norm": 0.7086436557888751,
"learning_rate": 3.2835570644877854e-07,
"loss": 0.0644,
"step": 1055
},
{
"epoch": 0.8436189334931097,
"grad_norm": 0.7101992176912245,
"learning_rate": 3.250880825498026e-07,
"loss": 0.0703,
"step": 1056
},
{
"epoch": 0.8444178150589176,
"grad_norm": 0.7770285479833771,
"learning_rate": 3.218356679178253e-07,
"loss": 0.0705,
"step": 1057
},
{
"epoch": 0.8452166966247254,
"grad_norm": 0.7644864247124296,
"learning_rate": 3.1859848529714383e-07,
"loss": 0.0609,
"step": 1058
},
{
"epoch": 0.8460155781905333,
"grad_norm": 0.6867453662895092,
"learning_rate": 3.153765573255377e-07,
"loss": 0.0645,
"step": 1059
},
{
"epoch": 0.8468144597563412,
"grad_norm": 0.734264697694199,
"learning_rate": 3.1216990653410827e-07,
"loss": 0.0683,
"step": 1060
},
{
"epoch": 0.8476133413221489,
"grad_norm": 0.6732599728013793,
"learning_rate": 3.089785553471233e-07,
"loss": 0.0644,
"step": 1061
},
{
"epoch": 0.8484122228879568,
"grad_norm": 0.7192281176674538,
"learning_rate": 3.058025260818609e-07,
"loss": 0.0616,
"step": 1062
},
{
"epoch": 0.8492111044537647,
"grad_norm": 0.6708308363843544,
"learning_rate": 3.0264184094845135e-07,
"loss": 0.058,
"step": 1063
},
{
"epoch": 0.8500099860195726,
"grad_norm": 0.7397694215736755,
"learning_rate": 2.9949652204972257e-07,
"loss": 0.0595,
"step": 1064
},
{
"epoch": 0.8508088675853804,
"grad_norm": 0.6928595213205264,
"learning_rate": 2.963665913810451e-07,
"loss": 0.0666,
"step": 1065
},
{
"epoch": 0.8516077491511883,
"grad_norm": 0.6694794166827536,
"learning_rate": 2.9325207083018e-07,
"loss": 0.0611,
"step": 1066
},
{
"epoch": 0.8524066307169962,
"grad_norm": 0.666720861965053,
"learning_rate": 2.9015298217712455e-07,
"loss": 0.0617,
"step": 1067
},
{
"epoch": 0.8532055122828041,
"grad_norm": 0.6895017056366897,
"learning_rate": 2.8706934709395893e-07,
"loss": 0.0636,
"step": 1068
},
{
"epoch": 0.8540043938486119,
"grad_norm": 0.694641273220621,
"learning_rate": 2.840011871446963e-07,
"loss": 0.0615,
"step": 1069
},
{
"epoch": 0.8548032754144198,
"grad_norm": 0.6914158751921184,
"learning_rate": 2.8094852378513144e-07,
"loss": 0.064,
"step": 1070
},
{
"epoch": 0.8556021569802277,
"grad_norm": 0.6903052428146241,
"learning_rate": 2.779113783626916e-07,
"loss": 0.0662,
"step": 1071
},
{
"epoch": 0.8564010385460356,
"grad_norm": 0.6825082699898156,
"learning_rate": 2.748897721162841e-07,
"loss": 0.0666,
"step": 1072
},
{
"epoch": 0.8571999201118434,
"grad_norm": 0.7049176701648293,
"learning_rate": 2.718837261761528e-07,
"loss": 0.0591,
"step": 1073
},
{
"epoch": 0.8579988016776513,
"grad_norm": 0.6985195968248852,
"learning_rate": 2.688932615637252e-07,
"loss": 0.055,
"step": 1074
},
{
"epoch": 0.8587976832434592,
"grad_norm": 0.6724955324579563,
"learning_rate": 2.6591839919146963e-07,
"loss": 0.061,
"step": 1075
},
{
"epoch": 0.859596564809267,
"grad_norm": 0.7032826868960578,
"learning_rate": 2.6295915986274513e-07,
"loss": 0.0583,
"step": 1076
},
{
"epoch": 0.8603954463750749,
"grad_norm": 0.7608983776246196,
"learning_rate": 2.6001556427166064e-07,
"loss": 0.0731,
"step": 1077
},
{
"epoch": 0.8611943279408828,
"grad_norm": 0.6894275105789678,
"learning_rate": 2.570876330029254e-07,
"loss": 0.0682,
"step": 1078
},
{
"epoch": 0.8619932095066907,
"grad_norm": 0.7200233498632493,
"learning_rate": 2.541753865317076e-07,
"loss": 0.0685,
"step": 1079
},
{
"epoch": 0.8627920910724985,
"grad_norm": 0.6774539347803369,
"learning_rate": 2.512788452234921e-07,
"loss": 0.0675,
"step": 1080
},
{
"epoch": 0.8635909726383064,
"grad_norm": 0.7581447101213228,
"learning_rate": 2.483980293339361e-07,
"loss": 0.0716,
"step": 1081
},
{
"epoch": 0.8643898542041143,
"grad_norm": 0.7053069803143726,
"learning_rate": 2.4553295900872856e-07,
"loss": 0.0619,
"step": 1082
},
{
"epoch": 0.8651887357699221,
"grad_norm": 0.6478546849970215,
"learning_rate": 2.4268365428344737e-07,
"loss": 0.0577,
"step": 1083
},
{
"epoch": 0.8659876173357299,
"grad_norm": 0.6662934203390073,
"learning_rate": 2.3985013508342203e-07,
"loss": 0.058,
"step": 1084
},
{
"epoch": 0.8667864989015378,
"grad_norm": 0.6932899856482695,
"learning_rate": 2.370324212235936e-07,
"loss": 0.0604,
"step": 1085
},
{
"epoch": 0.8675853804673457,
"grad_norm": 0.7804082086146575,
"learning_rate": 2.3423053240837518e-07,
"loss": 0.0801,
"step": 1086
},
{
"epoch": 0.8683842620331536,
"grad_norm": 0.6912072674130147,
"learning_rate": 2.3144448823151394e-07,
"loss": 0.0606,
"step": 1087
},
{
"epoch": 0.8691831435989614,
"grad_norm": 0.7051436324198396,
"learning_rate": 2.2867430817595432e-07,
"loss": 0.0632,
"step": 1088
},
{
"epoch": 0.8699820251647693,
"grad_norm": 0.7357501611930702,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.0698,
"step": 1089
},
{
"epoch": 0.8707809067305772,
"grad_norm": 0.7499428063687636,
"learning_rate": 2.2318161780569558e-07,
"loss": 0.0697,
"step": 1090
},
{
"epoch": 0.871579788296385,
"grad_norm": 0.6763629933404043,
"learning_rate": 2.2045914590165252e-07,
"loss": 0.066,
"step": 1091
},
{
"epoch": 0.8723786698621929,
"grad_norm": 0.6698488388739392,
"learning_rate": 2.177526149399556e-07,
"loss": 0.0626,
"step": 1092
},
{
"epoch": 0.8731775514280008,
"grad_norm": 0.6977284000368456,
"learning_rate": 2.1506204384751067e-07,
"loss": 0.0695,
"step": 1093
},
{
"epoch": 0.8739764329938087,
"grad_norm": 0.7417930261159534,
"learning_rate": 2.1238745143961513e-07,
"loss": 0.0697,
"step": 1094
},
{
"epoch": 0.8747753145596165,
"grad_norm": 0.7356179512760564,
"learning_rate": 2.0972885641982605e-07,
"loss": 0.0665,
"step": 1095
},
{
"epoch": 0.8755741961254244,
"grad_norm": 0.7922074218678701,
"learning_rate": 2.0708627737983073e-07,
"loss": 0.0724,
"step": 1096
},
{
"epoch": 0.8763730776912323,
"grad_norm": 0.6637572696928091,
"learning_rate": 2.044597327993153e-07,
"loss": 0.0613,
"step": 1097
},
{
"epoch": 0.8771719592570402,
"grad_norm": 0.6963583688371511,
"learning_rate": 2.0184924104583615e-07,
"loss": 0.0609,
"step": 1098
},
{
"epoch": 0.877970840822848,
"grad_norm": 0.6363025487174039,
"learning_rate": 1.992548203746919e-07,
"loss": 0.0628,
"step": 1099
},
{
"epoch": 0.8787697223886559,
"grad_norm": 0.7030310308314632,
"learning_rate": 1.9667648892879532e-07,
"loss": 0.0578,
"step": 1100
},
{
"epoch": 0.8795686039544638,
"grad_norm": 0.6918141758573743,
"learning_rate": 1.941142647385469e-07,
"loss": 0.0679,
"step": 1101
},
{
"epoch": 0.8803674855202717,
"grad_norm": 0.7414881920940897,
"learning_rate": 1.9156816572170582e-07,
"loss": 0.0681,
"step": 1102
},
{
"epoch": 0.8811663670860795,
"grad_norm": 0.6771177978269848,
"learning_rate": 1.8903820968326992e-07,
"loss": 0.0643,
"step": 1103
},
{
"epoch": 0.8819652486518874,
"grad_norm": 0.6904228007725154,
"learning_rate": 1.865244143153472e-07,
"loss": 0.0657,
"step": 1104
},
{
"epoch": 0.8827641302176952,
"grad_norm": 0.717450106791312,
"learning_rate": 1.840267971970344e-07,
"loss": 0.0645,
"step": 1105
},
{
"epoch": 0.883563011783503,
"grad_norm": 0.6979035372775874,
"learning_rate": 1.8154537579429004e-07,
"loss": 0.0658,
"step": 1106
},
{
"epoch": 0.8843618933493109,
"grad_norm": 0.6903449335982373,
"learning_rate": 1.790801674598186e-07,
"loss": 0.0649,
"step": 1107
},
{
"epoch": 0.8851607749151188,
"grad_norm": 0.7292202075227271,
"learning_rate": 1.7663118943294367e-07,
"loss": 0.062,
"step": 1108
},
{
"epoch": 0.8859596564809267,
"grad_norm": 0.698367465032237,
"learning_rate": 1.74198458839491e-07,
"loss": 0.0678,
"step": 1109
},
{
"epoch": 0.8867585380467345,
"grad_norm": 0.6677856153414419,
"learning_rate": 1.7178199269166584e-07,
"loss": 0.0596,
"step": 1110
},
{
"epoch": 0.8875574196125424,
"grad_norm": 0.7303795782927686,
"learning_rate": 1.6938180788793557e-07,
"loss": 0.0686,
"step": 1111
},
{
"epoch": 0.8883563011783503,
"grad_norm": 0.670118518726451,
"learning_rate": 1.6699792121291248e-07,
"loss": 0.0592,
"step": 1112
},
{
"epoch": 0.8891551827441582,
"grad_norm": 0.7462381511634325,
"learning_rate": 1.6463034933723336e-07,
"loss": 0.0768,
"step": 1113
},
{
"epoch": 0.889954064309966,
"grad_norm": 0.6549933917225375,
"learning_rate": 1.6227910881744634e-07,
"loss": 0.0682,
"step": 1114
},
{
"epoch": 0.8907529458757739,
"grad_norm": 0.6615109907339011,
"learning_rate": 1.5994421609589388e-07,
"loss": 0.0623,
"step": 1115
},
{
"epoch": 0.8915518274415818,
"grad_norm": 0.6974460937921593,
"learning_rate": 1.5762568750059604e-07,
"loss": 0.0669,
"step": 1116
},
{
"epoch": 0.8923507090073897,
"grad_norm": 0.6961469220676079,
"learning_rate": 1.553235392451377e-07,
"loss": 0.0646,
"step": 1117
},
{
"epoch": 0.8931495905731975,
"grad_norm": 0.6885675903125427,
"learning_rate": 1.5303778742855684e-07,
"loss": 0.0621,
"step": 1118
},
{
"epoch": 0.8939484721390054,
"grad_norm": 0.7228656239844607,
"learning_rate": 1.507684480352292e-07,
"loss": 0.0661,
"step": 1119
},
{
"epoch": 0.8947473537048133,
"grad_norm": 0.7300348116191165,
"learning_rate": 1.4851553693475768e-07,
"loss": 0.0664,
"step": 1120
},
{
"epoch": 0.8955462352706212,
"grad_norm": 0.7416976198498724,
"learning_rate": 1.4627906988186114e-07,
"loss": 0.0676,
"step": 1121
},
{
"epoch": 0.896345116836429,
"grad_norm": 0.6703200074841813,
"learning_rate": 1.4405906251626496e-07,
"loss": 0.0621,
"step": 1122
},
{
"epoch": 0.8971439984022369,
"grad_norm": 0.6213139962942775,
"learning_rate": 1.4185553036259097e-07,
"loss": 0.0604,
"step": 1123
},
{
"epoch": 0.8979428799680448,
"grad_norm": 0.686848444095792,
"learning_rate": 1.3966848883024936e-07,
"loss": 0.0678,
"step": 1124
},
{
"epoch": 0.8987417615338527,
"grad_norm": 0.7053960643095338,
"learning_rate": 1.3749795321332887e-07,
"loss": 0.0656,
"step": 1125
},
{
"epoch": 0.8995406430996605,
"grad_norm": 0.6970965606669406,
"learning_rate": 1.3534393869049367e-07,
"loss": 0.0654,
"step": 1126
},
{
"epoch": 0.9003395246654683,
"grad_norm": 0.6942277449574915,
"learning_rate": 1.3320646032487394e-07,
"loss": 0.0653,
"step": 1127
},
{
"epoch": 0.9011384062312762,
"grad_norm": 0.6744444320008497,
"learning_rate": 1.3108553306396265e-07,
"loss": 0.0666,
"step": 1128
},
{
"epoch": 0.901937287797084,
"grad_norm": 0.6873741371219347,
"learning_rate": 1.289811717395087e-07,
"loss": 0.069,
"step": 1129
},
{
"epoch": 0.9027361693628919,
"grad_norm": 0.6982296762549293,
"learning_rate": 1.2689339106741529e-07,
"loss": 0.0637,
"step": 1130
},
{
"epoch": 0.9035350509286998,
"grad_norm": 0.6785495778136779,
"learning_rate": 1.2482220564763669e-07,
"loss": 0.0643,
"step": 1131
},
{
"epoch": 0.9043339324945077,
"grad_norm": 0.734961547157777,
"learning_rate": 1.227676299640751e-07,
"loss": 0.0677,
"step": 1132
},
{
"epoch": 0.9051328140603155,
"grad_norm": 0.7425938420645702,
"learning_rate": 1.2072967838448053e-07,
"loss": 0.0721,
"step": 1133
},
{
"epoch": 0.9059316956261234,
"grad_norm": 0.6651013770128319,
"learning_rate": 1.1870836516034878e-07,
"loss": 0.0601,
"step": 1134
},
{
"epoch": 0.9067305771919313,
"grad_norm": 0.6809615376797185,
"learning_rate": 1.1670370442682461e-07,
"loss": 0.0627,
"step": 1135
},
{
"epoch": 0.9075294587577392,
"grad_norm": 0.7543516531209598,
"learning_rate": 1.1471571020259919e-07,
"loss": 0.0703,
"step": 1136
},
{
"epoch": 0.908328340323547,
"grad_norm": 0.7413805857709801,
"learning_rate": 1.1274439638981532e-07,
"loss": 0.0616,
"step": 1137
},
{
"epoch": 0.9091272218893549,
"grad_norm": 0.7298273683632146,
"learning_rate": 1.1078977677396824e-07,
"loss": 0.0661,
"step": 1138
},
{
"epoch": 0.9099261034551628,
"grad_norm": 0.7212675180454935,
"learning_rate": 1.0885186502381018e-07,
"loss": 0.0635,
"step": 1139
},
{
"epoch": 0.9107249850209707,
"grad_norm": 0.7198568372151669,
"learning_rate": 1.0693067469125323e-07,
"loss": 0.0783,
"step": 1140
},
{
"epoch": 0.9115238665867785,
"grad_norm": 0.6613205843381404,
"learning_rate": 1.0502621921127776e-07,
"loss": 0.0624,
"step": 1141
},
{
"epoch": 0.9123227481525864,
"grad_norm": 0.7217970551572686,
"learning_rate": 1.031385119018355e-07,
"loss": 0.0674,
"step": 1142
},
{
"epoch": 0.9131216297183943,
"grad_norm": 0.6811880251542707,
"learning_rate": 1.0126756596375687e-07,
"loss": 0.0694,
"step": 1143
},
{
"epoch": 0.9139205112842022,
"grad_norm": 0.6819091612837742,
"learning_rate": 9.94133944806594e-08,
"loss": 0.0622,
"step": 1144
},
{
"epoch": 0.91471939285001,
"grad_norm": 0.6431918047898731,
"learning_rate": 9.757601041885694e-08,
"loss": 0.059,
"step": 1145
},
{
"epoch": 0.9155182744158179,
"grad_norm": 0.6570994880947287,
"learning_rate": 9.575542662726756e-08,
"loss": 0.0589,
"step": 1146
},
{
"epoch": 0.9163171559816258,
"grad_norm": 0.7107622134999424,
"learning_rate": 9.395165583732379e-08,
"loss": 0.069,
"step": 1147
},
{
"epoch": 0.9171160375474335,
"grad_norm": 0.7012113103311711,
"learning_rate": 9.216471066288396e-08,
"loss": 0.0657,
"step": 1148
},
{
"epoch": 0.9179149191132414,
"grad_norm": 0.6983067783638615,
"learning_rate": 9.03946036001449e-08,
"loss": 0.0641,
"step": 1149
},
{
"epoch": 0.9187138006790493,
"grad_norm": 0.6830731940037315,
"learning_rate": 8.864134702755294e-08,
"loss": 0.0694,
"step": 1150
},
{
"epoch": 0.9195126822448572,
"grad_norm": 0.725735904073755,
"learning_rate": 8.69049532057184e-08,
"loss": 0.0687,
"step": 1151
},
{
"epoch": 0.920311563810665,
"grad_norm": 0.6978361407432903,
"learning_rate": 8.518543427732951e-08,
"loss": 0.0635,
"step": 1152
},
{
"epoch": 0.9211104453764729,
"grad_norm": 0.6789409308489616,
"learning_rate": 8.348280226706723e-08,
"loss": 0.0666,
"step": 1153
},
{
"epoch": 0.9219093269422808,
"grad_norm": 0.664337510813219,
"learning_rate": 8.179706908152202e-08,
"loss": 0.0598,
"step": 1154
},
{
"epoch": 0.9227082085080887,
"grad_norm": 0.66126807605164,
"learning_rate": 8.012824650910938e-08,
"loss": 0.0613,
"step": 1155
},
{
"epoch": 0.9235070900738965,
"grad_norm": 0.6873606113244728,
"learning_rate": 7.84763462199889e-08,
"loss": 0.0679,
"step": 1156
},
{
"epoch": 0.9243059716397044,
"grad_norm": 0.7083619043314441,
"learning_rate": 7.684137976598089e-08,
"loss": 0.0663,
"step": 1157
},
{
"epoch": 0.9251048532055123,
"grad_norm": 0.7168857530641378,
"learning_rate": 7.522335858048707e-08,
"loss": 0.0697,
"step": 1158
},
{
"epoch": 0.9259037347713202,
"grad_norm": 0.7075639541686788,
"learning_rate": 7.362229397840981e-08,
"loss": 0.0651,
"step": 1159
},
{
"epoch": 0.926702616337128,
"grad_norm": 0.6669509766718363,
"learning_rate": 7.203819715607352e-08,
"loss": 0.0636,
"step": 1160
},
{
"epoch": 0.9275014979029359,
"grad_norm": 0.6946840882259397,
"learning_rate": 7.047107919114588e-08,
"loss": 0.0677,
"step": 1161
},
{
"epoch": 0.9283003794687438,
"grad_norm": 0.703242279454599,
"learning_rate": 6.892095104256063e-08,
"loss": 0.0728,
"step": 1162
},
{
"epoch": 0.9290992610345516,
"grad_norm": 0.712084470546071,
"learning_rate": 6.738782355044048e-08,
"loss": 0.0555,
"step": 1163
},
{
"epoch": 0.9298981426003595,
"grad_norm": 0.6960123967278037,
"learning_rate": 6.587170743602239e-08,
"loss": 0.0611,
"step": 1164
},
{
"epoch": 0.9306970241661674,
"grad_norm": 0.6634823802335459,
"learning_rate": 6.437261330158206e-08,
"loss": 0.0688,
"step": 1165
},
{
"epoch": 0.9314959057319753,
"grad_norm": 0.6768814293109752,
"learning_rate": 6.289055163035851e-08,
"loss": 0.0684,
"step": 1166
},
{
"epoch": 0.9322947872977831,
"grad_norm": 0.6890187647538402,
"learning_rate": 6.142553278648239e-08,
"loss": 0.0658,
"step": 1167
},
{
"epoch": 0.933093668863591,
"grad_norm": 0.6771130077993985,
"learning_rate": 5.997756701490388e-08,
"loss": 0.0605,
"step": 1168
},
{
"epoch": 0.9338925504293989,
"grad_norm": 0.7082091564066306,
"learning_rate": 5.8546664441319346e-08,
"loss": 0.0659,
"step": 1169
},
{
"epoch": 0.9346914319952067,
"grad_norm": 0.7072470961017815,
"learning_rate": 5.7132835072101486e-08,
"loss": 0.0629,
"step": 1170
},
{
"epoch": 0.9354903135610145,
"grad_norm": 0.7300223405209104,
"learning_rate": 5.573608879422876e-08,
"loss": 0.0697,
"step": 1171
},
{
"epoch": 0.9362891951268224,
"grad_norm": 0.7374743059950585,
"learning_rate": 5.435643537521767e-08,
"loss": 0.0709,
"step": 1172
},
{
"epoch": 0.9370880766926303,
"grad_norm": 0.703493024507941,
"learning_rate": 5.2993884463053425e-08,
"loss": 0.0663,
"step": 1173
},
{
"epoch": 0.9378869582584382,
"grad_norm": 0.7044199552247555,
"learning_rate": 5.164844558612131e-08,
"loss": 0.0653,
"step": 1174
},
{
"epoch": 0.938685839824246,
"grad_norm": 0.6997176122396134,
"learning_rate": 5.032012815314291e-08,
"loss": 0.064,
"step": 1175
},
{
"epoch": 0.9394847213900539,
"grad_norm": 0.7051848938605311,
"learning_rate": 4.9008941453107527e-08,
"loss": 0.0672,
"step": 1176
},
{
"epoch": 0.9402836029558618,
"grad_norm": 0.6932942079850664,
"learning_rate": 4.7714894655209174e-08,
"loss": 0.0695,
"step": 1177
},
{
"epoch": 0.9410824845216696,
"grad_norm": 0.6990251492223958,
"learning_rate": 4.6437996808781086e-08,
"loss": 0.0675,
"step": 1178
},
{
"epoch": 0.9418813660874775,
"grad_norm": 0.6961097084977305,
"learning_rate": 4.5178256843233235e-08,
"loss": 0.0662,
"step": 1179
},
{
"epoch": 0.9426802476532854,
"grad_norm": 0.7074362157190855,
"learning_rate": 4.393568356799022e-08,
"loss": 0.0659,
"step": 1180
},
{
"epoch": 0.9434791292190933,
"grad_norm": 0.671938873807077,
"learning_rate": 4.271028567242819e-08,
"loss": 0.0574,
"step": 1181
},
{
"epoch": 0.9442780107849011,
"grad_norm": 0.6666315152137079,
"learning_rate": 4.1502071725815216e-08,
"loss": 0.0641,
"step": 1182
},
{
"epoch": 0.945076892350709,
"grad_norm": 0.6501129464536826,
"learning_rate": 4.03110501772519e-08,
"loss": 0.0581,
"step": 1183
},
{
"epoch": 0.9458757739165169,
"grad_norm": 0.7468643186181524,
"learning_rate": 3.91372293556111e-08,
"loss": 0.0714,
"step": 1184
},
{
"epoch": 0.9466746554823248,
"grad_norm": 0.7365552924599144,
"learning_rate": 3.798061746947995e-08,
"loss": 0.0738,
"step": 1185
},
{
"epoch": 0.9474735370481326,
"grad_norm": 0.7290239770751575,
"learning_rate": 3.684122260710243e-08,
"loss": 0.0691,
"step": 1186
},
{
"epoch": 0.9482724186139405,
"grad_norm": 0.7436081143712903,
"learning_rate": 3.571905273632381e-08,
"loss": 0.0706,
"step": 1187
},
{
"epoch": 0.9490713001797484,
"grad_norm": 0.7186680096892166,
"learning_rate": 3.461411570453377e-08,
"loss": 0.0702,
"step": 1188
},
{
"epoch": 0.9498701817455563,
"grad_norm": 0.6811106939384963,
"learning_rate": 3.352641923861144e-08,
"loss": 0.0629,
"step": 1189
},
{
"epoch": 0.9506690633113641,
"grad_norm": 0.6212175291097755,
"learning_rate": 3.245597094487213e-08,
"loss": 0.0574,
"step": 1190
},
{
"epoch": 0.951467944877172,
"grad_norm": 0.692435690978003,
"learning_rate": 3.1402778309014284e-08,
"loss": 0.069,
"step": 1191
},
{
"epoch": 0.9522668264429798,
"grad_norm": 0.6827344843106753,
"learning_rate": 3.0366848696066207e-08,
"loss": 0.0661,
"step": 1192
},
{
"epoch": 0.9530657080087876,
"grad_norm": 0.7180605472986129,
"learning_rate": 2.934818935033501e-08,
"loss": 0.0779,
"step": 1193
},
{
"epoch": 0.9538645895745955,
"grad_norm": 0.7019886548940459,
"learning_rate": 2.834680739535578e-08,
"loss": 0.0692,
"step": 1194
},
{
"epoch": 0.9546634711404034,
"grad_norm": 0.6873247468863092,
"learning_rate": 2.736270983384276e-08,
"loss": 0.0678,
"step": 1195
},
{
"epoch": 0.9554623527062113,
"grad_norm": 0.6594613297149347,
"learning_rate": 2.6395903547638825e-08,
"loss": 0.0579,
"step": 1196
},
{
"epoch": 0.9562612342720191,
"grad_norm": 0.7097562956461467,
"learning_rate": 2.544639529766829e-08,
"loss": 0.0703,
"step": 1197
},
{
"epoch": 0.957060115837827,
"grad_norm": 0.6788638450102409,
"learning_rate": 2.451419172388947e-08,
"loss": 0.0712,
"step": 1198
},
{
"epoch": 0.9578589974036349,
"grad_norm": 0.6941431715350456,
"learning_rate": 2.3599299345248294e-08,
"loss": 0.0649,
"step": 1199
},
{
"epoch": 0.9586578789694428,
"grad_norm": 0.6833715565456336,
"learning_rate": 2.2701724559632542e-08,
"loss": 0.0646,
"step": 1200
},
{
"epoch": 0.9594567605352506,
"grad_norm": 0.686174057700068,
"learning_rate": 2.1821473643827142e-08,
"loss": 0.069,
"step": 1201
},
{
"epoch": 0.9602556421010585,
"grad_norm": 0.720252925574729,
"learning_rate": 2.095855275347086e-08,
"loss": 0.0664,
"step": 1202
},
{
"epoch": 0.9610545236668664,
"grad_norm": 0.6866213263988475,
"learning_rate": 2.011296792301165e-08,
"loss": 0.0649,
"step": 1203
},
{
"epoch": 0.9618534052326743,
"grad_norm": 0.6823240540115703,
"learning_rate": 1.928472506566692e-08,
"loss": 0.0575,
"step": 1204
},
{
"epoch": 0.9626522867984821,
"grad_norm": 0.6850391701055302,
"learning_rate": 1.847382997337943e-08,
"loss": 0.0608,
"step": 1205
},
{
"epoch": 0.96345116836429,
"grad_norm": 0.6581036399279098,
"learning_rate": 1.768028831677926e-08,
"loss": 0.061,
"step": 1206
},
{
"epoch": 0.9642500499300979,
"grad_norm": 0.719520898830784,
"learning_rate": 1.6904105645142443e-08,
"loss": 0.0623,
"step": 1207
},
{
"epoch": 0.9650489314959058,
"grad_norm": 0.6445948484303641,
"learning_rate": 1.6145287386353236e-08,
"loss": 0.0603,
"step": 1208
},
{
"epoch": 0.9658478130617136,
"grad_norm": 0.6888857806767628,
"learning_rate": 1.5403838846864694e-08,
"loss": 0.0697,
"step": 1209
},
{
"epoch": 0.9666466946275215,
"grad_norm": 0.7515695400681041,
"learning_rate": 1.46797652116637e-08,
"loss": 0.0622,
"step": 1210
},
{
"epoch": 0.9674455761933294,
"grad_norm": 0.6790673279650511,
"learning_rate": 1.3973071544233219e-08,
"loss": 0.057,
"step": 1211
},
{
"epoch": 0.9682444577591373,
"grad_norm": 0.6443579913417355,
"learning_rate": 1.3283762786517051e-08,
"loss": 0.0575,
"step": 1212
},
{
"epoch": 0.9690433393249451,
"grad_norm": 0.7049514598463905,
"learning_rate": 1.2611843758885412e-08,
"loss": 0.0662,
"step": 1213
},
{
"epoch": 0.9698422208907529,
"grad_norm": 0.6778898956156996,
"learning_rate": 1.1957319160101621e-08,
"loss": 0.06,
"step": 1214
},
{
"epoch": 0.9706411024565608,
"grad_norm": 0.7373312914103637,
"learning_rate": 1.132019356728853e-08,
"loss": 0.0657,
"step": 1215
},
{
"epoch": 0.9714399840223686,
"grad_norm": 0.6786552950388112,
"learning_rate": 1.0700471435897142e-08,
"loss": 0.0575,
"step": 1216
},
{
"epoch": 0.9722388655881765,
"grad_norm": 0.6651591444631889,
"learning_rate": 1.0098157099674988e-08,
"loss": 0.0634,
"step": 1217
},
{
"epoch": 0.9730377471539844,
"grad_norm": 0.6701675123122681,
"learning_rate": 9.513254770636138e-09,
"loss": 0.0611,
"step": 1218
},
{
"epoch": 0.9738366287197923,
"grad_norm": 0.7844222447677253,
"learning_rate": 8.945768539031785e-09,
"loss": 0.0678,
"step": 1219
},
{
"epoch": 0.9746355102856001,
"grad_norm": 0.6919149315269185,
"learning_rate": 8.395702373321101e-09,
"loss": 0.0669,
"step": 1220
},
{
"epoch": 0.975434391851408,
"grad_norm": 0.7202289235797771,
"learning_rate": 7.863060120144316e-09,
"loss": 0.0671,
"step": 1221
},
{
"epoch": 0.9762332734172159,
"grad_norm": 0.7481230246681578,
"learning_rate": 7.3478455042946814e-09,
"loss": 0.0707,
"step": 1222
},
{
"epoch": 0.9770321549830238,
"grad_norm": 0.6638284936549793,
"learning_rate": 6.850062128694046e-09,
"loss": 0.0617,
"step": 1223
},
{
"epoch": 0.9778310365488316,
"grad_norm": 0.7160309081965106,
"learning_rate": 6.369713474366213e-09,
"loss": 0.0721,
"step": 1224
},
{
"epoch": 0.9786299181146395,
"grad_norm": 0.7047500452248895,
"learning_rate": 5.906802900412789e-09,
"loss": 0.0683,
"step": 1225
},
{
"epoch": 0.9794287996804474,
"grad_norm": 0.7197584007832565,
"learning_rate": 5.461333643990985e-09,
"loss": 0.066,
"step": 1226
},
{
"epoch": 0.9802276812462553,
"grad_norm": 0.7026888743515561,
"learning_rate": 5.033308820289185e-09,
"loss": 0.0666,
"step": 1227
},
{
"epoch": 0.9810265628120631,
"grad_norm": 0.6567740354974405,
"learning_rate": 4.622731422505855e-09,
"loss": 0.0635,
"step": 1228
},
{
"epoch": 0.981825444377871,
"grad_norm": 0.7601422587361972,
"learning_rate": 4.229604321829561e-09,
"loss": 0.0673,
"step": 1229
},
{
"epoch": 0.9826243259436789,
"grad_norm": 0.6905698857901205,
"learning_rate": 3.853930267417316e-09,
"loss": 0.064,
"step": 1230
},
{
"epoch": 0.9834232075094868,
"grad_norm": 0.664904341590391,
"learning_rate": 3.495711886376818e-09,
"loss": 0.0573,
"step": 1231
},
{
"epoch": 0.9842220890752946,
"grad_norm": 0.6960109391146085,
"learning_rate": 3.154951683746743e-09,
"loss": 0.0645,
"step": 1232
},
{
"epoch": 0.9850209706411025,
"grad_norm": 0.6882272325779095,
"learning_rate": 2.8316520424800933e-09,
"loss": 0.0646,
"step": 1233
},
{
"epoch": 0.9858198522069104,
"grad_norm": 0.6524760001643685,
"learning_rate": 2.5258152234272637e-09,
"loss": 0.0637,
"step": 1234
},
{
"epoch": 0.9866187337727182,
"grad_norm": 0.6953143167090238,
"learning_rate": 2.237443365320502e-09,
"loss": 0.0674,
"step": 1235
},
{
"epoch": 0.987417615338526,
"grad_norm": 0.6605513970439745,
"learning_rate": 1.9665384847583622e-09,
"loss": 0.0643,
"step": 1236
},
{
"epoch": 0.9882164969043339,
"grad_norm": 0.7309814335935096,
"learning_rate": 1.7131024761923854e-09,
"loss": 0.0657,
"step": 1237
},
{
"epoch": 0.9890153784701418,
"grad_norm": 0.6825626492438529,
"learning_rate": 1.4771371119126631e-09,
"loss": 0.0681,
"step": 1238
},
{
"epoch": 0.9898142600359496,
"grad_norm": 0.7310094281209002,
"learning_rate": 1.2586440420372936e-09,
"loss": 0.073,
"step": 1239
},
{
"epoch": 0.9906131416017575,
"grad_norm": 0.7203040005251995,
"learning_rate": 1.0576247944985018e-09,
"loss": 0.0584,
"step": 1240
},
{
"epoch": 0.9914120231675654,
"grad_norm": 0.7002852230433981,
"learning_rate": 8.740807750345914e-10,
"loss": 0.0687,
"step": 1241
},
{
"epoch": 0.9922109047333733,
"grad_norm": 0.7406493969310309,
"learning_rate": 7.080132671774542e-10,
"loss": 0.0748,
"step": 1242
},
{
"epoch": 0.9930097862991811,
"grad_norm": 0.6384302004543542,
"learning_rate": 5.59423432245354e-10,
"loss": 0.0579,
"step": 1243
},
{
"epoch": 0.993808667864989,
"grad_norm": 0.7209129050951093,
"learning_rate": 4.2831230933487735e-10,
"loss": 0.0721,
"step": 1244
},
{
"epoch": 0.9946075494307969,
"grad_norm": 0.6783623313747392,
"learning_rate": 3.146808153123293e-10,
"loss": 0.0668,
"step": 1245
},
{
"epoch": 0.9954064309966048,
"grad_norm": 0.6718991145044905,
"learning_rate": 2.1852974480846002e-10,
"loss": 0.0649,
"step": 1246
},
{
"epoch": 0.9962053125624126,
"grad_norm": 0.698720262635623,
"learning_rate": 1.398597702123583e-10,
"loss": 0.0659,
"step": 1247
},
{
"epoch": 0.9970041941282205,
"grad_norm": 0.6882411009274608,
"learning_rate": 7.867144166728846e-11,
"loss": 0.0668,
"step": 1248
},
{
"epoch": 0.9978030756940284,
"grad_norm": 0.7154829911093414,
"learning_rate": 3.496518706597174e-11,
"loss": 0.0678,
"step": 1249
},
{
"epoch": 0.9986019572598362,
"grad_norm": 0.6425326575457246,
"learning_rate": 8.74131204864348e-12,
"loss": 0.0605,
"step": 1250
},
{
"epoch": 0.9994008388256441,
"grad_norm": 0.6736534858888602,
"learning_rate": 0.0,
"loss": 0.0646,
"step": 1251
},
{
"epoch": 0.9994008388256441,
"step": 1251,
"total_flos": 161979572551680.0,
"train_loss": 0.08634093818035152,
"train_runtime": 5655.3143,
"train_samples_per_second": 14.165,
"train_steps_per_second": 0.221
}
],
"logging_steps": 1,
"max_steps": 1251,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 161979572551680.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}