{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00625,
"grad_norm": 0.40599126920339407,
"learning_rate": 3.125e-06,
"loss": 0.508,
"step": 1
},
{
"epoch": 0.0125,
"grad_norm": 0.16174971893093387,
"learning_rate": 6.25e-06,
"loss": 0.3989,
"step": 2
},
{
"epoch": 0.01875,
"grad_norm": 0.13980982904548378,
"learning_rate": 9.375000000000001e-06,
"loss": 0.3526,
"step": 3
},
{
"epoch": 0.025,
"grad_norm": 0.27727799449785184,
"learning_rate": 1.25e-05,
"loss": 0.3927,
"step": 4
},
{
"epoch": 0.03125,
"grad_norm": 0.1157104063128156,
"learning_rate": 1.5625e-05,
"loss": 0.3232,
"step": 5
},
{
"epoch": 0.0375,
"grad_norm": 0.14945724236967864,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.3486,
"step": 6
},
{
"epoch": 0.04375,
"grad_norm": 0.1806281329991288,
"learning_rate": 2.1875e-05,
"loss": 0.3894,
"step": 7
},
{
"epoch": 0.05,
"grad_norm": 0.16431928934147372,
"learning_rate": 2.5e-05,
"loss": 0.3606,
"step": 8
},
{
"epoch": 0.05625,
"grad_norm": 0.15979437230246274,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.3323,
"step": 9
},
{
"epoch": 0.0625,
"grad_norm": 0.2004445206673931,
"learning_rate": 3.125e-05,
"loss": 0.3689,
"step": 10
},
{
"epoch": 0.06875,
"grad_norm": 0.20835456635890684,
"learning_rate": 3.4375e-05,
"loss": 0.3825,
"step": 11
},
{
"epoch": 0.075,
"grad_norm": 0.18507055166360947,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.3368,
"step": 12
},
{
"epoch": 0.08125,
"grad_norm": 0.21673005959559813,
"learning_rate": 4.0625000000000005e-05,
"loss": 0.3345,
"step": 13
},
{
"epoch": 0.0875,
"grad_norm": 0.17167000281963693,
"learning_rate": 4.375e-05,
"loss": 0.3176,
"step": 14
},
{
"epoch": 0.09375,
"grad_norm": 0.19654075830086185,
"learning_rate": 4.6875e-05,
"loss": 0.3136,
"step": 15
},
{
"epoch": 0.1,
"grad_norm": 0.16056560874254397,
"learning_rate": 5e-05,
"loss": 0.2986,
"step": 16
},
{
"epoch": 0.10625,
"grad_norm": 0.14196579272732793,
"learning_rate": 5.3125000000000004e-05,
"loss": 0.2691,
"step": 17
},
{
"epoch": 0.1125,
"grad_norm": 0.15400147007847648,
"learning_rate": 5.6250000000000005e-05,
"loss": 0.3044,
"step": 18
},
{
"epoch": 0.11875,
"grad_norm": 0.10944808517977976,
"learning_rate": 5.9375e-05,
"loss": 0.232,
"step": 19
},
{
"epoch": 0.125,
"grad_norm": 0.11702109032620692,
"learning_rate": 6.25e-05,
"loss": 0.237,
"step": 20
},
{
"epoch": 0.13125,
"grad_norm": 0.13599177017428848,
"learning_rate": 6.562500000000001e-05,
"loss": 0.2708,
"step": 21
},
{
"epoch": 0.1375,
"grad_norm": 0.10113783542128998,
"learning_rate": 6.875e-05,
"loss": 0.2334,
"step": 22
},
{
"epoch": 0.14375,
"grad_norm": 0.07723777905374925,
"learning_rate": 7.1875e-05,
"loss": 0.1994,
"step": 23
},
{
"epoch": 0.15,
"grad_norm": 0.08703618197718174,
"learning_rate": 7.500000000000001e-05,
"loss": 0.2253,
"step": 24
},
{
"epoch": 0.15625,
"grad_norm": 0.11206045033884303,
"learning_rate": 7.8125e-05,
"loss": 0.2677,
"step": 25
},
{
"epoch": 0.1625,
"grad_norm": 0.08484274071341459,
"learning_rate": 8.125000000000001e-05,
"loss": 0.1909,
"step": 26
},
{
"epoch": 0.16875,
"grad_norm": 0.13523622669439594,
"learning_rate": 8.4375e-05,
"loss": 0.2352,
"step": 27
},
{
"epoch": 0.175,
"grad_norm": 0.0979564733902834,
"learning_rate": 8.75e-05,
"loss": 0.2312,
"step": 28
},
{
"epoch": 0.18125,
"grad_norm": 0.10257710940833924,
"learning_rate": 9.062500000000001e-05,
"loss": 0.2571,
"step": 29
},
{
"epoch": 0.1875,
"grad_norm": 0.09612386608372053,
"learning_rate": 9.375e-05,
"loss": 0.2181,
"step": 30
},
{
"epoch": 0.19375,
"grad_norm": 0.08223867478377872,
"learning_rate": 9.687500000000001e-05,
"loss": 0.1768,
"step": 31
},
{
"epoch": 0.2,
"grad_norm": 0.08986243609911292,
"learning_rate": 0.0001,
"loss": 0.1995,
"step": 32
},
{
"epoch": 0.20625,
"grad_norm": 0.22662504165100472,
"learning_rate": 9.999702525000749e-05,
"loss": 0.2344,
"step": 33
},
{
"epoch": 0.2125,
"grad_norm": 0.11694951154367994,
"learning_rate": 9.998810135399546e-05,
"loss": 0.1581,
"step": 34
},
{
"epoch": 0.21875,
"grad_norm": 0.10469730042736397,
"learning_rate": 9.997322937381829e-05,
"loss": 0.2129,
"step": 35
},
{
"epoch": 0.225,
"grad_norm": 0.09098907134196822,
"learning_rate": 9.99524110790929e-05,
"loss": 0.1837,
"step": 36
},
{
"epoch": 0.23125,
"grad_norm": 0.10818262125011802,
"learning_rate": 9.992564894698816e-05,
"loss": 0.1881,
"step": 37
},
{
"epoch": 0.2375,
"grad_norm": 0.09073616923032347,
"learning_rate": 9.989294616193017e-05,
"loss": 0.1837,
"step": 38
},
{
"epoch": 0.24375,
"grad_norm": 0.08876555004490387,
"learning_rate": 9.985430661522333e-05,
"loss": 0.1918,
"step": 39
},
{
"epoch": 0.25,
"grad_norm": 0.1046229015794259,
"learning_rate": 9.980973490458728e-05,
"loss": 0.1823,
"step": 40
},
{
"epoch": 0.25625,
"grad_norm": 0.10749279775724821,
"learning_rate": 9.975923633360985e-05,
"loss": 0.2191,
"step": 41
},
{
"epoch": 0.2625,
"grad_norm": 0.07655181308042079,
"learning_rate": 9.970281691111598e-05,
"loss": 0.1671,
"step": 42
},
{
"epoch": 0.26875,
"grad_norm": 0.06987142581471417,
"learning_rate": 9.964048335045275e-05,
"loss": 0.1445,
"step": 43
},
{
"epoch": 0.275,
"grad_norm": 0.08578476236740849,
"learning_rate": 9.957224306869053e-05,
"loss": 0.1883,
"step": 44
},
{
"epoch": 0.28125,
"grad_norm": 0.08735769756820697,
"learning_rate": 9.949810418574039e-05,
"loss": 0.1912,
"step": 45
},
{
"epoch": 0.2875,
"grad_norm": 0.08454191122193365,
"learning_rate": 9.941807552338804e-05,
"loss": 0.1781,
"step": 46
},
{
"epoch": 0.29375,
"grad_norm": 0.07561417915639863,
"learning_rate": 9.933216660424395e-05,
"loss": 0.1741,
"step": 47
},
{
"epoch": 0.3,
"grad_norm": 0.0908845677740239,
"learning_rate": 9.924038765061042e-05,
"loss": 0.1862,
"step": 48
},
{
"epoch": 0.30625,
"grad_norm": 0.08957984885995234,
"learning_rate": 9.914274958326505e-05,
"loss": 0.1948,
"step": 49
},
{
"epoch": 0.3125,
"grad_norm": 0.06721787175218842,
"learning_rate": 9.903926402016153e-05,
"loss": 0.1387,
"step": 50
},
{
"epoch": 0.31875,
"grad_norm": 0.07776287708078782,
"learning_rate": 9.892994327504693e-05,
"loss": 0.1897,
"step": 51
},
{
"epoch": 0.325,
"grad_norm": 0.0792915804134436,
"learning_rate": 9.881480035599667e-05,
"loss": 0.1878,
"step": 52
},
{
"epoch": 0.33125,
"grad_norm": 0.11195588138897727,
"learning_rate": 9.869384896386668e-05,
"loss": 0.194,
"step": 53
},
{
"epoch": 0.3375,
"grad_norm": 0.10744243347143223,
"learning_rate": 9.856710349066307e-05,
"loss": 0.1881,
"step": 54
},
{
"epoch": 0.34375,
"grad_norm": 0.1162711120696835,
"learning_rate": 9.843457901782967e-05,
"loss": 0.2129,
"step": 55
},
{
"epoch": 0.35,
"grad_norm": 0.08320344631618402,
"learning_rate": 9.829629131445342e-05,
"loss": 0.187,
"step": 56
},
{
"epoch": 0.35625,
"grad_norm": 0.08853925634789721,
"learning_rate": 9.815225683538814e-05,
"loss": 0.1952,
"step": 57
},
{
"epoch": 0.3625,
"grad_norm": 0.08064278407146641,
"learning_rate": 9.800249271929645e-05,
"loss": 0.1722,
"step": 58
},
{
"epoch": 0.36875,
"grad_norm": 0.08627992209736293,
"learning_rate": 9.784701678661045e-05,
"loss": 0.1852,
"step": 59
},
{
"epoch": 0.375,
"grad_norm": 0.08472663945575681,
"learning_rate": 9.768584753741134e-05,
"loss": 0.1861,
"step": 60
},
{
"epoch": 0.38125,
"grad_norm": 0.0706516311545354,
"learning_rate": 9.751900414922805e-05,
"loss": 0.1681,
"step": 61
},
{
"epoch": 0.3875,
"grad_norm": 0.07830536169386694,
"learning_rate": 9.73465064747553e-05,
"loss": 0.1768,
"step": 62
},
{
"epoch": 0.39375,
"grad_norm": 0.07870447814947025,
"learning_rate": 9.716837503949127e-05,
"loss": 0.1592,
"step": 63
},
{
"epoch": 0.4,
"grad_norm": 0.11475798771048969,
"learning_rate": 9.698463103929542e-05,
"loss": 0.2007,
"step": 64
},
{
"epoch": 0.40625,
"grad_norm": 0.09295937705194224,
"learning_rate": 9.67952963378663e-05,
"loss": 0.1499,
"step": 65
},
{
"epoch": 0.4125,
"grad_norm": 0.09326386653332064,
"learning_rate": 9.660039346413994e-05,
"loss": 0.1702,
"step": 66
},
{
"epoch": 0.41875,
"grad_norm": 0.08131260921217849,
"learning_rate": 9.639994560960923e-05,
"loss": 0.172,
"step": 67
},
{
"epoch": 0.425,
"grad_norm": 0.07365872216537292,
"learning_rate": 9.619397662556435e-05,
"loss": 0.1363,
"step": 68
},
{
"epoch": 0.43125,
"grad_norm": 0.0834833164127378,
"learning_rate": 9.598251102025461e-05,
"loss": 0.1664,
"step": 69
},
{
"epoch": 0.4375,
"grad_norm": 0.07655054737496132,
"learning_rate": 9.576557395597236e-05,
"loss": 0.1439,
"step": 70
},
{
"epoch": 0.44375,
"grad_norm": 0.09641298517325338,
"learning_rate": 9.554319124605879e-05,
"loss": 0.1526,
"step": 71
},
{
"epoch": 0.45,
"grad_norm": 0.07985464053948435,
"learning_rate": 9.53153893518325e-05,
"loss": 0.1514,
"step": 72
},
{
"epoch": 0.45625,
"grad_norm": 0.08227789931433875,
"learning_rate": 9.508219537944081e-05,
"loss": 0.1481,
"step": 73
},
{
"epoch": 0.4625,
"grad_norm": 0.07992014265398506,
"learning_rate": 9.484363707663442e-05,
"loss": 0.1552,
"step": 74
},
{
"epoch": 0.46875,
"grad_norm": 0.08428886519848619,
"learning_rate": 9.459974282946571e-05,
"loss": 0.1533,
"step": 75
},
{
"epoch": 0.475,
"grad_norm": 0.07999237487858113,
"learning_rate": 9.435054165891109e-05,
"loss": 0.15,
"step": 76
},
{
"epoch": 0.48125,
"grad_norm": 0.08663635431835072,
"learning_rate": 9.409606321741775e-05,
"loss": 0.1633,
"step": 77
},
{
"epoch": 0.4875,
"grad_norm": 0.10421286796237919,
"learning_rate": 9.38363377853754e-05,
"loss": 0.1675,
"step": 78
},
{
"epoch": 0.49375,
"grad_norm": 0.08371956698091174,
"learning_rate": 9.357139626751308e-05,
"loss": 0.1649,
"step": 79
},
{
"epoch": 0.5,
"grad_norm": 0.11576971658648634,
"learning_rate": 9.330127018922194e-05,
"loss": 0.1818,
"step": 80
},
{
"epoch": 0.50625,
"grad_norm": 0.09180205457837495,
"learning_rate": 9.302599169280395e-05,
"loss": 0.1535,
"step": 81
},
{
"epoch": 0.5125,
"grad_norm": 0.11143842983351335,
"learning_rate": 9.274559353364734e-05,
"loss": 0.1573,
"step": 82
},
{
"epoch": 0.51875,
"grad_norm": 0.08382222694709042,
"learning_rate": 9.246010907632895e-05,
"loss": 0.1552,
"step": 83
},
{
"epoch": 0.525,
"grad_norm": 0.08169820823619627,
"learning_rate": 9.21695722906443e-05,
"loss": 0.1348,
"step": 84
},
{
"epoch": 0.53125,
"grad_norm": 0.09264402945807464,
"learning_rate": 9.18740177475654e-05,
"loss": 0.1541,
"step": 85
},
{
"epoch": 0.5375,
"grad_norm": 0.12035173607425367,
"learning_rate": 9.157348061512727e-05,
"loss": 0.1584,
"step": 86
},
{
"epoch": 0.54375,
"grad_norm": 0.0933034733005768,
"learning_rate": 9.126799665424319e-05,
"loss": 0.1558,
"step": 87
},
{
"epoch": 0.55,
"grad_norm": 0.09983789678085002,
"learning_rate": 9.09576022144496e-05,
"loss": 0.178,
"step": 88
},
{
"epoch": 0.55625,
"grad_norm": 0.08824386686645358,
"learning_rate": 9.064233422958077e-05,
"loss": 0.1382,
"step": 89
},
{
"epoch": 0.5625,
"grad_norm": 0.12496408760397422,
"learning_rate": 9.032223021337414e-05,
"loss": 0.2028,
"step": 90
},
{
"epoch": 0.56875,
"grad_norm": 0.08896745045700023,
"learning_rate": 8.999732825500648e-05,
"loss": 0.1564,
"step": 91
},
{
"epoch": 0.575,
"grad_norm": 0.1051267783567327,
"learning_rate": 8.966766701456177e-05,
"loss": 0.1748,
"step": 92
},
{
"epoch": 0.58125,
"grad_norm": 0.1068446272490391,
"learning_rate": 8.933328571843084e-05,
"loss": 0.166,
"step": 93
},
{
"epoch": 0.5875,
"grad_norm": 0.09693703827165154,
"learning_rate": 8.899422415464409e-05,
"loss": 0.1371,
"step": 94
},
{
"epoch": 0.59375,
"grad_norm": 0.11397584234437574,
"learning_rate": 8.865052266813685e-05,
"loss": 0.1695,
"step": 95
},
{
"epoch": 0.6,
"grad_norm": 0.08309347621522221,
"learning_rate": 8.83022221559489e-05,
"loss": 0.1397,
"step": 96
},
{
"epoch": 0.60625,
"grad_norm": 0.09441073972154676,
"learning_rate": 8.79493640623581e-05,
"loss": 0.1475,
"step": 97
},
{
"epoch": 0.6125,
"grad_norm": 0.10017885648608761,
"learning_rate": 8.759199037394887e-05,
"loss": 0.1455,
"step": 98
},
{
"epoch": 0.61875,
"grad_norm": 0.09389095525738553,
"learning_rate": 8.723014361461632e-05,
"loss": 0.155,
"step": 99
},
{
"epoch": 0.625,
"grad_norm": 0.08794088850259763,
"learning_rate": 8.68638668405062e-05,
"loss": 0.14,
"step": 100
},
{
"epoch": 0.63125,
"grad_norm": 0.08768231254617773,
"learning_rate": 8.649320363489179e-05,
"loss": 0.1268,
"step": 101
},
{
"epoch": 0.6375,
"grad_norm": 0.10989824440042859,
"learning_rate": 8.611819810298778e-05,
"loss": 0.174,
"step": 102
},
{
"epoch": 0.64375,
"grad_norm": 0.09207703015396494,
"learning_rate": 8.573889486670233e-05,
"loss": 0.1314,
"step": 103
},
{
"epoch": 0.65,
"grad_norm": 0.10433836170510401,
"learning_rate": 8.535533905932738e-05,
"loss": 0.1563,
"step": 104
},
{
"epoch": 0.65625,
"grad_norm": 0.1394424066715557,
"learning_rate": 8.496757632016836e-05,
"loss": 0.2076,
"step": 105
},
{
"epoch": 0.6625,
"grad_norm": 0.09605746900221901,
"learning_rate": 8.457565278911348e-05,
"loss": 0.1516,
"step": 106
},
{
"epoch": 0.66875,
"grad_norm": 0.09934008188904551,
"learning_rate": 8.417961510114356e-05,
"loss": 0.163,
"step": 107
},
{
"epoch": 0.675,
"grad_norm": 0.096551403883122,
"learning_rate": 8.377951038078302e-05,
"loss": 0.1351,
"step": 108
},
{
"epoch": 0.68125,
"grad_norm": 0.09261714349099234,
"learning_rate": 8.337538623649237e-05,
"loss": 0.1181,
"step": 109
},
{
"epoch": 0.6875,
"grad_norm": 0.09869545197682032,
"learning_rate": 8.296729075500344e-05,
"loss": 0.1404,
"step": 110
},
{
"epoch": 0.69375,
"grad_norm": 0.17566063934542003,
"learning_rate": 8.255527249559746e-05,
"loss": 0.1746,
"step": 111
},
{
"epoch": 0.7,
"grad_norm": 0.11448964041640074,
"learning_rate": 8.213938048432697e-05,
"loss": 0.1724,
"step": 112
},
{
"epoch": 0.70625,
"grad_norm": 0.09940589194936707,
"learning_rate": 8.171966420818228e-05,
"loss": 0.1469,
"step": 113
},
{
"epoch": 0.7125,
"grad_norm": 0.09560062944611337,
"learning_rate": 8.129617360920296e-05,
"loss": 0.1692,
"step": 114
},
{
"epoch": 0.71875,
"grad_norm": 0.11078503759259131,
"learning_rate": 8.086895907853526e-05,
"loss": 0.145,
"step": 115
},
{
"epoch": 0.725,
"grad_norm": 0.10258735833050488,
"learning_rate": 8.043807145043604e-05,
"loss": 0.1401,
"step": 116
},
{
"epoch": 0.73125,
"grad_norm": 0.09522206054276056,
"learning_rate": 8.000356199622405e-05,
"loss": 0.1289,
"step": 117
},
{
"epoch": 0.7375,
"grad_norm": 0.13908178309980257,
"learning_rate": 7.956548241817912e-05,
"loss": 0.1191,
"step": 118
},
{
"epoch": 0.74375,
"grad_norm": 0.10074203716975204,
"learning_rate": 7.912388484339012e-05,
"loss": 0.1679,
"step": 119
},
{
"epoch": 0.75,
"grad_norm": 0.09900042130140317,
"learning_rate": 7.86788218175523e-05,
"loss": 0.1455,
"step": 120
},
{
"epoch": 0.75625,
"grad_norm": 0.09784130256143893,
"learning_rate": 7.823034629871503e-05,
"loss": 0.1387,
"step": 121
},
{
"epoch": 0.7625,
"grad_norm": 0.10874950463645312,
"learning_rate": 7.777851165098012e-05,
"loss": 0.1595,
"step": 122
},
{
"epoch": 0.76875,
"grad_norm": 0.108024736321919,
"learning_rate": 7.732337163815217e-05,
"loss": 0.1591,
"step": 123
},
{
"epoch": 0.775,
"grad_norm": 0.10774049198035004,
"learning_rate": 7.68649804173412e-05,
"loss": 0.1623,
"step": 124
},
{
"epoch": 0.78125,
"grad_norm": 0.1071299904224286,
"learning_rate": 7.64033925325184e-05,
"loss": 0.155,
"step": 125
},
{
"epoch": 0.7875,
"grad_norm": 0.1000917142455837,
"learning_rate": 7.593866290802608e-05,
"loss": 0.1524,
"step": 126
},
{
"epoch": 0.79375,
"grad_norm": 0.10821955303820206,
"learning_rate": 7.54708468420421e-05,
"loss": 0.1599,
"step": 127
},
{
"epoch": 0.8,
"grad_norm": 0.09585433780364112,
"learning_rate": 7.500000000000001e-05,
"loss": 0.1554,
"step": 128
},
{
"epoch": 0.80625,
"grad_norm": 0.12861896583829888,
"learning_rate": 7.45261784079654e-05,
"loss": 0.1538,
"step": 129
},
{
"epoch": 0.8125,
"grad_norm": 0.09906883135040825,
"learning_rate": 7.404943844596939e-05,
"loss": 0.1525,
"step": 130
},
{
"epoch": 0.81875,
"grad_norm": 0.09731521778705561,
"learning_rate": 7.35698368412999e-05,
"loss": 0.1478,
"step": 131
},
{
"epoch": 0.825,
"grad_norm": 0.09827888495925427,
"learning_rate": 7.308743066175172e-05,
"loss": 0.1484,
"step": 132
},
{
"epoch": 0.83125,
"grad_norm": 0.09864917828015088,
"learning_rate": 7.2602277308836e-05,
"loss": 0.124,
"step": 133
},
{
"epoch": 0.8375,
"grad_norm": 0.1002828586258035,
"learning_rate": 7.211443451095007e-05,
"loss": 0.1422,
"step": 134
},
{
"epoch": 0.84375,
"grad_norm": 0.10620289407411102,
"learning_rate": 7.162396031650831e-05,
"loss": 0.1488,
"step": 135
},
{
"epoch": 0.85,
"grad_norm": 0.09917275385305373,
"learning_rate": 7.113091308703498e-05,
"loss": 0.1227,
"step": 136
},
{
"epoch": 0.85625,
"grad_norm": 0.09676283023246729,
"learning_rate": 7.063535149021973e-05,
"loss": 0.1397,
"step": 137
},
{
"epoch": 0.8625,
"grad_norm": 0.10405765108275118,
"learning_rate": 7.013733449293687e-05,
"loss": 0.1489,
"step": 138
},
{
"epoch": 0.86875,
"grad_norm": 0.11729246349725461,
"learning_rate": 6.96369213542287e-05,
"loss": 0.1625,
"step": 139
},
{
"epoch": 0.875,
"grad_norm": 0.11001117463075423,
"learning_rate": 6.91341716182545e-05,
"loss": 0.1575,
"step": 140
},
{
"epoch": 0.88125,
"grad_norm": 0.09836686384061544,
"learning_rate": 6.862914510720515e-05,
"loss": 0.1367,
"step": 141
},
{
"epoch": 0.8875,
"grad_norm": 0.09762698241197115,
"learning_rate": 6.812190191418508e-05,
"loss": 0.1269,
"step": 142
},
{
"epoch": 0.89375,
"grad_norm": 0.11873455243886127,
"learning_rate": 6.761250239606169e-05,
"loss": 0.1653,
"step": 143
},
{
"epoch": 0.9,
"grad_norm": 0.11143312549675602,
"learning_rate": 6.710100716628344e-05,
"loss": 0.163,
"step": 144
},
{
"epoch": 0.90625,
"grad_norm": 0.10224042656105548,
"learning_rate": 6.658747708766762e-05,
"loss": 0.1372,
"step": 145
},
{
"epoch": 0.9125,
"grad_norm": 0.10537786627093436,
"learning_rate": 6.607197326515808e-05,
"loss": 0.1666,
"step": 146
},
{
"epoch": 0.91875,
"grad_norm": 0.11598573107229065,
"learning_rate": 6.555455703855454e-05,
"loss": 0.1608,
"step": 147
},
{
"epoch": 0.925,
"grad_norm": 0.09995067555095581,
"learning_rate": 6.503528997521366e-05,
"loss": 0.1358,
"step": 148
},
{
"epoch": 0.93125,
"grad_norm": 0.1013311198834551,
"learning_rate": 6.451423386272312e-05,
"loss": 0.1467,
"step": 149
},
{
"epoch": 0.9375,
"grad_norm": 0.12955305425483832,
"learning_rate": 6.399145070154961e-05,
"loss": 0.1602,
"step": 150
},
{
"epoch": 0.94375,
"grad_norm": 0.11967014014153346,
"learning_rate": 6.346700269766132e-05,
"loss": 0.1491,
"step": 151
},
{
"epoch": 0.95,
"grad_norm": 0.09715431421285034,
"learning_rate": 6.294095225512603e-05,
"loss": 0.1503,
"step": 152
},
{
"epoch": 0.95625,
"grad_norm": 0.0993416996456672,
"learning_rate": 6.241336196868582e-05,
"loss": 0.1394,
"step": 153
},
{
"epoch": 0.9625,
"grad_norm": 0.11282985988320719,
"learning_rate": 6.188429461630866e-05,
"loss": 0.1471,
"step": 154
},
{
"epoch": 0.96875,
"grad_norm": 0.0953828839581251,
"learning_rate": 6.135381315171867e-05,
"loss": 0.1163,
"step": 155
},
{
"epoch": 0.975,
"grad_norm": 0.10585616381949887,
"learning_rate": 6.0821980696905146e-05,
"loss": 0.1414,
"step": 156
},
{
"epoch": 0.98125,
"grad_norm": 0.10303380839923655,
"learning_rate": 6.0288860534611745e-05,
"loss": 0.1371,
"step": 157
},
{
"epoch": 0.9875,
"grad_norm": 0.10581039986719955,
"learning_rate": 5.9754516100806423e-05,
"loss": 0.1591,
"step": 158
},
{
"epoch": 0.99375,
"grad_norm": 0.08949422220549504,
"learning_rate": 5.9219010977133173e-05,
"loss": 0.1136,
"step": 159
},
{
"epoch": 1.0,
"grad_norm": 0.2566631074675719,
"learning_rate": 5.868240888334653e-05,
"loss": 0.1355,
"step": 160
},
{
"epoch": 1.00625,
"grad_norm": 0.10977904119859523,
"learning_rate": 5.814477366972945e-05,
"loss": 0.1389,
"step": 161
},
{
"epoch": 1.0125,
"grad_norm": 0.09358588046249755,
"learning_rate": 5.7606169309495836e-05,
"loss": 0.1278,
"step": 162
},
{
"epoch": 1.01875,
"grad_norm": 0.1016639716973101,
"learning_rate": 5.706665989117839e-05,
"loss": 0.1266,
"step": 163
},
{
"epoch": 1.025,
"grad_norm": 0.10447824270529592,
"learning_rate": 5.6526309611002594e-05,
"loss": 0.1344,
"step": 164
},
{
"epoch": 1.03125,
"grad_norm": 0.09539752784620499,
"learning_rate": 5.5985182765248126e-05,
"loss": 0.11,
"step": 165
},
{
"epoch": 1.0375,
"grad_norm": 0.09870524777017345,
"learning_rate": 5.544334374259823e-05,
"loss": 0.1177,
"step": 166
},
{
"epoch": 1.04375,
"grad_norm": 0.1319083652806534,
"learning_rate": 5.490085701647805e-05,
"loss": 0.1519,
"step": 167
},
{
"epoch": 1.05,
"grad_norm": 0.11490589915219332,
"learning_rate": 5.435778713738292e-05,
"loss": 0.1388,
"step": 168
},
{
"epoch": 1.05625,
"grad_norm": 0.10960359392101734,
"learning_rate": 5.381419872519763e-05,
"loss": 0.1315,
"step": 169
},
{
"epoch": 1.0625,
"grad_norm": 0.11911874080836313,
"learning_rate": 5.327015646150716e-05,
"loss": 0.1513,
"step": 170
},
{
"epoch": 1.06875,
"grad_norm": 0.11667323294115564,
"learning_rate": 5.2725725081900325e-05,
"loss": 0.1265,
"step": 171
},
{
"epoch": 1.075,
"grad_norm": 0.12009873927124597,
"learning_rate": 5.218096936826681e-05,
"loss": 0.1346,
"step": 172
},
{
"epoch": 1.08125,
"grad_norm": 0.1096220905597413,
"learning_rate": 5.1635954141088813e-05,
"loss": 0.1129,
"step": 173
},
{
"epoch": 1.0875,
"grad_norm": 0.10881175717833912,
"learning_rate": 5.1090744251728064e-05,
"loss": 0.1031,
"step": 174
},
{
"epoch": 1.09375,
"grad_norm": 0.13152370066828684,
"learning_rate": 5.054540457470912e-05,
"loss": 0.1093,
"step": 175
},
{
"epoch": 1.1,
"grad_norm": 0.1363760209998771,
"learning_rate": 5e-05,
"loss": 0.1283,
"step": 176
},
{
"epoch": 1.10625,
"grad_norm": 0.11792037030369434,
"learning_rate": 4.945459542529089e-05,
"loss": 0.1163,
"step": 177
},
{
"epoch": 1.1125,
"grad_norm": 0.1291497029388642,
"learning_rate": 4.890925574827195e-05,
"loss": 0.1188,
"step": 178
},
{
"epoch": 1.11875,
"grad_norm": 0.15349633438432128,
"learning_rate": 4.83640458589112e-05,
"loss": 0.1742,
"step": 179
},
{
"epoch": 1.125,
"grad_norm": 0.13584411062076804,
"learning_rate": 4.781903063173321e-05,
"loss": 0.1217,
"step": 180
},
{
"epoch": 1.13125,
"grad_norm": 0.11830432386637907,
"learning_rate": 4.727427491809968e-05,
"loss": 0.1047,
"step": 181
},
{
"epoch": 1.1375,
"grad_norm": 0.13406559358836506,
"learning_rate": 4.6729843538492847e-05,
"loss": 0.1288,
"step": 182
},
{
"epoch": 1.14375,
"grad_norm": 0.12419804398498213,
"learning_rate": 4.618580127480238e-05,
"loss": 0.1123,
"step": 183
},
{
"epoch": 1.15,
"grad_norm": 0.11882741893557454,
"learning_rate": 4.564221286261709e-05,
"loss": 0.1096,
"step": 184
},
{
"epoch": 1.15625,
"grad_norm": 0.13277371284528894,
"learning_rate": 4.509914298352197e-05,
"loss": 0.1237,
"step": 185
},
{
"epoch": 1.1625,
"grad_norm": 0.13106816929984538,
"learning_rate": 4.4556656257401786e-05,
"loss": 0.1256,
"step": 186
},
{
"epoch": 1.16875,
"grad_norm": 0.10042628903496502,
"learning_rate": 4.4014817234751885e-05,
"loss": 0.087,
"step": 187
},
{
"epoch": 1.175,
"grad_norm": 0.12858065744554853,
"learning_rate": 4.347369038899744e-05,
"loss": 0.132,
"step": 188
},
{
"epoch": 1.18125,
"grad_norm": 0.13353933963282785,
"learning_rate": 4.2933340108821644e-05,
"loss": 0.1342,
"step": 189
},
{
"epoch": 1.1875,
"grad_norm": 0.12535560223690698,
"learning_rate": 4.239383069050417e-05,
"loss": 0.1144,
"step": 190
},
{
"epoch": 1.19375,
"grad_norm": 0.11919252614077346,
"learning_rate": 4.185522633027057e-05,
"loss": 0.11,
"step": 191
},
{
"epoch": 1.2,
"grad_norm": 0.12595711384481822,
"learning_rate": 4.131759111665349e-05,
"loss": 0.1256,
"step": 192
},
{
"epoch": 1.20625,
"grad_norm": 0.140189285907938,
"learning_rate": 4.078098902286683e-05,
"loss": 0.1387,
"step": 193
},
{
"epoch": 1.2125,
"grad_norm": 0.188759066767972,
"learning_rate": 4.0245483899193595e-05,
"loss": 0.1154,
"step": 194
},
{
"epoch": 1.21875,
"grad_norm": 0.12510322910576527,
"learning_rate": 3.971113946538826e-05,
"loss": 0.1277,
"step": 195
},
{
"epoch": 1.225,
"grad_norm": 0.13577535319914885,
"learning_rate": 3.917801930309486e-05,
"loss": 0.1245,
"step": 196
},
{
"epoch": 1.23125,
"grad_norm": 0.11279041105321824,
"learning_rate": 3.864618684828134e-05,
"loss": 0.1149,
"step": 197
},
{
"epoch": 1.2375,
"grad_norm": 0.11597618526997733,
"learning_rate": 3.8115705383691355e-05,
"loss": 0.1137,
"step": 198
},
{
"epoch": 1.24375,
"grad_norm": 0.21159784242538673,
"learning_rate": 3.758663803131418e-05,
"loss": 0.1411,
"step": 199
},
{
"epoch": 1.25,
"grad_norm": 0.12706926447242534,
"learning_rate": 3.705904774487396e-05,
"loss": 0.117,
"step": 200
},
{
"epoch": 1.25,
"eval_loss": 0.13555637001991272,
"eval_runtime": 15.1829,
"eval_samples_per_second": 0.461,
"eval_steps_per_second": 0.132,
"step": 200
},
{
"epoch": 1.25625,
"grad_norm": 0.140712680168253,
"learning_rate": 3.65329973023387e-05,
"loss": 0.1317,
"step": 201
},
{
"epoch": 1.2625,
"grad_norm": 0.14063508556547588,
"learning_rate": 3.60085492984504e-05,
"loss": 0.1485,
"step": 202
},
{
"epoch": 1.26875,
"grad_norm": 0.12420246892648465,
"learning_rate": 3.5485766137276894e-05,
"loss": 0.127,
"step": 203
},
{
"epoch": 1.275,
"grad_norm": 0.1375293323917167,
"learning_rate": 3.4964710024786354e-05,
"loss": 0.1298,
"step": 204
},
{
"epoch": 1.28125,
"grad_norm": 0.12567976945004108,
"learning_rate": 3.4445442961445464e-05,
"loss": 0.1218,
"step": 205
},
{
"epoch": 1.2875,
"grad_norm": 0.10902064060724474,
"learning_rate": 3.392802673484193e-05,
"loss": 0.0911,
"step": 206
},
{
"epoch": 1.29375,
"grad_norm": 0.1299840428381004,
"learning_rate": 3.341252291233241e-05,
"loss": 0.1105,
"step": 207
},
{
"epoch": 1.3,
"grad_norm": 0.14686027926986012,
"learning_rate": 3.289899283371657e-05,
"loss": 0.157,
"step": 208
},
{
"epoch": 1.30625,
"grad_norm": 0.13310525982308075,
"learning_rate": 3.2387497603938326e-05,
"loss": 0.1167,
"step": 209
},
{
"epoch": 1.3125,
"grad_norm": 0.23056279523041534,
"learning_rate": 3.1878098085814924e-05,
"loss": 0.1267,
"step": 210
},
{
"epoch": 1.31875,
"grad_norm": 0.13379086959343525,
"learning_rate": 3.137085489279485e-05,
"loss": 0.1405,
"step": 211
},
{
"epoch": 1.325,
"grad_norm": 0.16429012844656798,
"learning_rate": 3.086582838174551e-05,
"loss": 0.1454,
"step": 212
},
{
"epoch": 1.33125,
"grad_norm": 0.1472540477988046,
"learning_rate": 3.0363078645771303e-05,
"loss": 0.1228,
"step": 213
},
{
"epoch": 1.3375,
"grad_norm": 0.12810467911662524,
"learning_rate": 2.9862665507063147e-05,
"loss": 0.1155,
"step": 214
},
{
"epoch": 1.34375,
"grad_norm": 0.1349885611023629,
"learning_rate": 2.936464850978027e-05,
"loss": 0.114,
"step": 215
},
{
"epoch": 1.35,
"grad_norm": 0.2226264712891022,
"learning_rate": 2.886908691296504e-05,
"loss": 0.131,
"step": 216
},
{
"epoch": 1.35625,
"grad_norm": 0.14284289832580171,
"learning_rate": 2.8376039683491686e-05,
"loss": 0.118,
"step": 217
},
{
"epoch": 1.3625,
"grad_norm": 0.14670574239643006,
"learning_rate": 2.7885565489049946e-05,
"loss": 0.1206,
"step": 218
},
{
"epoch": 1.36875,
"grad_norm": 0.13807081126340223,
"learning_rate": 2.7397722691164018e-05,
"loss": 0.1196,
"step": 219
},
{
"epoch": 1.375,
"grad_norm": 0.1414426520091727,
"learning_rate": 2.6912569338248315e-05,
"loss": 0.1257,
"step": 220
},
{
"epoch": 1.38125,
"grad_norm": 0.12974232596021115,
"learning_rate": 2.6430163158700115e-05,
"loss": 0.1078,
"step": 221
},
{
"epoch": 1.3875,
"grad_norm": 0.12697025279359767,
"learning_rate": 2.595056155403063e-05,
"loss": 0.1039,
"step": 222
},
{
"epoch": 1.39375,
"grad_norm": 0.1291839881974509,
"learning_rate": 2.54738215920346e-05,
"loss": 0.1044,
"step": 223
},
{
"epoch": 1.4,
"grad_norm": 0.13740214702279938,
"learning_rate": 2.500000000000001e-05,
"loss": 0.1152,
"step": 224
},
{
"epoch": 1.40625,
"grad_norm": 0.1384802768122724,
"learning_rate": 2.4529153157957913e-05,
"loss": 0.1234,
"step": 225
},
{
"epoch": 1.4125,
"grad_norm": 0.13354781137733693,
"learning_rate": 2.4061337091973918e-05,
"loss": 0.1193,
"step": 226
},
{
"epoch": 1.41875,
"grad_norm": 0.1771648365806598,
"learning_rate": 2.3596607467481603e-05,
"loss": 0.1452,
"step": 227
},
{
"epoch": 1.425,
"grad_norm": 0.11638362220378876,
"learning_rate": 2.3135019582658802e-05,
"loss": 0.0936,
"step": 228
},
{
"epoch": 1.43125,
"grad_norm": 0.13555782216280093,
"learning_rate": 2.2676628361847836e-05,
"loss": 0.1194,
"step": 229
},
{
"epoch": 1.4375,
"grad_norm": 0.13796797972293184,
"learning_rate": 2.2221488349019903e-05,
"loss": 0.1328,
"step": 230
},
{
"epoch": 1.44375,
"grad_norm": 0.12977028041039126,
"learning_rate": 2.176965370128498e-05,
"loss": 0.1079,
"step": 231
},
{
"epoch": 1.45,
"grad_norm": 0.13740927778792686,
"learning_rate": 2.132117818244771e-05,
"loss": 0.1225,
"step": 232
},
{
"epoch": 1.45625,
"grad_norm": 0.15309969236799242,
"learning_rate": 2.08761151566099e-05,
"loss": 0.1521,
"step": 233
},
{
"epoch": 1.4625,
"grad_norm": 0.16080840678597888,
"learning_rate": 2.0434517581820896e-05,
"loss": 0.1214,
"step": 234
},
{
"epoch": 1.46875,
"grad_norm": 0.1462626524834281,
"learning_rate": 1.999643800377596e-05,
"loss": 0.1305,
"step": 235
},
{
"epoch": 1.475,
"grad_norm": 0.13708479721295597,
"learning_rate": 1.9561928549563968e-05,
"loss": 0.1259,
"step": 236
},
{
"epoch": 1.48125,
"grad_norm": 0.14485439862293895,
"learning_rate": 1.913104092146476e-05,
"loss": 0.1313,
"step": 237
},
{
"epoch": 1.4875,
"grad_norm": 0.1305624984599595,
"learning_rate": 1.8703826390797048e-05,
"loss": 0.1082,
"step": 238
},
{
"epoch": 1.49375,
"grad_norm": 0.12983841773778187,
"learning_rate": 1.8280335791817733e-05,
"loss": 0.1254,
"step": 239
},
{
"epoch": 1.5,
"grad_norm": 0.1305817772014699,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.0976,
"step": 240
},
{
"epoch": 1.50625,
"grad_norm": 0.13523687056968922,
"learning_rate": 1.7444727504402553e-05,
"loss": 0.118,
"step": 241
},
{
"epoch": 1.5125,
"grad_norm": 0.13276802064818327,
"learning_rate": 1.703270924499656e-05,
"loss": 0.1221,
"step": 242
},
{
"epoch": 1.51875,
"grad_norm": 0.14813379345753966,
"learning_rate": 1.662461376350764e-05,
"loss": 0.1415,
"step": 243
},
{
"epoch": 1.525,
"grad_norm": 0.1373736965280858,
"learning_rate": 1.622048961921699e-05,
"loss": 0.1241,
"step": 244
},
{
"epoch": 1.53125,
"grad_norm": 0.12987502342407337,
"learning_rate": 1.5820384898856434e-05,
"loss": 0.1109,
"step": 245
},
{
"epoch": 1.5375,
"grad_norm": 0.14097871387133398,
"learning_rate": 1.5424347210886538e-05,
"loss": 0.1171,
"step": 246
},
{
"epoch": 1.54375,
"grad_norm": 0.12911360333125718,
"learning_rate": 1.5032423679831642e-05,
"loss": 0.1087,
"step": 247
},
{
"epoch": 1.55,
"grad_norm": 0.13452146268445594,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.1228,
"step": 248
},
{
"epoch": 1.55625,
"grad_norm": 0.16983759281577876,
"learning_rate": 1.4261105133297692e-05,
"loss": 0.1217,
"step": 249
},
{
"epoch": 1.5625,
"grad_norm": 0.13444059283520182,
"learning_rate": 1.3881801897012225e-05,
"loss": 0.1114,
"step": 250
},
{
"epoch": 1.56875,
"grad_norm": 0.12954454455265535,
"learning_rate": 1.3506796365108232e-05,
"loss": 0.1022,
"step": 251
},
{
"epoch": 1.575,
"grad_norm": 0.17591202784634585,
"learning_rate": 1.3136133159493802e-05,
"loss": 0.1114,
"step": 252
},
{
"epoch": 1.58125,
"grad_norm": 0.1321105098614603,
"learning_rate": 1.2769856385383688e-05,
"loss": 0.1183,
"step": 253
},
{
"epoch": 1.5875,
"grad_norm": 0.1432184606258264,
"learning_rate": 1.2408009626051137e-05,
"loss": 0.1152,
"step": 254
},
{
"epoch": 1.59375,
"grad_norm": 0.1652313311913505,
"learning_rate": 1.2050635937641908e-05,
"loss": 0.0925,
"step": 255
},
{
"epoch": 1.6,
"grad_norm": 0.1363797254860259,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.1283,
"step": 256
},
{
"epoch": 1.60625,
"grad_norm": 0.14243039726380746,
"learning_rate": 1.134947733186315e-05,
"loss": 0.13,
"step": 257
},
{
"epoch": 1.6125,
"grad_norm": 0.1407260083641188,
"learning_rate": 1.100577584535592e-05,
"loss": 0.1107,
"step": 258
},
{
"epoch": 1.61875,
"grad_norm": 0.1359874658608621,
"learning_rate": 1.0666714281569151e-05,
"loss": 0.1061,
"step": 259
},
{
"epoch": 1.625,
"grad_norm": 0.1619598596563152,
"learning_rate": 1.0332332985438248e-05,
"loss": 0.1151,
"step": 260
},
{
"epoch": 1.63125,
"grad_norm": 0.14589312780062294,
"learning_rate": 1.000267174499352e-05,
"loss": 0.1288,
"step": 261
},
{
"epoch": 1.6375,
"grad_norm": 0.1448713969935771,
"learning_rate": 9.677769786625867e-06,
"loss": 0.1096,
"step": 262
},
{
"epoch": 1.64375,
"grad_norm": 0.15204585922412353,
"learning_rate": 9.357665770419244e-06,
"loss": 0.0998,
"step": 263
},
{
"epoch": 1.65,
"grad_norm": 0.132551181762264,
"learning_rate": 9.042397785550405e-06,
"loss": 0.1083,
"step": 264
},
{
"epoch": 1.65625,
"grad_norm": 0.17245754726915724,
"learning_rate": 8.732003345756811e-06,
"loss": 0.118,
"step": 265
},
{
"epoch": 1.6625,
"grad_norm": 0.13788212679850506,
"learning_rate": 8.426519384872733e-06,
"loss": 0.1088,
"step": 266
},
{
"epoch": 1.66875,
"grad_norm": 0.16823545202946685,
"learning_rate": 8.125982252434611e-06,
"loss": 0.1239,
"step": 267
},
{
"epoch": 1.675,
"grad_norm": 0.13554434267304058,
"learning_rate": 7.830427709355725e-06,
"loss": 0.1164,
"step": 268
},
{
"epoch": 1.68125,
"grad_norm": 0.12270084667138997,
"learning_rate": 7.539890923671062e-06,
"loss": 0.0855,
"step": 269
},
{
"epoch": 1.6875,
"grad_norm": 0.1610350461050514,
"learning_rate": 7.2544064663526815e-06,
"loss": 0.1395,
"step": 270
},
{
"epoch": 1.69375,
"grad_norm": 0.13371983154995107,
"learning_rate": 6.974008307196056e-06,
"loss": 0.1081,
"step": 271
},
{
"epoch": 1.7,
"grad_norm": 0.15579745002988243,
"learning_rate": 6.698729810778065e-06,
"loss": 0.1358,
"step": 272
},
{
"epoch": 1.70625,
"grad_norm": 0.13901889620059626,
"learning_rate": 6.428603732486937e-06,
"loss": 0.1208,
"step": 273
},
{
"epoch": 1.7125,
"grad_norm": 0.15932824101956905,
"learning_rate": 6.163662214624616e-06,
"loss": 0.1208,
"step": 274
},
{
"epoch": 1.71875,
"grad_norm": 0.14809539420100104,
"learning_rate": 5.903936782582253e-06,
"loss": 0.0961,
"step": 275
},
{
"epoch": 1.725,
"grad_norm": 0.14762764130305206,
"learning_rate": 5.649458341088915e-06,
"loss": 0.1328,
"step": 276
},
{
"epoch": 1.73125,
"grad_norm": 0.14698627279133403,
"learning_rate": 5.400257170534295e-06,
"loss": 0.1137,
"step": 277
},
{
"epoch": 1.7375,
"grad_norm": 0.12766172584371513,
"learning_rate": 5.156362923365588e-06,
"loss": 0.0963,
"step": 278
},
{
"epoch": 1.74375,
"grad_norm": 0.15550557920695354,
"learning_rate": 4.917804620559202e-06,
"loss": 0.1299,
"step": 279
},
{
"epoch": 1.75,
"grad_norm": 0.13371200089329044,
"learning_rate": 4.684610648167503e-06,
"loss": 0.1003,
"step": 280
},
{
"epoch": 1.75625,
"grad_norm": 0.19718769962541566,
"learning_rate": 4.456808753941205e-06,
"loss": 0.1005,
"step": 281
},
{
"epoch": 1.7625,
"grad_norm": 0.14609277866526107,
"learning_rate": 4.234426044027645e-06,
"loss": 0.1184,
"step": 282
},
{
"epoch": 1.76875,
"grad_norm": 0.13846373587108526,
"learning_rate": 4.017488979745387e-06,
"loss": 0.1083,
"step": 283
},
{
"epoch": 1.775,
"grad_norm": 0.16152586345422645,
"learning_rate": 3.8060233744356633e-06,
"loss": 0.1349,
"step": 284
},
{
"epoch": 1.78125,
"grad_norm": 0.15220108167136218,
"learning_rate": 3.600054390390778e-06,
"loss": 0.1189,
"step": 285
},
{
"epoch": 1.7875,
"grad_norm": 0.14825944302091063,
"learning_rate": 3.3996065358600782e-06,
"loss": 0.1124,
"step": 286
},
{
"epoch": 1.79375,
"grad_norm": 0.17101334397597814,
"learning_rate": 3.2047036621337236e-06,
"loss": 0.1242,
"step": 287
},
{
"epoch": 1.8,
"grad_norm": 0.15468868193258223,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.1269,
"step": 288
},
{
"epoch": 1.80625,
"grad_norm": 0.13742574401888047,
"learning_rate": 2.8316249605087386e-06,
"loss": 0.108,
"step": 289
},
{
"epoch": 1.8125,
"grad_norm": 0.12723180285999527,
"learning_rate": 2.653493525244721e-06,
"loss": 0.104,
"step": 290
},
{
"epoch": 1.81875,
"grad_norm": 0.15299833624657141,
"learning_rate": 2.4809958507719444e-06,
"loss": 0.1227,
"step": 291
},
{
"epoch": 1.825,
"grad_norm": 0.153187221540368,
"learning_rate": 2.314152462588659e-06,
"loss": 0.1122,
"step": 292
},
{
"epoch": 1.83125,
"grad_norm": 0.1566217766687287,
"learning_rate": 2.152983213389559e-06,
"loss": 0.1206,
"step": 293
},
{
"epoch": 1.8375,
"grad_norm": 0.1619426370013769,
"learning_rate": 1.99750728070357e-06,
"loss": 0.0849,
"step": 294
},
{
"epoch": 1.84375,
"grad_norm": 0.1440771616743465,
"learning_rate": 1.8477431646118648e-06,
"loss": 0.1168,
"step": 295
},
{
"epoch": 1.85,
"grad_norm": 0.1266403405941791,
"learning_rate": 1.70370868554659e-06,
"loss": 0.1002,
"step": 296
},
{
"epoch": 1.85625,
"grad_norm": 0.14490235679395352,
"learning_rate": 1.565420982170346e-06,
"loss": 0.1204,
"step": 297
},
{
"epoch": 1.8625,
"grad_norm": 0.14214631954361082,
"learning_rate": 1.4328965093369283e-06,
"loss": 0.1052,
"step": 298
},
{
"epoch": 1.86875,
"grad_norm": 0.15125342239263603,
"learning_rate": 1.3061510361333185e-06,
"loss": 0.1185,
"step": 299
},
{
"epoch": 1.875,
"grad_norm": 0.14873736163107223,
"learning_rate": 1.1851996440033319e-06,
"loss": 0.1372,
"step": 300
},
{
"epoch": 1.88125,
"grad_norm": 0.15823839872382495,
"learning_rate": 1.0700567249530834e-06,
"loss": 0.1374,
"step": 301
},
{
"epoch": 1.8875,
"grad_norm": 0.1477147886939706,
"learning_rate": 9.607359798384785e-07,
"loss": 0.1265,
"step": 302
},
{
"epoch": 1.89375,
"grad_norm": 0.149870799299156,
"learning_rate": 8.572504167349449e-07,
"loss": 0.1156,
"step": 303
},
{
"epoch": 1.9,
"grad_norm": 0.13907104929191907,
"learning_rate": 7.596123493895991e-07,
"loss": 0.1131,
"step": 304
},
{
"epoch": 1.90625,
"grad_norm": 0.1438881017950769,
"learning_rate": 6.678333957560512e-07,
"loss": 0.1212,
"step": 305
},
{
"epoch": 1.9125,
"grad_norm": 0.14174173341102103,
"learning_rate": 5.81924476611967e-07,
"loss": 0.1249,
"step": 306
},
{
"epoch": 1.91875,
"grad_norm": 0.1409441874264998,
"learning_rate": 5.018958142596065e-07,
"loss": 0.1123,
"step": 307
},
{
"epoch": 1.925,
"grad_norm": 0.16658637594296666,
"learning_rate": 4.277569313094809e-07,
"loss": 0.1017,
"step": 308
},
{
"epoch": 1.93125,
"grad_norm": 0.1767438653947937,
"learning_rate": 3.59516649547248e-07,
"loss": 0.1265,
"step": 309
},
{
"epoch": 1.9375,
"grad_norm": 0.15957908409005475,
"learning_rate": 2.971830888840177e-07,
"loss": 0.139,
"step": 310
},
{
"epoch": 1.94375,
"grad_norm": 0.13358713671440797,
"learning_rate": 2.407636663901591e-07,
"loss": 0.1015,
"step": 311
},
{
"epoch": 1.95,
"grad_norm": 0.14596552013446734,
"learning_rate": 1.9026509541272275e-07,
"loss": 0.1294,
"step": 312
},
{
"epoch": 1.95625,
"grad_norm": 0.15582989274250142,
"learning_rate": 1.4569338477666838e-07,
"loss": 0.1282,
"step": 313
},
{
"epoch": 1.9625,
"grad_norm": 0.14404601592111216,
"learning_rate": 1.0705383806982606e-07,
"loss": 0.1103,
"step": 314
},
{
"epoch": 1.96875,
"grad_norm": 0.1591129940081992,
"learning_rate": 7.43510530118452e-08,
"loss": 0.1114,
"step": 315
},
{
"epoch": 1.975,
"grad_norm": 0.14377087428625243,
"learning_rate": 4.7588920907110094e-08,
"loss": 0.1202,
"step": 316
},
{
"epoch": 1.98125,
"grad_norm": 0.14890980505746196,
"learning_rate": 2.6770626181715773e-08,
"loss": 0.1172,
"step": 317
},
{
"epoch": 1.9875,
"grad_norm": 0.1536343383847314,
"learning_rate": 1.189864600454338e-08,
"loss": 0.1239,
"step": 318
},
{
"epoch": 1.99375,
"grad_norm": 0.15806075985905502,
"learning_rate": 2.974749992512571e-09,
"loss": 0.1275,
"step": 319
},
{
"epoch": 2.0,
"grad_norm": 0.14605676768659354,
"learning_rate": 0.0,
"loss": 0.1243,
"step": 320
},
{
"epoch": 2.0,
"step": 320,
"total_flos": 1567061767618560.0,
"train_loss": 0.15366401586215944,
"train_runtime": 5710.924,
"train_samples_per_second": 0.224,
"train_steps_per_second": 0.056
}
],
"logging_steps": 1,
"max_steps": 320,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1567061767618560.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}