{
"best_global_step": 648,
"best_metric": 0.00023728572705294937,
"best_model_checkpoint": "Qwen2.5_3B_VL_flip_detection/checkpoint-648",
"epoch": 1.0,
"eval_steps": 72,
"global_step": 725,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001380738695201933,
"grad_norm": 318.0,
"learning_rate": 0.0,
"loss": 6.0874,
"step": 1
},
{
"epoch": 0.002761477390403866,
"grad_norm": 326.0,
"learning_rate": 9.090909090909091e-07,
"loss": 6.1564,
"step": 2
},
{
"epoch": 0.004142216085605799,
"grad_norm": 312.0,
"learning_rate": 1.8181818181818183e-06,
"loss": 6.0274,
"step": 3
},
{
"epoch": 0.005522954780807732,
"grad_norm": 338.0,
"learning_rate": 2.7272727272727272e-06,
"loss": 5.9388,
"step": 4
},
{
"epoch": 0.0069036934760096655,
"grad_norm": 282.0,
"learning_rate": 3.6363636363636366e-06,
"loss": 5.6881,
"step": 5
},
{
"epoch": 0.008284432171211598,
"grad_norm": 290.0,
"learning_rate": 4.5454545454545455e-06,
"loss": 5.5135,
"step": 6
},
{
"epoch": 0.009665170866413532,
"grad_norm": 219.0,
"learning_rate": 5.4545454545454545e-06,
"loss": 4.9453,
"step": 7
},
{
"epoch": 0.011045909561615464,
"grad_norm": 140.0,
"learning_rate": 6.363636363636364e-06,
"loss": 4.438,
"step": 8
},
{
"epoch": 0.012426648256817397,
"grad_norm": 125.0,
"learning_rate": 7.272727272727273e-06,
"loss": 4.0221,
"step": 9
},
{
"epoch": 0.013807386952019331,
"grad_norm": 152.0,
"learning_rate": 8.181818181818183e-06,
"loss": 3.5748,
"step": 10
},
{
"epoch": 0.015188125647221263,
"grad_norm": 84.0,
"learning_rate": 9.090909090909091e-06,
"loss": 3.0537,
"step": 11
},
{
"epoch": 0.016568864342423197,
"grad_norm": 72.0,
"learning_rate": 1e-05,
"loss": 2.5683,
"step": 12
},
{
"epoch": 0.01794960303762513,
"grad_norm": 62.0,
"learning_rate": 1.0909090909090909e-05,
"loss": 2.1136,
"step": 13
},
{
"epoch": 0.019330341732827064,
"grad_norm": 63.75,
"learning_rate": 1.181818181818182e-05,
"loss": 1.8072,
"step": 14
},
{
"epoch": 0.020711080428028994,
"grad_norm": 66.0,
"learning_rate": 1.2727272727272728e-05,
"loss": 1.4696,
"step": 15
},
{
"epoch": 0.022091819123230928,
"grad_norm": 49.25,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.1919,
"step": 16
},
{
"epoch": 0.02347255781843286,
"grad_norm": 53.75,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.9495,
"step": 17
},
{
"epoch": 0.024853296513634795,
"grad_norm": 46.5,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.8079,
"step": 18
},
{
"epoch": 0.02623403520883673,
"grad_norm": 58.75,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.6889,
"step": 19
},
{
"epoch": 0.027614773904038662,
"grad_norm": 31.625,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.5888,
"step": 20
},
{
"epoch": 0.028995512599240592,
"grad_norm": 61.75,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.5087,
"step": 21
},
{
"epoch": 0.030376251294442526,
"grad_norm": 21.625,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.4641,
"step": 22
},
{
"epoch": 0.03175698998964446,
"grad_norm": 37.25,
"learning_rate": 2e-05,
"loss": 0.4248,
"step": 23
},
{
"epoch": 0.03313772868484639,
"grad_norm": 19.25,
"learning_rate": 1.997155049786629e-05,
"loss": 0.3976,
"step": 24
},
{
"epoch": 0.03451846738004832,
"grad_norm": 54.25,
"learning_rate": 1.9943100995732575e-05,
"loss": 0.3755,
"step": 25
},
{
"epoch": 0.03589920607525026,
"grad_norm": 37.75,
"learning_rate": 1.9914651493598865e-05,
"loss": 0.3674,
"step": 26
},
{
"epoch": 0.03727994477045219,
"grad_norm": 12.3125,
"learning_rate": 1.9886201991465152e-05,
"loss": 0.3343,
"step": 27
},
{
"epoch": 0.03866068346565413,
"grad_norm": 23.875,
"learning_rate": 1.985775248933144e-05,
"loss": 0.3108,
"step": 28
},
{
"epoch": 0.04004142216085606,
"grad_norm": 16.0,
"learning_rate": 1.9829302987197725e-05,
"loss": 0.3134,
"step": 29
},
{
"epoch": 0.04142216085605799,
"grad_norm": 140.0,
"learning_rate": 1.9800853485064012e-05,
"loss": 0.2831,
"step": 30
},
{
"epoch": 0.042802899551259925,
"grad_norm": 12.875,
"learning_rate": 1.97724039829303e-05,
"loss": 0.2804,
"step": 31
},
{
"epoch": 0.044183638246461855,
"grad_norm": 70.0,
"learning_rate": 1.974395448079659e-05,
"loss": 0.2625,
"step": 32
},
{
"epoch": 0.04556437694166379,
"grad_norm": 20.625,
"learning_rate": 1.9715504978662876e-05,
"loss": 0.2517,
"step": 33
},
{
"epoch": 0.04694511563686572,
"grad_norm": 33.25,
"learning_rate": 1.9687055476529162e-05,
"loss": 0.2638,
"step": 34
},
{
"epoch": 0.04832585433206766,
"grad_norm": 7.1875,
"learning_rate": 1.965860597439545e-05,
"loss": 0.2471,
"step": 35
},
{
"epoch": 0.04970659302726959,
"grad_norm": 4.53125,
"learning_rate": 1.9630156472261736e-05,
"loss": 0.228,
"step": 36
},
{
"epoch": 0.05108733172247152,
"grad_norm": 3.828125,
"learning_rate": 1.9601706970128026e-05,
"loss": 0.2205,
"step": 37
},
{
"epoch": 0.05246807041767346,
"grad_norm": 8.8125,
"learning_rate": 1.957325746799431e-05,
"loss": 0.221,
"step": 38
},
{
"epoch": 0.05384880911287539,
"grad_norm": 22.25,
"learning_rate": 1.95448079658606e-05,
"loss": 0.2166,
"step": 39
},
{
"epoch": 0.055229547808077324,
"grad_norm": 3.203125,
"learning_rate": 1.9516358463726886e-05,
"loss": 0.206,
"step": 40
},
{
"epoch": 0.056610286503279254,
"grad_norm": 10.5,
"learning_rate": 1.9487908961593173e-05,
"loss": 0.2014,
"step": 41
},
{
"epoch": 0.057991025198481184,
"grad_norm": 3.265625,
"learning_rate": 1.9459459459459463e-05,
"loss": 0.1947,
"step": 42
},
{
"epoch": 0.05937176389368312,
"grad_norm": 8.875,
"learning_rate": 1.943100995732575e-05,
"loss": 0.1937,
"step": 43
},
{
"epoch": 0.06075250258888505,
"grad_norm": 4.375,
"learning_rate": 1.9402560455192037e-05,
"loss": 0.1874,
"step": 44
},
{
"epoch": 0.06213324128408699,
"grad_norm": 3.625,
"learning_rate": 1.9374110953058323e-05,
"loss": 0.1797,
"step": 45
},
{
"epoch": 0.06351397997928893,
"grad_norm": 3.359375,
"learning_rate": 1.934566145092461e-05,
"loss": 0.177,
"step": 46
},
{
"epoch": 0.06489471867449086,
"grad_norm": 4.0,
"learning_rate": 1.9317211948790897e-05,
"loss": 0.1697,
"step": 47
},
{
"epoch": 0.06627545736969279,
"grad_norm": 3.359375,
"learning_rate": 1.9288762446657187e-05,
"loss": 0.163,
"step": 48
},
{
"epoch": 0.06765619606489472,
"grad_norm": 3.46875,
"learning_rate": 1.926031294452347e-05,
"loss": 0.1563,
"step": 49
},
{
"epoch": 0.06903693476009665,
"grad_norm": 3.8125,
"learning_rate": 1.923186344238976e-05,
"loss": 0.1531,
"step": 50
},
{
"epoch": 0.07041767345529859,
"grad_norm": 5.1875,
"learning_rate": 1.9203413940256047e-05,
"loss": 0.1461,
"step": 51
},
{
"epoch": 0.07179841215050052,
"grad_norm": 3.265625,
"learning_rate": 1.9174964438122334e-05,
"loss": 0.1408,
"step": 52
},
{
"epoch": 0.07317915084570245,
"grad_norm": 6.21875,
"learning_rate": 1.9146514935988624e-05,
"loss": 0.1403,
"step": 53
},
{
"epoch": 0.07455988954090438,
"grad_norm": 3.15625,
"learning_rate": 1.9118065433854907e-05,
"loss": 0.1285,
"step": 54
},
{
"epoch": 0.07594062823610631,
"grad_norm": 3.125,
"learning_rate": 1.9089615931721197e-05,
"loss": 0.1205,
"step": 55
},
{
"epoch": 0.07732136693130826,
"grad_norm": 3.3125,
"learning_rate": 1.9061166429587484e-05,
"loss": 0.1147,
"step": 56
},
{
"epoch": 0.07870210562651019,
"grad_norm": 3.0625,
"learning_rate": 1.903271692745377e-05,
"loss": 0.1054,
"step": 57
},
{
"epoch": 0.08008284432171212,
"grad_norm": 5.15625,
"learning_rate": 1.9004267425320058e-05,
"loss": 0.1014,
"step": 58
},
{
"epoch": 0.08146358301691405,
"grad_norm": 3.3125,
"learning_rate": 1.8975817923186348e-05,
"loss": 0.0919,
"step": 59
},
{
"epoch": 0.08284432171211598,
"grad_norm": 3.078125,
"learning_rate": 1.894736842105263e-05,
"loss": 0.0857,
"step": 60
},
{
"epoch": 0.08422506040731792,
"grad_norm": 2.921875,
"learning_rate": 1.891891891891892e-05,
"loss": 0.0781,
"step": 61
},
{
"epoch": 0.08560579910251985,
"grad_norm": 2.890625,
"learning_rate": 1.8890469416785208e-05,
"loss": 0.0702,
"step": 62
},
{
"epoch": 0.08698653779772178,
"grad_norm": 2.734375,
"learning_rate": 1.8862019914651495e-05,
"loss": 0.0619,
"step": 63
},
{
"epoch": 0.08836727649292371,
"grad_norm": 2.53125,
"learning_rate": 1.8833570412517785e-05,
"loss": 0.055,
"step": 64
},
{
"epoch": 0.08974801518812564,
"grad_norm": 2.46875,
"learning_rate": 1.8805120910384068e-05,
"loss": 0.049,
"step": 65
},
{
"epoch": 0.09112875388332758,
"grad_norm": 2.53125,
"learning_rate": 1.8776671408250358e-05,
"loss": 0.0431,
"step": 66
},
{
"epoch": 0.09250949257852951,
"grad_norm": 2.734375,
"learning_rate": 1.8748221906116645e-05,
"loss": 0.0366,
"step": 67
},
{
"epoch": 0.09389023127373144,
"grad_norm": 4.0625,
"learning_rate": 1.871977240398293e-05,
"loss": 0.0433,
"step": 68
},
{
"epoch": 0.09527096996893337,
"grad_norm": 1.6875,
"learning_rate": 1.869132290184922e-05,
"loss": 0.0275,
"step": 69
},
{
"epoch": 0.09665170866413532,
"grad_norm": 1.7109375,
"learning_rate": 1.8662873399715505e-05,
"loss": 0.0239,
"step": 70
},
{
"epoch": 0.09803244735933725,
"grad_norm": 1.53125,
"learning_rate": 1.8634423897581795e-05,
"loss": 0.0238,
"step": 71
},
{
"epoch": 0.09941318605453918,
"grad_norm": 2.984375,
"learning_rate": 1.8605974395448082e-05,
"loss": 0.019,
"step": 72
},
{
"epoch": 0.09941318605453918,
"eval_loss": 0.016371596604585648,
"eval_runtime": 592.2167,
"eval_samples_per_second": 2.175,
"eval_steps_per_second": 2.175,
"step": 72
},
{
"epoch": 0.10079392474974111,
"grad_norm": 1.515625,
"learning_rate": 1.857752489331437e-05,
"loss": 0.0154,
"step": 73
},
{
"epoch": 0.10217466344494304,
"grad_norm": 0.9296875,
"learning_rate": 1.8549075391180655e-05,
"loss": 0.013,
"step": 74
},
{
"epoch": 0.10355540214014498,
"grad_norm": 0.91796875,
"learning_rate": 1.8520625889046942e-05,
"loss": 0.0154,
"step": 75
},
{
"epoch": 0.10493614083534691,
"grad_norm": 1.2734375,
"learning_rate": 1.849217638691323e-05,
"loss": 0.0101,
"step": 76
},
{
"epoch": 0.10631687953054884,
"grad_norm": 2.03125,
"learning_rate": 1.846372688477952e-05,
"loss": 0.0098,
"step": 77
},
{
"epoch": 0.10769761822575077,
"grad_norm": 6.3125,
"learning_rate": 1.8435277382645806e-05,
"loss": 0.0114,
"step": 78
},
{
"epoch": 0.1090783569209527,
"grad_norm": 0.51171875,
"learning_rate": 1.8406827880512092e-05,
"loss": 0.0065,
"step": 79
},
{
"epoch": 0.11045909561615465,
"grad_norm": 0.6796875,
"learning_rate": 1.8378378378378383e-05,
"loss": 0.0067,
"step": 80
},
{
"epoch": 0.11183983431135658,
"grad_norm": 1.0859375,
"learning_rate": 1.8349928876244666e-05,
"loss": 0.0109,
"step": 81
},
{
"epoch": 0.11322057300655851,
"grad_norm": 0.37109375,
"learning_rate": 1.8321479374110956e-05,
"loss": 0.0045,
"step": 82
},
{
"epoch": 0.11460131170176044,
"grad_norm": 0.6875,
"learning_rate": 1.8293029871977243e-05,
"loss": 0.0065,
"step": 83
},
{
"epoch": 0.11598205039696237,
"grad_norm": 0.37109375,
"learning_rate": 1.826458036984353e-05,
"loss": 0.0036,
"step": 84
},
{
"epoch": 0.11736278909216431,
"grad_norm": 0.25390625,
"learning_rate": 1.8236130867709816e-05,
"loss": 0.0029,
"step": 85
},
{
"epoch": 0.11874352778736624,
"grad_norm": 0.22265625,
"learning_rate": 1.8207681365576103e-05,
"loss": 0.0025,
"step": 86
},
{
"epoch": 0.12012426648256817,
"grad_norm": 0.208984375,
"learning_rate": 1.817923186344239e-05,
"loss": 0.0023,
"step": 87
},
{
"epoch": 0.1215050051777701,
"grad_norm": 0.3125,
"learning_rate": 1.815078236130868e-05,
"loss": 0.0021,
"step": 88
},
{
"epoch": 0.12288574387297203,
"grad_norm": 0.1748046875,
"learning_rate": 1.8122332859174966e-05,
"loss": 0.0019,
"step": 89
},
{
"epoch": 0.12426648256817398,
"grad_norm": 0.150390625,
"learning_rate": 1.8093883357041253e-05,
"loss": 0.0016,
"step": 90
},
{
"epoch": 0.1256472212633759,
"grad_norm": 0.134765625,
"learning_rate": 1.806543385490754e-05,
"loss": 0.0014,
"step": 91
},
{
"epoch": 0.12702795995857785,
"grad_norm": 0.96484375,
"learning_rate": 1.8036984352773827e-05,
"loss": 0.0027,
"step": 92
},
{
"epoch": 0.12840869865377977,
"grad_norm": 0.1162109375,
"learning_rate": 1.8008534850640117e-05,
"loss": 0.0012,
"step": 93
},
{
"epoch": 0.1297894373489817,
"grad_norm": 0.107421875,
"learning_rate": 1.7980085348506404e-05,
"loss": 0.0011,
"step": 94
},
{
"epoch": 0.13117017604418363,
"grad_norm": 0.6015625,
"learning_rate": 1.795163584637269e-05,
"loss": 0.0025,
"step": 95
},
{
"epoch": 0.13255091473938557,
"grad_norm": 0.0869140625,
"learning_rate": 1.7923186344238977e-05,
"loss": 0.0009,
"step": 96
},
{
"epoch": 0.13393165343458752,
"grad_norm": 0.1396484375,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.001,
"step": 97
},
{
"epoch": 0.13531239212978943,
"grad_norm": 0.078125,
"learning_rate": 1.7866287339971554e-05,
"loss": 0.0008,
"step": 98
},
{
"epoch": 0.13669313082499138,
"grad_norm": 0.1689453125,
"learning_rate": 1.783783783783784e-05,
"loss": 0.0015,
"step": 99
},
{
"epoch": 0.1380738695201933,
"grad_norm": 0.255859375,
"learning_rate": 1.7809388335704127e-05,
"loss": 0.001,
"step": 100
},
{
"epoch": 0.13945460821539524,
"grad_norm": 0.06640625,
"learning_rate": 1.7780938833570414e-05,
"loss": 0.0007,
"step": 101
},
{
"epoch": 0.14083534691059718,
"grad_norm": 0.0634765625,
"learning_rate": 1.77524893314367e-05,
"loss": 0.0006,
"step": 102
},
{
"epoch": 0.1422160856057991,
"grad_norm": 0.058837890625,
"learning_rate": 1.7724039829302988e-05,
"loss": 0.0006,
"step": 103
},
{
"epoch": 0.14359682430100104,
"grad_norm": 0.053955078125,
"learning_rate": 1.7695590327169278e-05,
"loss": 0.0005,
"step": 104
},
{
"epoch": 0.14497756299620296,
"grad_norm": 0.054931640625,
"learning_rate": 1.766714082503556e-05,
"loss": 0.0005,
"step": 105
},
{
"epoch": 0.1463583016914049,
"grad_norm": 0.054443359375,
"learning_rate": 1.763869132290185e-05,
"loss": 0.0005,
"step": 106
},
{
"epoch": 0.14773904038660685,
"grad_norm": 0.053466796875,
"learning_rate": 1.7610241820768138e-05,
"loss": 0.0005,
"step": 107
},
{
"epoch": 0.14911977908180876,
"grad_norm": 0.81640625,
"learning_rate": 1.7581792318634425e-05,
"loss": 0.001,
"step": 108
},
{
"epoch": 0.1505005177770107,
"grad_norm": 0.048583984375,
"learning_rate": 1.7553342816500715e-05,
"loss": 0.0005,
"step": 109
},
{
"epoch": 0.15188125647221262,
"grad_norm": 0.05322265625,
"learning_rate": 1.7524893314367e-05,
"loss": 0.0005,
"step": 110
},
{
"epoch": 0.15326199516741457,
"grad_norm": 0.26953125,
"learning_rate": 1.7496443812233288e-05,
"loss": 0.0005,
"step": 111
},
{
"epoch": 0.1546427338626165,
"grad_norm": 2.5625,
"learning_rate": 1.7467994310099575e-05,
"loss": 0.0025,
"step": 112
},
{
"epoch": 0.15602347255781843,
"grad_norm": 0.4609375,
"learning_rate": 1.743954480796586e-05,
"loss": 0.0029,
"step": 113
},
{
"epoch": 0.15740421125302037,
"grad_norm": 0.56640625,
"learning_rate": 1.741109530583215e-05,
"loss": 0.0007,
"step": 114
},
{
"epoch": 0.1587849499482223,
"grad_norm": 0.044189453125,
"learning_rate": 1.738264580369844e-05,
"loss": 0.0004,
"step": 115
},
{
"epoch": 0.16016568864342423,
"grad_norm": 0.04443359375,
"learning_rate": 1.7354196301564722e-05,
"loss": 0.0004,
"step": 116
},
{
"epoch": 0.16154642733862618,
"grad_norm": 0.045654296875,
"learning_rate": 1.7325746799431012e-05,
"loss": 0.0004,
"step": 117
},
{
"epoch": 0.1629271660338281,
"grad_norm": 0.09033203125,
"learning_rate": 1.72972972972973e-05,
"loss": 0.0005,
"step": 118
},
{
"epoch": 0.16430790472903004,
"grad_norm": 0.046875,
"learning_rate": 1.7268847795163585e-05,
"loss": 0.0004,
"step": 119
},
{
"epoch": 0.16568864342423195,
"grad_norm": 0.0439453125,
"learning_rate": 1.7240398293029875e-05,
"loss": 0.0004,
"step": 120
},
{
"epoch": 0.1670693821194339,
"grad_norm": 0.41015625,
"learning_rate": 1.721194879089616e-05,
"loss": 0.0056,
"step": 121
},
{
"epoch": 0.16845012081463584,
"grad_norm": 0.07275390625,
"learning_rate": 1.718349928876245e-05,
"loss": 0.0005,
"step": 122
},
{
"epoch": 0.16983085950983776,
"grad_norm": 0.042236328125,
"learning_rate": 1.7155049786628736e-05,
"loss": 0.0004,
"step": 123
},
{
"epoch": 0.1712115982050397,
"grad_norm": 0.10107421875,
"learning_rate": 1.7126600284495022e-05,
"loss": 0.0005,
"step": 124
},
{
"epoch": 0.17259233690024162,
"grad_norm": 0.040283203125,
"learning_rate": 1.709815078236131e-05,
"loss": 0.0004,
"step": 125
},
{
"epoch": 0.17397307559544356,
"grad_norm": 0.28515625,
"learning_rate": 1.7069701280227596e-05,
"loss": 0.0034,
"step": 126
},
{
"epoch": 0.1753538142906455,
"grad_norm": 0.146484375,
"learning_rate": 1.7041251778093886e-05,
"loss": 0.0008,
"step": 127
},
{
"epoch": 0.17673455298584742,
"grad_norm": 0.279296875,
"learning_rate": 1.7012802275960173e-05,
"loss": 0.0019,
"step": 128
},
{
"epoch": 0.17811529168104936,
"grad_norm": 0.25390625,
"learning_rate": 1.698435277382646e-05,
"loss": 0.0018,
"step": 129
},
{
"epoch": 0.17949603037625128,
"grad_norm": 0.10498046875,
"learning_rate": 1.6955903271692746e-05,
"loss": 0.0006,
"step": 130
},
{
"epoch": 0.18087676907145323,
"grad_norm": 0.1474609375,
"learning_rate": 1.6927453769559036e-05,
"loss": 0.001,
"step": 131
},
{
"epoch": 0.18225750776665517,
"grad_norm": 0.12060546875,
"learning_rate": 1.689900426742532e-05,
"loss": 0.0011,
"step": 132
},
{
"epoch": 0.18363824646185709,
"grad_norm": 0.034912109375,
"learning_rate": 1.687055476529161e-05,
"loss": 0.0003,
"step": 133
},
{
"epoch": 0.18501898515705903,
"grad_norm": 0.03515625,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.0003,
"step": 134
},
{
"epoch": 0.18639972385226097,
"grad_norm": 0.08544921875,
"learning_rate": 1.6813655761024183e-05,
"loss": 0.0007,
"step": 135
},
{
"epoch": 0.1877804625474629,
"grad_norm": 0.03466796875,
"learning_rate": 1.6785206258890473e-05,
"loss": 0.0003,
"step": 136
},
{
"epoch": 0.18916120124266483,
"grad_norm": 0.033203125,
"learning_rate": 1.6756756756756757e-05,
"loss": 0.0003,
"step": 137
},
{
"epoch": 0.19054193993786675,
"grad_norm": 0.1650390625,
"learning_rate": 1.6728307254623047e-05,
"loss": 0.0004,
"step": 138
},
{
"epoch": 0.1919226786330687,
"grad_norm": 0.0703125,
"learning_rate": 1.6699857752489334e-05,
"loss": 0.0004,
"step": 139
},
{
"epoch": 0.19330341732827064,
"grad_norm": 0.03955078125,
"learning_rate": 1.667140825035562e-05,
"loss": 0.0004,
"step": 140
},
{
"epoch": 0.19468415602347255,
"grad_norm": 0.138671875,
"learning_rate": 1.6642958748221907e-05,
"loss": 0.0004,
"step": 141
},
{
"epoch": 0.1960648947186745,
"grad_norm": 1.015625,
"learning_rate": 1.6614509246088194e-05,
"loss": 0.0013,
"step": 142
},
{
"epoch": 0.19744563341387641,
"grad_norm": 0.03271484375,
"learning_rate": 1.658605974395448e-05,
"loss": 0.0003,
"step": 143
},
{
"epoch": 0.19882637210907836,
"grad_norm": 0.03369140625,
"learning_rate": 1.655761024182077e-05,
"loss": 0.0003,
"step": 144
},
{
"epoch": 0.19882637210907836,
"eval_loss": 0.0005994443781673908,
"eval_runtime": 594.9971,
"eval_samples_per_second": 2.165,
"eval_steps_per_second": 2.165,
"step": 144
},
{
"epoch": 0.2002071108042803,
"grad_norm": 0.349609375,
"learning_rate": 1.6529160739687057e-05,
"loss": 0.0005,
"step": 145
},
{
"epoch": 0.20158784949948222,
"grad_norm": 0.054443359375,
"learning_rate": 1.6500711237553344e-05,
"loss": 0.0005,
"step": 146
},
{
"epoch": 0.20296858819468416,
"grad_norm": 0.494140625,
"learning_rate": 1.647226173541963e-05,
"loss": 0.0038,
"step": 147
},
{
"epoch": 0.20434932688988608,
"grad_norm": 0.09228515625,
"learning_rate": 1.6443812233285917e-05,
"loss": 0.0005,
"step": 148
},
{
"epoch": 0.20573006558508802,
"grad_norm": 0.0380859375,
"learning_rate": 1.6415362731152208e-05,
"loss": 0.0003,
"step": 149
},
{
"epoch": 0.20711080428028997,
"grad_norm": 0.044189453125,
"learning_rate": 1.6386913229018494e-05,
"loss": 0.0005,
"step": 150
},
{
"epoch": 0.20849154297549188,
"grad_norm": 0.50390625,
"learning_rate": 1.635846372688478e-05,
"loss": 0.0018,
"step": 151
},
{
"epoch": 0.20987228167069383,
"grad_norm": 0.2275390625,
"learning_rate": 1.6330014224751068e-05,
"loss": 0.0006,
"step": 152
},
{
"epoch": 0.21125302036589574,
"grad_norm": 0.08349609375,
"learning_rate": 1.6301564722617355e-05,
"loss": 0.0005,
"step": 153
},
{
"epoch": 0.2126337590610977,
"grad_norm": 0.1689453125,
"learning_rate": 1.627311522048364e-05,
"loss": 0.0007,
"step": 154
},
{
"epoch": 0.21401449775629963,
"grad_norm": 0.89453125,
"learning_rate": 1.624466571834993e-05,
"loss": 0.0041,
"step": 155
},
{
"epoch": 0.21539523645150155,
"grad_norm": 1.0625,
"learning_rate": 1.6216216216216218e-05,
"loss": 0.0024,
"step": 156
},
{
"epoch": 0.2167759751467035,
"grad_norm": 2.28125,
"learning_rate": 1.6187766714082505e-05,
"loss": 0.0014,
"step": 157
},
{
"epoch": 0.2181567138419054,
"grad_norm": 0.09716796875,
"learning_rate": 1.615931721194879e-05,
"loss": 0.0005,
"step": 158
},
{
"epoch": 0.21953745253710735,
"grad_norm": 0.2431640625,
"learning_rate": 1.6130867709815078e-05,
"loss": 0.001,
"step": 159
},
{
"epoch": 0.2209181912323093,
"grad_norm": 0.047119140625,
"learning_rate": 1.610241820768137e-05,
"loss": 0.0004,
"step": 160
},
{
"epoch": 0.2222989299275112,
"grad_norm": 0.0303955078125,
"learning_rate": 1.6073968705547652e-05,
"loss": 0.0003,
"step": 161
},
{
"epoch": 0.22367966862271316,
"grad_norm": 0.07470703125,
"learning_rate": 1.6045519203413942e-05,
"loss": 0.0004,
"step": 162
},
{
"epoch": 0.22506040731791507,
"grad_norm": 0.042724609375,
"learning_rate": 1.601706970128023e-05,
"loss": 0.0003,
"step": 163
},
{
"epoch": 0.22644114601311702,
"grad_norm": 0.02880859375,
"learning_rate": 1.5988620199146515e-05,
"loss": 0.0003,
"step": 164
},
{
"epoch": 0.22782188470831896,
"grad_norm": 0.029541015625,
"learning_rate": 1.5960170697012805e-05,
"loss": 0.0003,
"step": 165
},
{
"epoch": 0.22920262340352088,
"grad_norm": 0.028076171875,
"learning_rate": 1.5931721194879092e-05,
"loss": 0.0003,
"step": 166
},
{
"epoch": 0.23058336209872282,
"grad_norm": 0.0291748046875,
"learning_rate": 1.590327169274538e-05,
"loss": 0.0003,
"step": 167
},
{
"epoch": 0.23196410079392474,
"grad_norm": 0.0284423828125,
"learning_rate": 1.5874822190611666e-05,
"loss": 0.0003,
"step": 168
},
{
"epoch": 0.23334483948912668,
"grad_norm": 0.0279541015625,
"learning_rate": 1.5846372688477952e-05,
"loss": 0.0003,
"step": 169
},
{
"epoch": 0.23472557818432863,
"grad_norm": 0.0277099609375,
"learning_rate": 1.581792318634424e-05,
"loss": 0.0003,
"step": 170
},
{
"epoch": 0.23610631687953054,
"grad_norm": 0.349609375,
"learning_rate": 1.578947368421053e-05,
"loss": 0.0036,
"step": 171
},
{
"epoch": 0.2374870555747325,
"grad_norm": 0.0274658203125,
"learning_rate": 1.5761024182076813e-05,
"loss": 0.0003,
"step": 172
},
{
"epoch": 0.2388677942699344,
"grad_norm": 1.0625,
"learning_rate": 1.5732574679943103e-05,
"loss": 0.0027,
"step": 173
},
{
"epoch": 0.24024853296513635,
"grad_norm": 0.0274658203125,
"learning_rate": 1.570412517780939e-05,
"loss": 0.0003,
"step": 174
},
{
"epoch": 0.2416292716603383,
"grad_norm": 0.0277099609375,
"learning_rate": 1.5675675675675676e-05,
"loss": 0.0003,
"step": 175
},
{
"epoch": 0.2430100103555402,
"grad_norm": 0.0262451171875,
"learning_rate": 1.5647226173541966e-05,
"loss": 0.0003,
"step": 176
},
{
"epoch": 0.24439074905074215,
"grad_norm": 0.0264892578125,
"learning_rate": 1.561877667140825e-05,
"loss": 0.0003,
"step": 177
},
{
"epoch": 0.24577148774594407,
"grad_norm": 0.0274658203125,
"learning_rate": 1.559032716927454e-05,
"loss": 0.0003,
"step": 178
},
{
"epoch": 0.247152226441146,
"grad_norm": 0.345703125,
"learning_rate": 1.5561877667140826e-05,
"loss": 0.001,
"step": 179
},
{
"epoch": 0.24853296513634796,
"grad_norm": 0.034423828125,
"learning_rate": 1.5533428165007113e-05,
"loss": 0.0003,
"step": 180
},
{
"epoch": 0.24991370383154987,
"grad_norm": 0.038330078125,
"learning_rate": 1.55049786628734e-05,
"loss": 0.0003,
"step": 181
},
{
"epoch": 0.2512944425267518,
"grad_norm": 0.0262451171875,
"learning_rate": 1.547652916073969e-05,
"loss": 0.0003,
"step": 182
},
{
"epoch": 0.25267518122195376,
"grad_norm": 1.484375,
"learning_rate": 1.5448079658605977e-05,
"loss": 0.0017,
"step": 183
},
{
"epoch": 0.2540559199171557,
"grad_norm": 0.025634765625,
"learning_rate": 1.5419630156472263e-05,
"loss": 0.0002,
"step": 184
},
{
"epoch": 0.2554366586123576,
"grad_norm": 0.02685546875,
"learning_rate": 1.539118065433855e-05,
"loss": 0.0003,
"step": 185
},
{
"epoch": 0.25681739730755954,
"grad_norm": 3.921875,
"learning_rate": 1.5362731152204837e-05,
"loss": 0.0042,
"step": 186
},
{
"epoch": 0.2581981360027615,
"grad_norm": 0.81640625,
"learning_rate": 1.5334281650071127e-05,
"loss": 0.0012,
"step": 187
},
{
"epoch": 0.2595788746979634,
"grad_norm": 0.0390625,
"learning_rate": 1.530583214793741e-05,
"loss": 0.0004,
"step": 188
},
{
"epoch": 0.26095961339316537,
"grad_norm": 0.0262451171875,
"learning_rate": 1.52773826458037e-05,
"loss": 0.0003,
"step": 189
},
{
"epoch": 0.26234035208836726,
"grad_norm": 0.0250244140625,
"learning_rate": 1.5248933143669986e-05,
"loss": 0.0002,
"step": 190
},
{
"epoch": 0.2637210907835692,
"grad_norm": 0.1689453125,
"learning_rate": 1.5220483641536274e-05,
"loss": 0.0004,
"step": 191
},
{
"epoch": 0.26510182947877114,
"grad_norm": 0.0294189453125,
"learning_rate": 1.5192034139402562e-05,
"loss": 0.0003,
"step": 192
},
{
"epoch": 0.2664825681739731,
"grad_norm": 0.419921875,
"learning_rate": 1.5163584637268849e-05,
"loss": 0.0013,
"step": 193
},
{
"epoch": 0.26786330686917503,
"grad_norm": 0.0260009765625,
"learning_rate": 1.5135135135135138e-05,
"loss": 0.0002,
"step": 194
},
{
"epoch": 0.2692440455643769,
"grad_norm": 0.02685546875,
"learning_rate": 1.5106685633001423e-05,
"loss": 0.0003,
"step": 195
},
{
"epoch": 0.27062478425957887,
"grad_norm": 0.025634765625,
"learning_rate": 1.5078236130867711e-05,
"loss": 0.0002,
"step": 196
},
{
"epoch": 0.2720055229547808,
"grad_norm": 0.0244140625,
"learning_rate": 1.5049786628733998e-05,
"loss": 0.0002,
"step": 197
},
{
"epoch": 0.27338626164998275,
"grad_norm": 0.029296875,
"learning_rate": 1.5021337126600286e-05,
"loss": 0.0003,
"step": 198
},
{
"epoch": 0.2747670003451847,
"grad_norm": 0.02880859375,
"learning_rate": 1.4992887624466573e-05,
"loss": 0.0003,
"step": 199
},
{
"epoch": 0.2761477390403866,
"grad_norm": 0.024658203125,
"learning_rate": 1.4964438122332861e-05,
"loss": 0.0002,
"step": 200
},
{
"epoch": 0.27752847773558853,
"grad_norm": 0.0732421875,
"learning_rate": 1.4935988620199146e-05,
"loss": 0.0004,
"step": 201
},
{
"epoch": 0.2789092164307905,
"grad_norm": 0.03466796875,
"learning_rate": 1.4907539118065435e-05,
"loss": 0.0003,
"step": 202
},
{
"epoch": 0.2802899551259924,
"grad_norm": 0.0235595703125,
"learning_rate": 1.4879089615931723e-05,
"loss": 0.0002,
"step": 203
},
{
"epoch": 0.28167069382119436,
"grad_norm": 0.0257568359375,
"learning_rate": 1.485064011379801e-05,
"loss": 0.0002,
"step": 204
},
{
"epoch": 0.28305143251639625,
"grad_norm": 0.0263671875,
"learning_rate": 1.4822190611664298e-05,
"loss": 0.0002,
"step": 205
},
{
"epoch": 0.2844321712115982,
"grad_norm": 0.045166015625,
"learning_rate": 1.4793741109530583e-05,
"loss": 0.0004,
"step": 206
},
{
"epoch": 0.28581290990680014,
"grad_norm": 0.0252685546875,
"learning_rate": 1.4765291607396872e-05,
"loss": 0.0002,
"step": 207
},
{
"epoch": 0.2871936486020021,
"grad_norm": 0.703125,
"learning_rate": 1.4736842105263159e-05,
"loss": 0.0015,
"step": 208
},
{
"epoch": 0.288574387297204,
"grad_norm": 0.0233154296875,
"learning_rate": 1.4708392603129447e-05,
"loss": 0.0002,
"step": 209
},
{
"epoch": 0.2899551259924059,
"grad_norm": 0.0255126953125,
"learning_rate": 1.4679943100995732e-05,
"loss": 0.0003,
"step": 210
},
{
"epoch": 0.29133586468760786,
"grad_norm": 0.0233154296875,
"learning_rate": 1.465149359886202e-05,
"loss": 0.0002,
"step": 211
},
{
"epoch": 0.2927166033828098,
"grad_norm": 0.023193359375,
"learning_rate": 1.4623044096728309e-05,
"loss": 0.0002,
"step": 212
},
{
"epoch": 0.29409734207801175,
"grad_norm": 0.51953125,
"learning_rate": 1.4594594594594596e-05,
"loss": 0.0004,
"step": 213
},
{
"epoch": 0.2954780807732137,
"grad_norm": 0.52734375,
"learning_rate": 1.4566145092460884e-05,
"loss": 0.0003,
"step": 214
},
{
"epoch": 0.2968588194684156,
"grad_norm": 0.02392578125,
"learning_rate": 1.453769559032717e-05,
"loss": 0.0002,
"step": 215
},
{
"epoch": 0.2982395581636175,
"grad_norm": 0.0234375,
"learning_rate": 1.4509246088193457e-05,
"loss": 0.0002,
"step": 216
},
{
"epoch": 0.2982395581636175,
"eval_loss": 0.000599406601395458,
"eval_runtime": 586.817,
"eval_samples_per_second": 2.195,
"eval_steps_per_second": 2.195,
"step": 216
},
{
"epoch": 0.29962029685881947,
"grad_norm": 0.024658203125,
"learning_rate": 1.4480796586059744e-05,
"loss": 0.0002,
"step": 217
},
{
"epoch": 0.3010010355540214,
"grad_norm": 0.16796875,
"learning_rate": 1.4452347083926033e-05,
"loss": 0.0028,
"step": 218
},
{
"epoch": 0.30238177424922336,
"grad_norm": 0.0230712890625,
"learning_rate": 1.442389758179232e-05,
"loss": 0.0002,
"step": 219
},
{
"epoch": 0.30376251294442524,
"grad_norm": 0.024658203125,
"learning_rate": 1.4395448079658608e-05,
"loss": 0.0002,
"step": 220
},
{
"epoch": 0.3051432516396272,
"grad_norm": 0.0238037109375,
"learning_rate": 1.4366998577524896e-05,
"loss": 0.0002,
"step": 221
},
{
"epoch": 0.30652399033482913,
"grad_norm": 0.4765625,
"learning_rate": 1.4338549075391181e-05,
"loss": 0.0009,
"step": 222
},
{
"epoch": 0.3079047290300311,
"grad_norm": 0.023193359375,
"learning_rate": 1.431009957325747e-05,
"loss": 0.0002,
"step": 223
},
{
"epoch": 0.309285467725233,
"grad_norm": 0.023193359375,
"learning_rate": 1.4281650071123756e-05,
"loss": 0.0002,
"step": 224
},
{
"epoch": 0.3106662064204349,
"grad_norm": 0.0223388671875,
"learning_rate": 1.4253200568990045e-05,
"loss": 0.0002,
"step": 225
},
{
"epoch": 0.31204694511563685,
"grad_norm": 0.022216796875,
"learning_rate": 1.422475106685633e-05,
"loss": 0.0002,
"step": 226
},
{
"epoch": 0.3134276838108388,
"grad_norm": 0.02294921875,
"learning_rate": 1.4196301564722618e-05,
"loss": 0.0002,
"step": 227
},
{
"epoch": 0.31480842250604074,
"grad_norm": 0.0233154296875,
"learning_rate": 1.4167852062588905e-05,
"loss": 0.0002,
"step": 228
},
{
"epoch": 0.3161891612012427,
"grad_norm": 0.0238037109375,
"learning_rate": 1.4139402560455193e-05,
"loss": 0.0002,
"step": 229
},
{
"epoch": 0.3175698998964446,
"grad_norm": 1.25,
"learning_rate": 1.4110953058321482e-05,
"loss": 0.012,
"step": 230
},
{
"epoch": 0.3189506385916465,
"grad_norm": 0.0224609375,
"learning_rate": 1.4082503556187767e-05,
"loss": 0.0002,
"step": 231
},
{
"epoch": 0.32033137728684846,
"grad_norm": 0.0218505859375,
"learning_rate": 1.4054054054054055e-05,
"loss": 0.0002,
"step": 232
},
{
"epoch": 0.3217121159820504,
"grad_norm": 0.06005859375,
"learning_rate": 1.4025604551920342e-05,
"loss": 0.0004,
"step": 233
},
{
"epoch": 0.32309285467725235,
"grad_norm": 0.042236328125,
"learning_rate": 1.399715504978663e-05,
"loss": 0.0004,
"step": 234
},
{
"epoch": 0.32447359337245424,
"grad_norm": 0.031494140625,
"learning_rate": 1.3968705547652917e-05,
"loss": 0.0003,
"step": 235
},
{
"epoch": 0.3258543320676562,
"grad_norm": 0.0264892578125,
"learning_rate": 1.3940256045519206e-05,
"loss": 0.0003,
"step": 236
},
{
"epoch": 0.3272350707628581,
"grad_norm": 0.1318359375,
"learning_rate": 1.391180654338549e-05,
"loss": 0.0007,
"step": 237
},
{
"epoch": 0.32861580945806007,
"grad_norm": 0.034423828125,
"learning_rate": 1.3883357041251779e-05,
"loss": 0.0003,
"step": 238
},
{
"epoch": 0.329996548153262,
"grad_norm": 0.036865234375,
"learning_rate": 1.3854907539118068e-05,
"loss": 0.0003,
"step": 239
},
{
"epoch": 0.3313772868484639,
"grad_norm": 0.039794921875,
"learning_rate": 1.3826458036984354e-05,
"loss": 0.0003,
"step": 240
},
{
"epoch": 0.33275802554366585,
"grad_norm": 0.0390625,
"learning_rate": 1.3798008534850643e-05,
"loss": 0.0003,
"step": 241
},
{
"epoch": 0.3341387642388678,
"grad_norm": 0.03759765625,
"learning_rate": 1.3769559032716928e-05,
"loss": 0.0003,
"step": 242
},
{
"epoch": 0.33551950293406974,
"grad_norm": 0.045654296875,
"learning_rate": 1.3741109530583216e-05,
"loss": 0.0004,
"step": 243
},
{
"epoch": 0.3369002416292717,
"grad_norm": 0.3671875,
"learning_rate": 1.3712660028449503e-05,
"loss": 0.0004,
"step": 244
},
{
"epoch": 0.33828098032447357,
"grad_norm": 0.0260009765625,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.0003,
"step": 245
},
{
"epoch": 0.3396617190196755,
"grad_norm": 0.051513671875,
"learning_rate": 1.3655761024182076e-05,
"loss": 0.0004,
"step": 246
},
{
"epoch": 0.34104245771487746,
"grad_norm": 0.023193359375,
"learning_rate": 1.3627311522048365e-05,
"loss": 0.0002,
"step": 247
},
{
"epoch": 0.3424231964100794,
"grad_norm": 0.062255859375,
"learning_rate": 1.3598862019914653e-05,
"loss": 0.0003,
"step": 248
},
{
"epoch": 0.34380393510528134,
"grad_norm": 0.0250244140625,
"learning_rate": 1.357041251778094e-05,
"loss": 0.0003,
"step": 249
},
{
"epoch": 0.34518467380048323,
"grad_norm": 0.0235595703125,
"learning_rate": 1.3541963015647228e-05,
"loss": 0.0002,
"step": 250
},
{
"epoch": 0.3465654124956852,
"grad_norm": 0.021728515625,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.0002,
"step": 251
},
{
"epoch": 0.3479461511908871,
"grad_norm": 1.6015625,
"learning_rate": 1.3485064011379802e-05,
"loss": 0.0012,
"step": 252
},
{
"epoch": 0.34932688988608906,
"grad_norm": 0.0224609375,
"learning_rate": 1.3456614509246089e-05,
"loss": 0.0002,
"step": 253
},
{
"epoch": 0.350707628581291,
"grad_norm": 0.034912109375,
"learning_rate": 1.3428165007112377e-05,
"loss": 0.0002,
"step": 254
},
{
"epoch": 0.3520883672764929,
"grad_norm": 0.034423828125,
"learning_rate": 1.3399715504978664e-05,
"loss": 0.0003,
"step": 255
},
{
"epoch": 0.35346910597169484,
"grad_norm": 0.0206298828125,
"learning_rate": 1.3371266002844952e-05,
"loss": 0.0002,
"step": 256
},
{
"epoch": 0.3548498446668968,
"grad_norm": 0.0247802734375,
"learning_rate": 1.3342816500711237e-05,
"loss": 0.0002,
"step": 257
},
{
"epoch": 0.35623058336209873,
"grad_norm": 0.023193359375,
"learning_rate": 1.3314366998577526e-05,
"loss": 0.0002,
"step": 258
},
{
"epoch": 0.3576113220573007,
"grad_norm": 0.0208740234375,
"learning_rate": 1.3285917496443814e-05,
"loss": 0.0002,
"step": 259
},
{
"epoch": 0.35899206075250256,
"grad_norm": 0.02099609375,
"learning_rate": 1.32574679943101e-05,
"loss": 0.0002,
"step": 260
},
{
"epoch": 0.3603727994477045,
"grad_norm": 0.70703125,
"learning_rate": 1.322901849217639e-05,
"loss": 0.0005,
"step": 261
},
{
"epoch": 0.36175353814290645,
"grad_norm": 0.02001953125,
"learning_rate": 1.3200568990042674e-05,
"loss": 0.0002,
"step": 262
},
{
"epoch": 0.3631342768381084,
"grad_norm": 0.021728515625,
"learning_rate": 1.3172119487908963e-05,
"loss": 0.0002,
"step": 263
},
{
"epoch": 0.36451501553331034,
"grad_norm": 0.0203857421875,
"learning_rate": 1.314366998577525e-05,
"loss": 0.0002,
"step": 264
},
{
"epoch": 0.3658957542285122,
"grad_norm": 0.03564453125,
"learning_rate": 1.3115220483641538e-05,
"loss": 0.0003,
"step": 265
},
{
"epoch": 0.36727649292371417,
"grad_norm": 0.020263671875,
"learning_rate": 1.3086770981507825e-05,
"loss": 0.0002,
"step": 266
},
{
"epoch": 0.3686572316189161,
"grad_norm": 0.0201416015625,
"learning_rate": 1.3058321479374111e-05,
"loss": 0.0002,
"step": 267
},
{
"epoch": 0.37003797031411806,
"grad_norm": 0.1396484375,
"learning_rate": 1.30298719772404e-05,
"loss": 0.0002,
"step": 268
},
{
"epoch": 0.37141870900932,
"grad_norm": 0.388671875,
"learning_rate": 1.3001422475106686e-05,
"loss": 0.0006,
"step": 269
},
{
"epoch": 0.37279944770452195,
"grad_norm": 0.02001953125,
"learning_rate": 1.2972972972972975e-05,
"loss": 0.0002,
"step": 270
},
{
"epoch": 0.37418018639972384,
"grad_norm": 0.0264892578125,
"learning_rate": 1.2944523470839262e-05,
"loss": 0.0002,
"step": 271
},
{
"epoch": 0.3755609250949258,
"grad_norm": 0.019775390625,
"learning_rate": 1.291607396870555e-05,
"loss": 0.0002,
"step": 272
},
{
"epoch": 0.3769416637901277,
"grad_norm": 0.019775390625,
"learning_rate": 1.2887624466571835e-05,
"loss": 0.0002,
"step": 273
},
{
"epoch": 0.37832240248532967,
"grad_norm": 0.019287109375,
"learning_rate": 1.2859174964438123e-05,
"loss": 0.0002,
"step": 274
},
{
"epoch": 0.3797031411805316,
"grad_norm": 0.02978515625,
"learning_rate": 1.283072546230441e-05,
"loss": 0.0002,
"step": 275
},
{
"epoch": 0.3810838798757335,
"grad_norm": 0.0189208984375,
"learning_rate": 1.2802275960170699e-05,
"loss": 0.0002,
"step": 276
},
{
"epoch": 0.38246461857093544,
"grad_norm": 0.333984375,
"learning_rate": 1.2773826458036987e-05,
"loss": 0.0008,
"step": 277
},
{
"epoch": 0.3838453572661374,
"grad_norm": 0.018798828125,
"learning_rate": 1.2745376955903272e-05,
"loss": 0.0002,
"step": 278
},
{
"epoch": 0.38522609596133933,
"grad_norm": 0.019775390625,
"learning_rate": 1.271692745376956e-05,
"loss": 0.0002,
"step": 279
},
{
"epoch": 0.3866068346565413,
"grad_norm": 0.267578125,
"learning_rate": 1.2688477951635847e-05,
"loss": 0.0031,
"step": 280
},
{
"epoch": 0.38798757335174316,
"grad_norm": 0.0224609375,
"learning_rate": 1.2660028449502136e-05,
"loss": 0.0002,
"step": 281
},
{
"epoch": 0.3893683120469451,
"grad_norm": 0.0194091796875,
"learning_rate": 1.263157894736842e-05,
"loss": 0.0002,
"step": 282
},
{
"epoch": 0.39074905074214705,
"grad_norm": 0.041259765625,
"learning_rate": 1.2603129445234709e-05,
"loss": 0.0002,
"step": 283
},
{
"epoch": 0.392129789437349,
"grad_norm": 0.0245361328125,
"learning_rate": 1.2574679943100996e-05,
"loss": 0.0003,
"step": 284
},
{
"epoch": 0.39351052813255094,
"grad_norm": 0.0186767578125,
"learning_rate": 1.2546230440967284e-05,
"loss": 0.0002,
"step": 285
},
{
"epoch": 0.39489126682775283,
"grad_norm": 0.0194091796875,
"learning_rate": 1.2517780938833573e-05,
"loss": 0.0002,
"step": 286
},
{
"epoch": 0.3962720055229548,
"grad_norm": 0.018798828125,
"learning_rate": 1.248933143669986e-05,
"loss": 0.0002,
"step": 287
},
{
"epoch": 0.3976527442181567,
"grad_norm": 0.02001953125,
"learning_rate": 1.2460881934566146e-05,
"loss": 0.0002,
"step": 288
},
{
"epoch": 0.3976527442181567,
"eval_loss": 0.00039300136268138885,
"eval_runtime": 581.6065,
"eval_samples_per_second": 2.215,
"eval_steps_per_second": 2.215,
"step": 288
},
{
"epoch": 0.39903348291335866,
"grad_norm": 0.2041015625,
"learning_rate": 1.2432432432432433e-05,
"loss": 0.0005,
"step": 289
},
{
"epoch": 0.4004142216085606,
"grad_norm": 0.0208740234375,
"learning_rate": 1.2403982930298721e-05,
"loss": 0.0002,
"step": 290
},
{
"epoch": 0.4017949603037625,
"grad_norm": 0.09619140625,
"learning_rate": 1.2375533428165008e-05,
"loss": 0.0002,
"step": 291
},
{
"epoch": 0.40317569899896444,
"grad_norm": 0.15625,
"learning_rate": 1.2347083926031296e-05,
"loss": 0.0021,
"step": 292
},
{
"epoch": 0.4045564376941664,
"grad_norm": 0.0196533203125,
"learning_rate": 1.2318634423897581e-05,
"loss": 0.0002,
"step": 293
},
{
"epoch": 0.4059371763893683,
"grad_norm": 0.0189208984375,
"learning_rate": 1.229018492176387e-05,
"loss": 0.0002,
"step": 294
},
{
"epoch": 0.40731791508457027,
"grad_norm": 0.018310546875,
"learning_rate": 1.2261735419630158e-05,
"loss": 0.0002,
"step": 295
},
{
"epoch": 0.40869865377977216,
"grad_norm": 0.263671875,
"learning_rate": 1.2233285917496445e-05,
"loss": 0.0033,
"step": 296
},
{
"epoch": 0.4100793924749741,
"grad_norm": 0.068359375,
"learning_rate": 1.2204836415362733e-05,
"loss": 0.0003,
"step": 297
},
{
"epoch": 0.41146013117017605,
"grad_norm": 0.0196533203125,
"learning_rate": 1.2176386913229019e-05,
"loss": 0.0002,
"step": 298
},
{
"epoch": 0.412840869865378,
"grad_norm": 0.019287109375,
"learning_rate": 1.2147937411095307e-05,
"loss": 0.0002,
"step": 299
},
{
"epoch": 0.41422160856057993,
"grad_norm": 0.0223388671875,
"learning_rate": 1.2119487908961594e-05,
"loss": 0.0002,
"step": 300
},
{
"epoch": 0.4156023472557818,
"grad_norm": 0.0186767578125,
"learning_rate": 1.2091038406827882e-05,
"loss": 0.0002,
"step": 301
},
{
"epoch": 0.41698308595098377,
"grad_norm": 0.66796875,
"learning_rate": 1.2062588904694169e-05,
"loss": 0.0017,
"step": 302
},
{
"epoch": 0.4183638246461857,
"grad_norm": 0.018798828125,
"learning_rate": 1.2034139402560456e-05,
"loss": 0.0002,
"step": 303
},
{
"epoch": 0.41974456334138766,
"grad_norm": 0.0196533203125,
"learning_rate": 1.2005689900426742e-05,
"loss": 0.0002,
"step": 304
},
{
"epoch": 0.4211253020365896,
"grad_norm": 0.020751953125,
"learning_rate": 1.197724039829303e-05,
"loss": 0.0002,
"step": 305
},
{
"epoch": 0.4225060407317915,
"grad_norm": 0.03662109375,
"learning_rate": 1.1948790896159319e-05,
"loss": 0.0003,
"step": 306
},
{
"epoch": 0.42388677942699343,
"grad_norm": 0.0196533203125,
"learning_rate": 1.1920341394025606e-05,
"loss": 0.0002,
"step": 307
},
{
"epoch": 0.4252675181221954,
"grad_norm": 0.036376953125,
"learning_rate": 1.1891891891891894e-05,
"loss": 0.0002,
"step": 308
},
{
"epoch": 0.4266482568173973,
"grad_norm": 0.197265625,
"learning_rate": 1.186344238975818e-05,
"loss": 0.0002,
"step": 309
},
{
"epoch": 0.42802899551259926,
"grad_norm": 0.02197265625,
"learning_rate": 1.1834992887624468e-05,
"loss": 0.0002,
"step": 310
},
{
"epoch": 0.42940973420780115,
"grad_norm": 0.0198974609375,
"learning_rate": 1.1806543385490754e-05,
"loss": 0.0002,
"step": 311
},
{
"epoch": 0.4307904729030031,
"grad_norm": 0.02001953125,
"learning_rate": 1.1778093883357043e-05,
"loss": 0.0002,
"step": 312
},
{
"epoch": 0.43217121159820504,
"grad_norm": 0.02197265625,
"learning_rate": 1.1749644381223328e-05,
"loss": 0.0002,
"step": 313
},
{
"epoch": 0.433551950293407,
"grad_norm": 0.0208740234375,
"learning_rate": 1.1721194879089616e-05,
"loss": 0.0002,
"step": 314
},
{
"epoch": 0.43493268898860893,
"grad_norm": 0.05322265625,
"learning_rate": 1.1692745376955905e-05,
"loss": 0.0005,
"step": 315
},
{
"epoch": 0.4363134276838108,
"grad_norm": 0.0179443359375,
"learning_rate": 1.1664295874822192e-05,
"loss": 0.0002,
"step": 316
},
{
"epoch": 0.43769416637901276,
"grad_norm": 0.01904296875,
"learning_rate": 1.163584637268848e-05,
"loss": 0.0002,
"step": 317
},
{
"epoch": 0.4390749050742147,
"grad_norm": 0.0184326171875,
"learning_rate": 1.1607396870554765e-05,
"loss": 0.0002,
"step": 318
},
{
"epoch": 0.44045564376941665,
"grad_norm": 0.03125,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.0003,
"step": 319
},
{
"epoch": 0.4418363824646186,
"grad_norm": 0.031005859375,
"learning_rate": 1.155049786628734e-05,
"loss": 0.0002,
"step": 320
},
{
"epoch": 0.4432171211598205,
"grad_norm": 0.0712890625,
"learning_rate": 1.1522048364153629e-05,
"loss": 0.0003,
"step": 321
},
{
"epoch": 0.4445978598550224,
"grad_norm": 0.024169921875,
"learning_rate": 1.1493598862019915e-05,
"loss": 0.0002,
"step": 322
},
{
"epoch": 0.44597859855022437,
"grad_norm": 0.019287109375,
"learning_rate": 1.1465149359886204e-05,
"loss": 0.0002,
"step": 323
},
{
"epoch": 0.4473593372454263,
"grad_norm": 0.040283203125,
"learning_rate": 1.143669985775249e-05,
"loss": 0.0003,
"step": 324
},
{
"epoch": 0.44874007594062826,
"grad_norm": 0.0179443359375,
"learning_rate": 1.1408250355618777e-05,
"loss": 0.0002,
"step": 325
},
{
"epoch": 0.45012081463583015,
"grad_norm": 0.016845703125,
"learning_rate": 1.1379800853485066e-05,
"loss": 0.0002,
"step": 326
},
{
"epoch": 0.4515015533310321,
"grad_norm": 0.0179443359375,
"learning_rate": 1.1351351351351352e-05,
"loss": 0.0002,
"step": 327
},
{
"epoch": 0.45288229202623403,
"grad_norm": 0.30078125,
"learning_rate": 1.132290184921764e-05,
"loss": 0.0007,
"step": 328
},
{
"epoch": 0.454263030721436,
"grad_norm": 0.0206298828125,
"learning_rate": 1.1294452347083926e-05,
"loss": 0.0002,
"step": 329
},
{
"epoch": 0.4556437694166379,
"grad_norm": 0.07421875,
"learning_rate": 1.1266002844950214e-05,
"loss": 0.0003,
"step": 330
},
{
"epoch": 0.4570245081118398,
"grad_norm": 0.0177001953125,
"learning_rate": 1.1237553342816501e-05,
"loss": 0.0002,
"step": 331
},
{
"epoch": 0.45840524680704176,
"grad_norm": 0.016845703125,
"learning_rate": 1.120910384068279e-05,
"loss": 0.0002,
"step": 332
},
{
"epoch": 0.4597859855022437,
"grad_norm": 0.032958984375,
"learning_rate": 1.1180654338549078e-05,
"loss": 0.0002,
"step": 333
},
{
"epoch": 0.46116672419744564,
"grad_norm": 0.2109375,
"learning_rate": 1.1152204836415363e-05,
"loss": 0.0002,
"step": 334
},
{
"epoch": 0.4625474628926476,
"grad_norm": 0.0181884765625,
"learning_rate": 1.1123755334281651e-05,
"loss": 0.0002,
"step": 335
},
{
"epoch": 0.4639282015878495,
"grad_norm": 0.01708984375,
"learning_rate": 1.1095305832147938e-05,
"loss": 0.0002,
"step": 336
},
{
"epoch": 0.4653089402830514,
"grad_norm": 0.0167236328125,
"learning_rate": 1.1066856330014226e-05,
"loss": 0.0002,
"step": 337
},
{
"epoch": 0.46668967897825336,
"grad_norm": 0.0255126953125,
"learning_rate": 1.1038406827880513e-05,
"loss": 0.0002,
"step": 338
},
{
"epoch": 0.4680704176734553,
"grad_norm": 0.019287109375,
"learning_rate": 1.10099573257468e-05,
"loss": 0.0002,
"step": 339
},
{
"epoch": 0.46945115636865725,
"grad_norm": 0.01708984375,
"learning_rate": 1.0981507823613087e-05,
"loss": 0.0002,
"step": 340
},
{
"epoch": 0.47083189506385914,
"grad_norm": 0.0172119140625,
"learning_rate": 1.0953058321479375e-05,
"loss": 0.0002,
"step": 341
},
{
"epoch": 0.4722126337590611,
"grad_norm": 0.04541015625,
"learning_rate": 1.0924608819345663e-05,
"loss": 0.0004,
"step": 342
},
{
"epoch": 0.47359337245426303,
"grad_norm": 0.038818359375,
"learning_rate": 1.089615931721195e-05,
"loss": 0.0003,
"step": 343
},
{
"epoch": 0.474974111149465,
"grad_norm": 0.059326171875,
"learning_rate": 1.0867709815078239e-05,
"loss": 0.0003,
"step": 344
},
{
"epoch": 0.4763548498446669,
"grad_norm": 0.017333984375,
"learning_rate": 1.0839260312944524e-05,
"loss": 0.0002,
"step": 345
},
{
"epoch": 0.4777355885398688,
"grad_norm": 0.02001953125,
"learning_rate": 1.0810810810810812e-05,
"loss": 0.0002,
"step": 346
},
{
"epoch": 0.47911632723507075,
"grad_norm": 0.0419921875,
"learning_rate": 1.0782361308677099e-05,
"loss": 0.0002,
"step": 347
},
{
"epoch": 0.4804970659302727,
"grad_norm": 0.021240234375,
"learning_rate": 1.0753911806543387e-05,
"loss": 0.0002,
"step": 348
},
{
"epoch": 0.48187780462547464,
"grad_norm": 0.044677734375,
"learning_rate": 1.0725462304409672e-05,
"loss": 0.0004,
"step": 349
},
{
"epoch": 0.4832585433206766,
"grad_norm": 0.0174560546875,
"learning_rate": 1.069701280227596e-05,
"loss": 0.0002,
"step": 350
},
{
"epoch": 0.48463928201587847,
"grad_norm": 0.01708984375,
"learning_rate": 1.0668563300142247e-05,
"loss": 0.0002,
"step": 351
},
{
"epoch": 0.4860200207110804,
"grad_norm": 0.016357421875,
"learning_rate": 1.0640113798008536e-05,
"loss": 0.0002,
"step": 352
},
{
"epoch": 0.48740075940628236,
"grad_norm": 0.02001953125,
"learning_rate": 1.0611664295874824e-05,
"loss": 0.0002,
"step": 353
},
{
"epoch": 0.4887814981014843,
"grad_norm": 0.0172119140625,
"learning_rate": 1.058321479374111e-05,
"loss": 0.0002,
"step": 354
},
{
"epoch": 0.49016223679668625,
"grad_norm": 0.0159912109375,
"learning_rate": 1.0554765291607398e-05,
"loss": 0.0002,
"step": 355
},
{
"epoch": 0.49154297549188813,
"grad_norm": 0.5,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.0021,
"step": 356
},
{
"epoch": 0.4929237141870901,
"grad_norm": 0.01611328125,
"learning_rate": 1.0497866287339973e-05,
"loss": 0.0002,
"step": 357
},
{
"epoch": 0.494304452882292,
"grad_norm": 0.016357421875,
"learning_rate": 1.046941678520626e-05,
"loss": 0.0002,
"step": 358
},
{
"epoch": 0.49568519157749397,
"grad_norm": 0.016845703125,
"learning_rate": 1.0440967283072548e-05,
"loss": 0.0002,
"step": 359
},
{
"epoch": 0.4970659302726959,
"grad_norm": 0.0194091796875,
"learning_rate": 1.0412517780938833e-05,
"loss": 0.0002,
"step": 360
},
{
"epoch": 0.4970659302726959,
"eval_loss": 0.00029756984440609813,
"eval_runtime": 582.415,
"eval_samples_per_second": 2.211,
"eval_steps_per_second": 2.211,
"step": 360
},
{
"epoch": 0.4984466689678978,
"grad_norm": 0.0169677734375,
"learning_rate": 1.0384068278805121e-05,
"loss": 0.0002,
"step": 361
},
{
"epoch": 0.49982740766309974,
"grad_norm": 0.020263671875,
"learning_rate": 1.035561877667141e-05,
"loss": 0.0002,
"step": 362
},
{
"epoch": 0.5012081463583017,
"grad_norm": 0.016357421875,
"learning_rate": 1.0327169274537697e-05,
"loss": 0.0002,
"step": 363
},
{
"epoch": 0.5025888850535036,
"grad_norm": 0.01953125,
"learning_rate": 1.0298719772403985e-05,
"loss": 0.0002,
"step": 364
},
{
"epoch": 0.5039696237487056,
"grad_norm": 0.0159912109375,
"learning_rate": 1.027027027027027e-05,
"loss": 0.0002,
"step": 365
},
{
"epoch": 0.5053503624439075,
"grad_norm": 0.0166015625,
"learning_rate": 1.0241820768136559e-05,
"loss": 0.0002,
"step": 366
},
{
"epoch": 0.5067311011391095,
"grad_norm": 0.04296875,
"learning_rate": 1.0213371266002845e-05,
"loss": 0.0002,
"step": 367
},
{
"epoch": 0.5081118398343114,
"grad_norm": 0.0673828125,
"learning_rate": 1.0184921763869134e-05,
"loss": 0.0003,
"step": 368
},
{
"epoch": 0.5094925785295132,
"grad_norm": 0.0164794921875,
"learning_rate": 1.0156472261735419e-05,
"loss": 0.0002,
"step": 369
},
{
"epoch": 0.5108733172247152,
"grad_norm": 0.0174560546875,
"learning_rate": 1.0128022759601707e-05,
"loss": 0.0002,
"step": 370
},
{
"epoch": 0.5122540559199171,
"grad_norm": 0.0224609375,
"learning_rate": 1.0099573257467996e-05,
"loss": 0.0002,
"step": 371
},
{
"epoch": 0.5136347946151191,
"grad_norm": 0.42578125,
"learning_rate": 1.0071123755334282e-05,
"loss": 0.0002,
"step": 372
},
{
"epoch": 0.515015533310321,
"grad_norm": 0.126953125,
"learning_rate": 1.004267425320057e-05,
"loss": 0.0014,
"step": 373
},
{
"epoch": 0.516396272005523,
"grad_norm": 0.015869140625,
"learning_rate": 1.0014224751066857e-05,
"loss": 0.0002,
"step": 374
},
{
"epoch": 0.5177770107007249,
"grad_norm": 0.1337890625,
"learning_rate": 9.985775248933144e-06,
"loss": 0.0014,
"step": 375
},
{
"epoch": 0.5191577493959268,
"grad_norm": 0.040771484375,
"learning_rate": 9.957325746799433e-06,
"loss": 0.0004,
"step": 376
},
{
"epoch": 0.5205384880911288,
"grad_norm": 0.302734375,
"learning_rate": 9.92887624466572e-06,
"loss": 0.0003,
"step": 377
},
{
"epoch": 0.5219192267863307,
"grad_norm": 0.01708984375,
"learning_rate": 9.900426742532006e-06,
"loss": 0.0002,
"step": 378
},
{
"epoch": 0.5232999654815326,
"grad_norm": 0.1884765625,
"learning_rate": 9.871977240398294e-06,
"loss": 0.0008,
"step": 379
},
{
"epoch": 0.5246807041767345,
"grad_norm": 0.016357421875,
"learning_rate": 9.843527738264581e-06,
"loss": 0.0002,
"step": 380
},
{
"epoch": 0.5260614428719365,
"grad_norm": 0.0162353515625,
"learning_rate": 9.815078236130868e-06,
"loss": 0.0002,
"step": 381
},
{
"epoch": 0.5274421815671384,
"grad_norm": 0.06591796875,
"learning_rate": 9.786628733997155e-06,
"loss": 0.0004,
"step": 382
},
{
"epoch": 0.5288229202623403,
"grad_norm": 0.015625,
"learning_rate": 9.758179231863443e-06,
"loss": 0.0001,
"step": 383
},
{
"epoch": 0.5302036589575423,
"grad_norm": 0.0230712890625,
"learning_rate": 9.729729729729732e-06,
"loss": 0.0002,
"step": 384
},
{
"epoch": 0.5315843976527442,
"grad_norm": 0.016357421875,
"learning_rate": 9.701280227596018e-06,
"loss": 0.0002,
"step": 385
},
{
"epoch": 0.5329651363479462,
"grad_norm": 0.01611328125,
"learning_rate": 9.672830725462305e-06,
"loss": 0.0002,
"step": 386
},
{
"epoch": 0.5343458750431481,
"grad_norm": 0.0223388671875,
"learning_rate": 9.644381223328593e-06,
"loss": 0.0002,
"step": 387
},
{
"epoch": 0.5357266137383501,
"grad_norm": 0.0299072265625,
"learning_rate": 9.61593172119488e-06,
"loss": 0.0002,
"step": 388
},
{
"epoch": 0.5371073524335519,
"grad_norm": 0.0174560546875,
"learning_rate": 9.587482219061167e-06,
"loss": 0.0002,
"step": 389
},
{
"epoch": 0.5384880911287538,
"grad_norm": 0.016845703125,
"learning_rate": 9.559032716927454e-06,
"loss": 0.0002,
"step": 390
},
{
"epoch": 0.5398688298239558,
"grad_norm": 0.63671875,
"learning_rate": 9.530583214793742e-06,
"loss": 0.002,
"step": 391
},
{
"epoch": 0.5412495685191577,
"grad_norm": 0.0225830078125,
"learning_rate": 9.502133712660029e-06,
"loss": 0.0002,
"step": 392
},
{
"epoch": 0.5426303072143597,
"grad_norm": 0.0296630859375,
"learning_rate": 9.473684210526315e-06,
"loss": 0.0002,
"step": 393
},
{
"epoch": 0.5440110459095616,
"grad_norm": 0.0771484375,
"learning_rate": 9.445234708392604e-06,
"loss": 0.0011,
"step": 394
},
{
"epoch": 0.5453917846047636,
"grad_norm": 0.0673828125,
"learning_rate": 9.416785206258892e-06,
"loss": 0.0005,
"step": 395
},
{
"epoch": 0.5467725232999655,
"grad_norm": 0.0213623046875,
"learning_rate": 9.388335704125179e-06,
"loss": 0.0002,
"step": 396
},
{
"epoch": 0.5481532619951675,
"grad_norm": 0.039306640625,
"learning_rate": 9.359886201991466e-06,
"loss": 0.0002,
"step": 397
},
{
"epoch": 0.5495340006903694,
"grad_norm": 0.0162353515625,
"learning_rate": 9.331436699857753e-06,
"loss": 0.0002,
"step": 398
},
{
"epoch": 0.5509147393855712,
"grad_norm": 0.01519775390625,
"learning_rate": 9.302987197724041e-06,
"loss": 0.0001,
"step": 399
},
{
"epoch": 0.5522954780807732,
"grad_norm": 0.0224609375,
"learning_rate": 9.274537695590328e-06,
"loss": 0.0002,
"step": 400
},
{
"epoch": 0.5536762167759751,
"grad_norm": 0.033935546875,
"learning_rate": 9.246088193456614e-06,
"loss": 0.0002,
"step": 401
},
{
"epoch": 0.5550569554711771,
"grad_norm": 0.019287109375,
"learning_rate": 9.217638691322903e-06,
"loss": 0.0002,
"step": 402
},
{
"epoch": 0.556437694166379,
"grad_norm": 0.11962890625,
"learning_rate": 9.189189189189191e-06,
"loss": 0.0007,
"step": 403
},
{
"epoch": 0.557818432861581,
"grad_norm": 0.0179443359375,
"learning_rate": 9.160739687055478e-06,
"loss": 0.0001,
"step": 404
},
{
"epoch": 0.5591991715567829,
"grad_norm": 0.0400390625,
"learning_rate": 9.132290184921765e-06,
"loss": 0.0002,
"step": 405
},
{
"epoch": 0.5605799102519848,
"grad_norm": 0.61328125,
"learning_rate": 9.103840682788051e-06,
"loss": 0.0011,
"step": 406
},
{
"epoch": 0.5619606489471868,
"grad_norm": 0.0174560546875,
"learning_rate": 9.07539118065434e-06,
"loss": 0.0002,
"step": 407
},
{
"epoch": 0.5633413876423887,
"grad_norm": 0.061767578125,
"learning_rate": 9.046941678520627e-06,
"loss": 0.0003,
"step": 408
},
{
"epoch": 0.5647221263375906,
"grad_norm": 0.01708984375,
"learning_rate": 9.018492176386913e-06,
"loss": 0.0002,
"step": 409
},
{
"epoch": 0.5661028650327925,
"grad_norm": 0.1650390625,
"learning_rate": 8.990042674253202e-06,
"loss": 0.0011,
"step": 410
},
{
"epoch": 0.5674836037279944,
"grad_norm": 0.01519775390625,
"learning_rate": 8.961593172119488e-06,
"loss": 0.0001,
"step": 411
},
{
"epoch": 0.5688643424231964,
"grad_norm": 0.0194091796875,
"learning_rate": 8.933143669985777e-06,
"loss": 0.0002,
"step": 412
},
{
"epoch": 0.5702450811183983,
"grad_norm": 0.0206298828125,
"learning_rate": 8.904694167852064e-06,
"loss": 0.0001,
"step": 413
},
{
"epoch": 0.5716258198136003,
"grad_norm": 0.01483154296875,
"learning_rate": 8.87624466571835e-06,
"loss": 0.0001,
"step": 414
},
{
"epoch": 0.5730065585088022,
"grad_norm": 0.018798828125,
"learning_rate": 8.847795163584639e-06,
"loss": 0.0002,
"step": 415
},
{
"epoch": 0.5743872972040042,
"grad_norm": 0.040771484375,
"learning_rate": 8.819345661450926e-06,
"loss": 0.0002,
"step": 416
},
{
"epoch": 0.5757680358992061,
"grad_norm": 0.015380859375,
"learning_rate": 8.790896159317212e-06,
"loss": 0.0001,
"step": 417
},
{
"epoch": 0.577148774594408,
"grad_norm": 0.01495361328125,
"learning_rate": 8.7624466571835e-06,
"loss": 0.0001,
"step": 418
},
{
"epoch": 0.5785295132896099,
"grad_norm": 0.0152587890625,
"learning_rate": 8.733997155049787e-06,
"loss": 0.0001,
"step": 419
},
{
"epoch": 0.5799102519848118,
"grad_norm": 0.03515625,
"learning_rate": 8.705547652916074e-06,
"loss": 0.0002,
"step": 420
},
{
"epoch": 0.5812909906800138,
"grad_norm": 0.0167236328125,
"learning_rate": 8.677098150782361e-06,
"loss": 0.0002,
"step": 421
},
{
"epoch": 0.5826717293752157,
"grad_norm": 0.0263671875,
"learning_rate": 8.64864864864865e-06,
"loss": 0.0002,
"step": 422
},
{
"epoch": 0.5840524680704177,
"grad_norm": 0.015625,
"learning_rate": 8.620199146514938e-06,
"loss": 0.0001,
"step": 423
},
{
"epoch": 0.5854332067656196,
"grad_norm": 0.0198974609375,
"learning_rate": 8.591749644381224e-06,
"loss": 0.0002,
"step": 424
},
{
"epoch": 0.5868139454608216,
"grad_norm": 0.0225830078125,
"learning_rate": 8.563300142247511e-06,
"loss": 0.0002,
"step": 425
},
{
"epoch": 0.5881946841560235,
"grad_norm": 0.03125,
"learning_rate": 8.534850640113798e-06,
"loss": 0.0002,
"step": 426
},
{
"epoch": 0.5895754228512254,
"grad_norm": 0.01470947265625,
"learning_rate": 8.506401137980086e-06,
"loss": 0.0001,
"step": 427
},
{
"epoch": 0.5909561615464274,
"grad_norm": 0.02392578125,
"learning_rate": 8.477951635846373e-06,
"loss": 0.0002,
"step": 428
},
{
"epoch": 0.5923369002416292,
"grad_norm": 0.01531982421875,
"learning_rate": 8.44950213371266e-06,
"loss": 0.0001,
"step": 429
},
{
"epoch": 0.5937176389368312,
"grad_norm": 0.01446533203125,
"learning_rate": 8.421052631578948e-06,
"loss": 0.0001,
"step": 430
},
{
"epoch": 0.5950983776320331,
"grad_norm": 0.053466796875,
"learning_rate": 8.392603129445237e-06,
"loss": 0.0002,
"step": 431
},
{
"epoch": 0.596479116327235,
"grad_norm": 0.017578125,
"learning_rate": 8.364153627311523e-06,
"loss": 0.0001,
"step": 432
},
{
"epoch": 0.596479116327235,
"eval_loss": 0.0002615667472127825,
"eval_runtime": 581.6024,
"eval_samples_per_second": 2.215,
"eval_steps_per_second": 2.215,
"step": 432
},
{
"epoch": 0.597859855022437,
"grad_norm": 0.01483154296875,
"learning_rate": 8.33570412517781e-06,
"loss": 0.0001,
"step": 433
},
{
"epoch": 0.5992405937176389,
"grad_norm": 0.0211181640625,
"learning_rate": 8.307254623044097e-06,
"loss": 0.0002,
"step": 434
},
{
"epoch": 0.6006213324128409,
"grad_norm": 0.01953125,
"learning_rate": 8.278805120910385e-06,
"loss": 0.0002,
"step": 435
},
{
"epoch": 0.6020020711080428,
"grad_norm": 0.0157470703125,
"learning_rate": 8.250355618776672e-06,
"loss": 0.0001,
"step": 436
},
{
"epoch": 0.6033828098032448,
"grad_norm": 0.08447265625,
"learning_rate": 8.221906116642959e-06,
"loss": 0.0005,
"step": 437
},
{
"epoch": 0.6047635484984467,
"grad_norm": 0.01470947265625,
"learning_rate": 8.193456614509247e-06,
"loss": 0.0001,
"step": 438
},
{
"epoch": 0.6061442871936487,
"grad_norm": 0.015869140625,
"learning_rate": 8.165007112375534e-06,
"loss": 0.0002,
"step": 439
},
{
"epoch": 0.6075250258888505,
"grad_norm": 0.0162353515625,
"learning_rate": 8.13655761024182e-06,
"loss": 0.0001,
"step": 440
},
{
"epoch": 0.6089057645840524,
"grad_norm": 0.0223388671875,
"learning_rate": 8.108108108108109e-06,
"loss": 0.0002,
"step": 441
},
{
"epoch": 0.6102865032792544,
"grad_norm": 0.062255859375,
"learning_rate": 8.079658605974396e-06,
"loss": 0.0005,
"step": 442
},
{
"epoch": 0.6116672419744563,
"grad_norm": 0.020263671875,
"learning_rate": 8.051209103840684e-06,
"loss": 0.0001,
"step": 443
},
{
"epoch": 0.6130479806696583,
"grad_norm": 0.01544189453125,
"learning_rate": 8.022759601706971e-06,
"loss": 0.0001,
"step": 444
},
{
"epoch": 0.6144287193648602,
"grad_norm": 0.6953125,
"learning_rate": 7.994310099573258e-06,
"loss": 0.0031,
"step": 445
},
{
"epoch": 0.6158094580600622,
"grad_norm": 0.0262451171875,
"learning_rate": 7.965860597439546e-06,
"loss": 0.0002,
"step": 446
},
{
"epoch": 0.6171901967552641,
"grad_norm": 0.060791015625,
"learning_rate": 7.937411095305833e-06,
"loss": 0.0003,
"step": 447
},
{
"epoch": 0.618570935450466,
"grad_norm": 0.0172119140625,
"learning_rate": 7.90896159317212e-06,
"loss": 0.0002,
"step": 448
},
{
"epoch": 0.619951674145668,
"grad_norm": 0.0146484375,
"learning_rate": 7.880512091038406e-06,
"loss": 0.0001,
"step": 449
},
{
"epoch": 0.6213324128408698,
"grad_norm": 0.014892578125,
"learning_rate": 7.852062588904695e-06,
"loss": 0.0001,
"step": 450
},
{
"epoch": 0.6227131515360718,
"grad_norm": 0.0145263671875,
"learning_rate": 7.823613086770983e-06,
"loss": 0.0001,
"step": 451
},
{
"epoch": 0.6240938902312737,
"grad_norm": 0.014892578125,
"learning_rate": 7.79516358463727e-06,
"loss": 0.0001,
"step": 452
},
{
"epoch": 0.6254746289264757,
"grad_norm": 0.017333984375,
"learning_rate": 7.766714082503557e-06,
"loss": 0.0001,
"step": 453
},
{
"epoch": 0.6268553676216776,
"grad_norm": 0.25390625,
"learning_rate": 7.738264580369845e-06,
"loss": 0.002,
"step": 454
},
{
"epoch": 0.6282361063168795,
"grad_norm": 0.73828125,
"learning_rate": 7.709815078236132e-06,
"loss": 0.0019,
"step": 455
},
{
"epoch": 0.6296168450120815,
"grad_norm": 0.01531982421875,
"learning_rate": 7.681365576102418e-06,
"loss": 0.0001,
"step": 456
},
{
"epoch": 0.6309975837072834,
"grad_norm": 0.015625,
"learning_rate": 7.652916073968705e-06,
"loss": 0.0001,
"step": 457
},
{
"epoch": 0.6323783224024854,
"grad_norm": 0.01531982421875,
"learning_rate": 7.624466571834993e-06,
"loss": 0.0001,
"step": 458
},
{
"epoch": 0.6337590610976873,
"grad_norm": 0.014404296875,
"learning_rate": 7.596017069701281e-06,
"loss": 0.0001,
"step": 459
},
{
"epoch": 0.6351397997928891,
"grad_norm": 2.171875,
"learning_rate": 7.567567567567569e-06,
"loss": 0.0005,
"step": 460
},
{
"epoch": 0.6365205384880911,
"grad_norm": 0.0191650390625,
"learning_rate": 7.5391180654338555e-06,
"loss": 0.0002,
"step": 461
},
{
"epoch": 0.637901277183293,
"grad_norm": 0.018798828125,
"learning_rate": 7.510668563300143e-06,
"loss": 0.0002,
"step": 462
},
{
"epoch": 0.639282015878495,
"grad_norm": 0.408203125,
"learning_rate": 7.482219061166431e-06,
"loss": 0.0017,
"step": 463
},
{
"epoch": 0.6406627545736969,
"grad_norm": 0.013916015625,
"learning_rate": 7.453769559032717e-06,
"loss": 0.0001,
"step": 464
},
{
"epoch": 0.6420434932688989,
"grad_norm": 0.01806640625,
"learning_rate": 7.425320056899005e-06,
"loss": 0.0001,
"step": 465
},
{
"epoch": 0.6434242319641008,
"grad_norm": 0.013916015625,
"learning_rate": 7.396870554765292e-06,
"loss": 0.0001,
"step": 466
},
{
"epoch": 0.6448049706593028,
"grad_norm": 0.06201171875,
"learning_rate": 7.368421052631579e-06,
"loss": 0.0004,
"step": 467
},
{
"epoch": 0.6461857093545047,
"grad_norm": 0.0179443359375,
"learning_rate": 7.339971550497866e-06,
"loss": 0.0001,
"step": 468
},
{
"epoch": 0.6475664480497066,
"grad_norm": 0.0225830078125,
"learning_rate": 7.3115220483641544e-06,
"loss": 0.0002,
"step": 469
},
{
"epoch": 0.6489471867449085,
"grad_norm": 0.0242919921875,
"learning_rate": 7.283072546230442e-06,
"loss": 0.0002,
"step": 470
},
{
"epoch": 0.6503279254401104,
"grad_norm": 0.0242919921875,
"learning_rate": 7.254623044096729e-06,
"loss": 0.0002,
"step": 471
},
{
"epoch": 0.6517086641353124,
"grad_norm": 0.01446533203125,
"learning_rate": 7.226173541963016e-06,
"loss": 0.0001,
"step": 472
},
{
"epoch": 0.6530894028305143,
"grad_norm": 0.039306640625,
"learning_rate": 7.197724039829304e-06,
"loss": 0.0003,
"step": 473
},
{
"epoch": 0.6544701415257163,
"grad_norm": 0.1201171875,
"learning_rate": 7.169274537695591e-06,
"loss": 0.0002,
"step": 474
},
{
"epoch": 0.6558508802209182,
"grad_norm": 0.01483154296875,
"learning_rate": 7.140825035561878e-06,
"loss": 0.0001,
"step": 475
},
{
"epoch": 0.6572316189161201,
"grad_norm": 0.035888671875,
"learning_rate": 7.112375533428165e-06,
"loss": 0.0002,
"step": 476
},
{
"epoch": 0.6586123576113221,
"grad_norm": 0.08447265625,
"learning_rate": 7.0839260312944525e-06,
"loss": 0.0004,
"step": 477
},
{
"epoch": 0.659993096306524,
"grad_norm": 0.01556396484375,
"learning_rate": 7.055476529160741e-06,
"loss": 0.0001,
"step": 478
},
{
"epoch": 0.661373835001726,
"grad_norm": 0.0240478515625,
"learning_rate": 7.027027027027028e-06,
"loss": 0.0002,
"step": 479
},
{
"epoch": 0.6627545736969278,
"grad_norm": 0.0181884765625,
"learning_rate": 6.998577524893315e-06,
"loss": 0.0001,
"step": 480
},
{
"epoch": 0.6641353123921298,
"grad_norm": 0.0145263671875,
"learning_rate": 6.970128022759603e-06,
"loss": 0.0001,
"step": 481
},
{
"epoch": 0.6655160510873317,
"grad_norm": 0.029541015625,
"learning_rate": 6.9416785206258896e-06,
"loss": 0.0002,
"step": 482
},
{
"epoch": 0.6668967897825336,
"grad_norm": 0.04736328125,
"learning_rate": 6.913229018492177e-06,
"loss": 0.0003,
"step": 483
},
{
"epoch": 0.6682775284777356,
"grad_norm": 0.0242919921875,
"learning_rate": 6.884779516358464e-06,
"loss": 0.0002,
"step": 484
},
{
"epoch": 0.6696582671729375,
"grad_norm": 0.01611328125,
"learning_rate": 6.8563300142247514e-06,
"loss": 0.0002,
"step": 485
},
{
"epoch": 0.6710390058681395,
"grad_norm": 0.11474609375,
"learning_rate": 6.827880512091038e-06,
"loss": 0.001,
"step": 486
},
{
"epoch": 0.6724197445633414,
"grad_norm": 0.0230712890625,
"learning_rate": 6.799431009957327e-06,
"loss": 0.0002,
"step": 487
},
{
"epoch": 0.6738004832585434,
"grad_norm": 0.11083984375,
"learning_rate": 6.770981507823614e-06,
"loss": 0.0005,
"step": 488
},
{
"epoch": 0.6751812219537453,
"grad_norm": 0.0184326171875,
"learning_rate": 6.742532005689901e-06,
"loss": 0.0001,
"step": 489
},
{
"epoch": 0.6765619606489471,
"grad_norm": 0.01409912109375,
"learning_rate": 6.7140825035561885e-06,
"loss": 0.0001,
"step": 490
},
{
"epoch": 0.6779426993441491,
"grad_norm": 0.01531982421875,
"learning_rate": 6.685633001422476e-06,
"loss": 0.0001,
"step": 491
},
{
"epoch": 0.679323438039351,
"grad_norm": 0.055908203125,
"learning_rate": 6.657183499288763e-06,
"loss": 0.0004,
"step": 492
},
{
"epoch": 0.680704176734553,
"grad_norm": 0.0152587890625,
"learning_rate": 6.62873399715505e-06,
"loss": 0.0001,
"step": 493
},
{
"epoch": 0.6820849154297549,
"grad_norm": 0.01458740234375,
"learning_rate": 6.600284495021337e-06,
"loss": 0.0001,
"step": 494
},
{
"epoch": 0.6834656541249569,
"grad_norm": 0.0244140625,
"learning_rate": 6.571834992887625e-06,
"loss": 0.0002,
"step": 495
},
{
"epoch": 0.6848463928201588,
"grad_norm": 0.1591796875,
"learning_rate": 6.543385490753912e-06,
"loss": 0.0003,
"step": 496
},
{
"epoch": 0.6862271315153607,
"grad_norm": 0.01397705078125,
"learning_rate": 6.5149359886202e-06,
"loss": 0.0001,
"step": 497
},
{
"epoch": 0.6876078702105627,
"grad_norm": 0.0142822265625,
"learning_rate": 6.486486486486487e-06,
"loss": 0.0001,
"step": 498
},
{
"epoch": 0.6889886089057646,
"grad_norm": 0.013916015625,
"learning_rate": 6.458036984352775e-06,
"loss": 0.0001,
"step": 499
},
{
"epoch": 0.6903693476009665,
"grad_norm": 0.07470703125,
"learning_rate": 6.429587482219062e-06,
"loss": 0.0002,
"step": 500
},
{
"epoch": 0.6917500862961684,
"grad_norm": 0.018798828125,
"learning_rate": 6.401137980085349e-06,
"loss": 0.0002,
"step": 501
},
{
"epoch": 0.6931308249913704,
"grad_norm": 0.0206298828125,
"learning_rate": 6.372688477951636e-06,
"loss": 0.0002,
"step": 502
},
{
"epoch": 0.6945115636865723,
"grad_norm": 0.01544189453125,
"learning_rate": 6.344238975817924e-06,
"loss": 0.0001,
"step": 503
},
{
"epoch": 0.6958923023817742,
"grad_norm": 0.0137939453125,
"learning_rate": 6.31578947368421e-06,
"loss": 0.0001,
"step": 504
},
{
"epoch": 0.6958923023817742,
"eval_loss": 0.00025013022241182625,
"eval_runtime": 580.5157,
"eval_samples_per_second": 2.219,
"eval_steps_per_second": 2.219,
"step": 504
},
{
"epoch": 0.6972730410769762,
"grad_norm": 0.1513671875,
"learning_rate": 6.287339971550498e-06,
"loss": 0.0005,
"step": 505
},
{
"epoch": 0.6986537797721781,
"grad_norm": 0.01806640625,
"learning_rate": 6.258890469416786e-06,
"loss": 0.0001,
"step": 506
},
{
"epoch": 0.7000345184673801,
"grad_norm": 0.0169677734375,
"learning_rate": 6.230440967283073e-06,
"loss": 0.0001,
"step": 507
},
{
"epoch": 0.701415257162582,
"grad_norm": 0.014892578125,
"learning_rate": 6.201991465149361e-06,
"loss": 0.0001,
"step": 508
},
{
"epoch": 0.702795995857784,
"grad_norm": 0.30078125,
"learning_rate": 6.173541963015648e-06,
"loss": 0.0009,
"step": 509
},
{
"epoch": 0.7041767345529858,
"grad_norm": 0.01373291015625,
"learning_rate": 6.145092460881935e-06,
"loss": 0.0001,
"step": 510
},
{
"epoch": 0.7055574732481877,
"grad_norm": 0.0164794921875,
"learning_rate": 6.1166429587482225e-06,
"loss": 0.0001,
"step": 511
},
{
"epoch": 0.7069382119433897,
"grad_norm": 0.016357421875,
"learning_rate": 6.088193456614509e-06,
"loss": 0.0001,
"step": 512
},
{
"epoch": 0.7083189506385916,
"grad_norm": 0.0181884765625,
"learning_rate": 6.059743954480797e-06,
"loss": 0.0001,
"step": 513
},
{
"epoch": 0.7096996893337936,
"grad_norm": 0.0230712890625,
"learning_rate": 6.031294452347084e-06,
"loss": 0.0002,
"step": 514
},
{
"epoch": 0.7110804280289955,
"grad_norm": 0.0146484375,
"learning_rate": 6.002844950213371e-06,
"loss": 0.0001,
"step": 515
},
{
"epoch": 0.7124611667241975,
"grad_norm": 0.11328125,
"learning_rate": 5.9743954480796596e-06,
"loss": 0.0007,
"step": 516
},
{
"epoch": 0.7138419054193994,
"grad_norm": 0.02099609375,
"learning_rate": 5.945945945945947e-06,
"loss": 0.0001,
"step": 517
},
{
"epoch": 0.7152226441146013,
"grad_norm": 0.015625,
"learning_rate": 5.917496443812234e-06,
"loss": 0.0001,
"step": 518
},
{
"epoch": 0.7166033828098033,
"grad_norm": 0.01422119140625,
"learning_rate": 5.8890469416785214e-06,
"loss": 0.0001,
"step": 519
},
{
"epoch": 0.7179841215050051,
"grad_norm": 0.01361083984375,
"learning_rate": 5.860597439544808e-06,
"loss": 0.0001,
"step": 520
},
{
"epoch": 0.7193648602002071,
"grad_norm": 0.013916015625,
"learning_rate": 5.832147937411096e-06,
"loss": 0.0001,
"step": 521
},
{
"epoch": 0.720745598895409,
"grad_norm": 0.044677734375,
"learning_rate": 5.8036984352773825e-06,
"loss": 0.0002,
"step": 522
},
{
"epoch": 0.722126337590611,
"grad_norm": 0.01611328125,
"learning_rate": 5.77524893314367e-06,
"loss": 0.0001,
"step": 523
},
{
"epoch": 0.7235070762858129,
"grad_norm": 0.01806640625,
"learning_rate": 5.746799431009958e-06,
"loss": 0.0001,
"step": 524
},
{
"epoch": 0.7248878149810148,
"grad_norm": 0.0137939453125,
"learning_rate": 5.718349928876245e-06,
"loss": 0.0001,
"step": 525
},
{
"epoch": 0.7262685536762168,
"grad_norm": 0.07861328125,
"learning_rate": 5.689900426742533e-06,
"loss": 0.0002,
"step": 526
},
{
"epoch": 0.7276492923714187,
"grad_norm": 0.12060546875,
"learning_rate": 5.66145092460882e-06,
"loss": 0.0005,
"step": 527
},
{
"epoch": 0.7290300310666207,
"grad_norm": 0.0137939453125,
"learning_rate": 5.633001422475107e-06,
"loss": 0.0001,
"step": 528
},
{
"epoch": 0.7304107697618226,
"grad_norm": 0.0162353515625,
"learning_rate": 5.604551920341395e-06,
"loss": 0.0001,
"step": 529
},
{
"epoch": 0.7317915084570245,
"grad_norm": 0.017333984375,
"learning_rate": 5.576102418207681e-06,
"loss": 0.0002,
"step": 530
},
{
"epoch": 0.7331722471522264,
"grad_norm": 0.0458984375,
"learning_rate": 5.547652916073969e-06,
"loss": 0.0003,
"step": 531
},
{
"epoch": 0.7345529858474283,
"grad_norm": 0.01373291015625,
"learning_rate": 5.5192034139402566e-06,
"loss": 0.0001,
"step": 532
},
{
"epoch": 0.7359337245426303,
"grad_norm": 0.01397705078125,
"learning_rate": 5.490753911806543e-06,
"loss": 0.0001,
"step": 533
},
{
"epoch": 0.7373144632378322,
"grad_norm": 0.01361083984375,
"learning_rate": 5.462304409672832e-06,
"loss": 0.0001,
"step": 534
},
{
"epoch": 0.7386952019330342,
"grad_norm": 0.0634765625,
"learning_rate": 5.433854907539119e-06,
"loss": 0.0003,
"step": 535
},
{
"epoch": 0.7400759406282361,
"grad_norm": 0.12353515625,
"learning_rate": 5.405405405405406e-06,
"loss": 0.0012,
"step": 536
},
{
"epoch": 0.7414566793234381,
"grad_norm": 0.0157470703125,
"learning_rate": 5.376955903271694e-06,
"loss": 0.0001,
"step": 537
},
{
"epoch": 0.74283741801864,
"grad_norm": 0.01416015625,
"learning_rate": 5.34850640113798e-06,
"loss": 0.0001,
"step": 538
},
{
"epoch": 0.744218156713842,
"grad_norm": 0.11376953125,
"learning_rate": 5.320056899004268e-06,
"loss": 0.0003,
"step": 539
},
{
"epoch": 0.7455988954090439,
"grad_norm": 0.01373291015625,
"learning_rate": 5.291607396870555e-06,
"loss": 0.0001,
"step": 540
},
{
"epoch": 0.7469796341042457,
"grad_norm": 0.08984375,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0003,
"step": 541
},
{
"epoch": 0.7483603727994477,
"grad_norm": 0.0137939453125,
"learning_rate": 5.23470839260313e-06,
"loss": 0.0001,
"step": 542
},
{
"epoch": 0.7497411114946496,
"grad_norm": 0.048583984375,
"learning_rate": 5.2062588904694165e-06,
"loss": 0.0004,
"step": 543
},
{
"epoch": 0.7511218501898516,
"grad_norm": 0.0135498046875,
"learning_rate": 5.177809388335705e-06,
"loss": 0.0001,
"step": 544
},
{
"epoch": 0.7525025888850535,
"grad_norm": 0.01397705078125,
"learning_rate": 5.1493598862019925e-06,
"loss": 0.0001,
"step": 545
},
{
"epoch": 0.7538833275802554,
"grad_norm": 0.01446533203125,
"learning_rate": 5.120910384068279e-06,
"loss": 0.0001,
"step": 546
},
{
"epoch": 0.7552640662754574,
"grad_norm": 0.040283203125,
"learning_rate": 5.092460881934567e-06,
"loss": 0.0002,
"step": 547
},
{
"epoch": 0.7566448049706593,
"grad_norm": 0.01422119140625,
"learning_rate": 5.0640113798008536e-06,
"loss": 0.0001,
"step": 548
},
{
"epoch": 0.7580255436658613,
"grad_norm": 0.01361083984375,
"learning_rate": 5.035561877667141e-06,
"loss": 0.0001,
"step": 549
},
{
"epoch": 0.7594062823610632,
"grad_norm": 0.013916015625,
"learning_rate": 5.007112375533429e-06,
"loss": 0.0001,
"step": 550
},
{
"epoch": 0.7607870210562651,
"grad_norm": 0.03857421875,
"learning_rate": 4.978662873399716e-06,
"loss": 0.0002,
"step": 551
},
{
"epoch": 0.762167759751467,
"grad_norm": 0.013671875,
"learning_rate": 4.950213371266003e-06,
"loss": 0.0001,
"step": 552
},
{
"epoch": 0.7635484984466689,
"grad_norm": 0.0205078125,
"learning_rate": 4.921763869132291e-06,
"loss": 0.0001,
"step": 553
},
{
"epoch": 0.7649292371418709,
"grad_norm": 0.0145263671875,
"learning_rate": 4.893314366998577e-06,
"loss": 0.0001,
"step": 554
},
{
"epoch": 0.7663099758370728,
"grad_norm": 0.01336669921875,
"learning_rate": 4.864864864864866e-06,
"loss": 0.0001,
"step": 555
},
{
"epoch": 0.7676907145322748,
"grad_norm": 0.01336669921875,
"learning_rate": 4.8364153627311525e-06,
"loss": 0.0001,
"step": 556
},
{
"epoch": 0.7690714532274767,
"grad_norm": 0.01336669921875,
"learning_rate": 4.80796586059744e-06,
"loss": 0.0001,
"step": 557
},
{
"epoch": 0.7704521919226787,
"grad_norm": 0.01324462890625,
"learning_rate": 4.779516358463727e-06,
"loss": 0.0001,
"step": 558
},
{
"epoch": 0.7718329306178806,
"grad_norm": 0.01361083984375,
"learning_rate": 4.751066856330014e-06,
"loss": 0.0001,
"step": 559
},
{
"epoch": 0.7732136693130826,
"grad_norm": 0.01348876953125,
"learning_rate": 4.722617354196302e-06,
"loss": 0.0001,
"step": 560
},
{
"epoch": 0.7745944080082844,
"grad_norm": 0.0142822265625,
"learning_rate": 4.6941678520625895e-06,
"loss": 0.0001,
"step": 561
},
{
"epoch": 0.7759751467034863,
"grad_norm": 0.119140625,
"learning_rate": 4.665718349928876e-06,
"loss": 0.0007,
"step": 562
},
{
"epoch": 0.7773558853986883,
"grad_norm": 0.0223388671875,
"learning_rate": 4.637268847795164e-06,
"loss": 0.0002,
"step": 563
},
{
"epoch": 0.7787366240938902,
"grad_norm": 0.01318359375,
"learning_rate": 4.608819345661451e-06,
"loss": 0.0001,
"step": 564
},
{
"epoch": 0.7801173627890922,
"grad_norm": 0.013671875,
"learning_rate": 4.580369843527739e-06,
"loss": 0.0001,
"step": 565
},
{
"epoch": 0.7814981014842941,
"grad_norm": 0.013427734375,
"learning_rate": 4.551920341394026e-06,
"loss": 0.0001,
"step": 566
},
{
"epoch": 0.782878840179496,
"grad_norm": 0.0174560546875,
"learning_rate": 4.523470839260313e-06,
"loss": 0.0001,
"step": 567
},
{
"epoch": 0.784259578874698,
"grad_norm": 0.0203857421875,
"learning_rate": 4.495021337126601e-06,
"loss": 0.0001,
"step": 568
},
{
"epoch": 0.7856403175698999,
"grad_norm": 0.013427734375,
"learning_rate": 4.4665718349928885e-06,
"loss": 0.0001,
"step": 569
},
{
"epoch": 0.7870210562651019,
"grad_norm": 0.01318359375,
"learning_rate": 4.438122332859175e-06,
"loss": 0.0001,
"step": 570
},
{
"epoch": 0.7884017949603037,
"grad_norm": 0.162109375,
"learning_rate": 4.409672830725463e-06,
"loss": 0.0005,
"step": 571
},
{
"epoch": 0.7897825336555057,
"grad_norm": 0.0133056640625,
"learning_rate": 4.38122332859175e-06,
"loss": 0.0001,
"step": 572
},
{
"epoch": 0.7911632723507076,
"grad_norm": 0.013427734375,
"learning_rate": 4.352773826458037e-06,
"loss": 0.0001,
"step": 573
},
{
"epoch": 0.7925440110459095,
"grad_norm": 0.053466796875,
"learning_rate": 4.324324324324325e-06,
"loss": 0.0003,
"step": 574
},
{
"epoch": 0.7939247497411115,
"grad_norm": 0.01373291015625,
"learning_rate": 4.295874822190612e-06,
"loss": 0.0001,
"step": 575
},
{
"epoch": 0.7953054884363134,
"grad_norm": 0.0224609375,
"learning_rate": 4.267425320056899e-06,
"loss": 0.0002,
"step": 576
},
{
"epoch": 0.7953054884363134,
"eval_loss": 0.0002442169061396271,
"eval_runtime": 582.9379,
"eval_samples_per_second": 2.209,
"eval_steps_per_second": 2.209,
"step": 576
},
{
"epoch": 0.7966862271315154,
"grad_norm": 0.01361083984375,
"learning_rate": 4.2389758179231865e-06,
"loss": 0.0001,
"step": 577
},
{
"epoch": 0.7980669658267173,
"grad_norm": 0.01953125,
"learning_rate": 4.210526315789474e-06,
"loss": 0.0002,
"step": 578
},
{
"epoch": 0.7994477045219193,
"grad_norm": 0.0201416015625,
"learning_rate": 4.182076813655762e-06,
"loss": 0.0002,
"step": 579
},
{
"epoch": 0.8008284432171212,
"grad_norm": 0.01336669921875,
"learning_rate": 4.1536273115220484e-06,
"loss": 0.0001,
"step": 580
},
{
"epoch": 0.802209181912323,
"grad_norm": 0.0130615234375,
"learning_rate": 4.125177809388336e-06,
"loss": 0.0001,
"step": 581
},
{
"epoch": 0.803589920607525,
"grad_norm": 0.0133056640625,
"learning_rate": 4.096728307254624e-06,
"loss": 0.0001,
"step": 582
},
{
"epoch": 0.8049706593027269,
"grad_norm": 0.01373291015625,
"learning_rate": 4.06827880512091e-06,
"loss": 0.0001,
"step": 583
},
{
"epoch": 0.8063513979979289,
"grad_norm": 0.03173828125,
"learning_rate": 4.039829302987198e-06,
"loss": 0.0002,
"step": 584
},
{
"epoch": 0.8077321366931308,
"grad_norm": 0.212890625,
"learning_rate": 4.0113798008534855e-06,
"loss": 0.001,
"step": 585
},
{
"epoch": 0.8091128753883328,
"grad_norm": 0.0439453125,
"learning_rate": 3.982930298719773e-06,
"loss": 0.0004,
"step": 586
},
{
"epoch": 0.8104936140835347,
"grad_norm": 0.04443359375,
"learning_rate": 3.95448079658606e-06,
"loss": 0.0003,
"step": 587
},
{
"epoch": 0.8118743527787367,
"grad_norm": 0.078125,
"learning_rate": 3.926031294452347e-06,
"loss": 0.0004,
"step": 588
},
{
"epoch": 0.8132550914739386,
"grad_norm": 0.0233154296875,
"learning_rate": 3.897581792318635e-06,
"loss": 0.0002,
"step": 589
},
{
"epoch": 0.8146358301691405,
"grad_norm": 0.01416015625,
"learning_rate": 3.8691322901849225e-06,
"loss": 0.0001,
"step": 590
},
{
"epoch": 0.8160165688643424,
"grad_norm": 0.044677734375,
"learning_rate": 3.840682788051209e-06,
"loss": 0.0002,
"step": 591
},
{
"epoch": 0.8173973075595443,
"grad_norm": 0.17578125,
"learning_rate": 3.8122332859174964e-06,
"loss": 0.0004,
"step": 592
},
{
"epoch": 0.8187780462547463,
"grad_norm": 0.255859375,
"learning_rate": 3.7837837837837844e-06,
"loss": 0.0003,
"step": 593
},
{
"epoch": 0.8201587849499482,
"grad_norm": 0.01416015625,
"learning_rate": 3.7553342816500715e-06,
"loss": 0.0001,
"step": 594
},
{
"epoch": 0.8215395236451501,
"grad_norm": 0.0137939453125,
"learning_rate": 3.7268847795163587e-06,
"loss": 0.0001,
"step": 595
},
{
"epoch": 0.8229202623403521,
"grad_norm": 0.01373291015625,
"learning_rate": 3.698435277382646e-06,
"loss": 0.0001,
"step": 596
},
{
"epoch": 0.824301001035554,
"grad_norm": 0.01385498046875,
"learning_rate": 3.669985775248933e-06,
"loss": 0.0001,
"step": 597
},
{
"epoch": 0.825681739730756,
"grad_norm": 0.0133056640625,
"learning_rate": 3.641536273115221e-06,
"loss": 0.0001,
"step": 598
},
{
"epoch": 0.8270624784259579,
"grad_norm": 0.04248046875,
"learning_rate": 3.613086770981508e-06,
"loss": 0.0003,
"step": 599
},
{
"epoch": 0.8284432171211599,
"grad_norm": 0.126953125,
"learning_rate": 3.5846372688477953e-06,
"loss": 0.0013,
"step": 600
},
{
"epoch": 0.8298239558163617,
"grad_norm": 0.0145263671875,
"learning_rate": 3.5561877667140825e-06,
"loss": 0.0001,
"step": 601
},
{
"epoch": 0.8312046945115636,
"grad_norm": 0.043212890625,
"learning_rate": 3.5277382645803705e-06,
"loss": 0.0004,
"step": 602
},
{
"epoch": 0.8325854332067656,
"grad_norm": 0.02099609375,
"learning_rate": 3.4992887624466576e-06,
"loss": 0.0001,
"step": 603
},
{
"epoch": 0.8339661719019675,
"grad_norm": 0.55859375,
"learning_rate": 3.4708392603129448e-06,
"loss": 0.0004,
"step": 604
},
{
"epoch": 0.8353469105971695,
"grad_norm": 0.045166015625,
"learning_rate": 3.442389758179232e-06,
"loss": 0.0004,
"step": 605
},
{
"epoch": 0.8367276492923714,
"grad_norm": 0.080078125,
"learning_rate": 3.413940256045519e-06,
"loss": 0.0002,
"step": 606
},
{
"epoch": 0.8381083879875734,
"grad_norm": 0.01416015625,
"learning_rate": 3.385490753911807e-06,
"loss": 0.0001,
"step": 607
},
{
"epoch": 0.8394891266827753,
"grad_norm": 0.01336669921875,
"learning_rate": 3.3570412517780942e-06,
"loss": 0.0001,
"step": 608
},
{
"epoch": 0.8408698653779773,
"grad_norm": 0.048828125,
"learning_rate": 3.3285917496443814e-06,
"loss": 0.0002,
"step": 609
},
{
"epoch": 0.8422506040731792,
"grad_norm": 0.016357421875,
"learning_rate": 3.3001422475106685e-06,
"loss": 0.0001,
"step": 610
},
{
"epoch": 0.843631342768381,
"grad_norm": 0.146484375,
"learning_rate": 3.271692745376956e-06,
"loss": 0.0008,
"step": 611
},
{
"epoch": 0.845012081463583,
"grad_norm": 0.01336669921875,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.0001,
"step": 612
},
{
"epoch": 0.8463928201587849,
"grad_norm": 0.01336669921875,
"learning_rate": 3.214793741109531e-06,
"loss": 0.0001,
"step": 613
},
{
"epoch": 0.8477735588539869,
"grad_norm": 0.072265625,
"learning_rate": 3.186344238975818e-06,
"loss": 0.0002,
"step": 614
},
{
"epoch": 0.8491542975491888,
"grad_norm": 0.0133056640625,
"learning_rate": 3.157894736842105e-06,
"loss": 0.0001,
"step": 615
},
{
"epoch": 0.8505350362443908,
"grad_norm": 0.01397705078125,
"learning_rate": 3.129445234708393e-06,
"loss": 0.0001,
"step": 616
},
{
"epoch": 0.8519157749395927,
"grad_norm": 0.05615234375,
"learning_rate": 3.1009957325746803e-06,
"loss": 0.0002,
"step": 617
},
{
"epoch": 0.8532965136347946,
"grad_norm": 0.0257568359375,
"learning_rate": 3.0725462304409675e-06,
"loss": 0.0001,
"step": 618
},
{
"epoch": 0.8546772523299966,
"grad_norm": 0.21875,
"learning_rate": 3.0440967283072546e-06,
"loss": 0.0014,
"step": 619
},
{
"epoch": 0.8560579910251985,
"grad_norm": 0.013671875,
"learning_rate": 3.015647226173542e-06,
"loss": 0.0001,
"step": 620
},
{
"epoch": 0.8574387297204004,
"grad_norm": 0.08203125,
"learning_rate": 2.9871977240398298e-06,
"loss": 0.0003,
"step": 621
},
{
"epoch": 0.8588194684156023,
"grad_norm": 0.0135498046875,
"learning_rate": 2.958748221906117e-06,
"loss": 0.0001,
"step": 622
},
{
"epoch": 0.8602002071108042,
"grad_norm": 0.053955078125,
"learning_rate": 2.930298719772404e-06,
"loss": 0.0003,
"step": 623
},
{
"epoch": 0.8615809458060062,
"grad_norm": 0.01348876953125,
"learning_rate": 2.9018492176386912e-06,
"loss": 0.0001,
"step": 624
},
{
"epoch": 0.8629616845012081,
"grad_norm": 0.01385498046875,
"learning_rate": 2.873399715504979e-06,
"loss": 0.0001,
"step": 625
},
{
"epoch": 0.8643424231964101,
"grad_norm": 0.01348876953125,
"learning_rate": 2.8449502133712664e-06,
"loss": 0.0001,
"step": 626
},
{
"epoch": 0.865723161891612,
"grad_norm": 0.01409912109375,
"learning_rate": 2.8165007112375536e-06,
"loss": 0.0001,
"step": 627
},
{
"epoch": 0.867103900586814,
"grad_norm": 0.018798828125,
"learning_rate": 2.7880512091038407e-06,
"loss": 0.0001,
"step": 628
},
{
"epoch": 0.8684846392820159,
"grad_norm": 0.01397705078125,
"learning_rate": 2.7596017069701283e-06,
"loss": 0.0001,
"step": 629
},
{
"epoch": 0.8698653779772179,
"grad_norm": 0.01348876953125,
"learning_rate": 2.731152204836416e-06,
"loss": 0.0001,
"step": 630
},
{
"epoch": 0.8712461166724197,
"grad_norm": 0.0133056640625,
"learning_rate": 2.702702702702703e-06,
"loss": 0.0001,
"step": 631
},
{
"epoch": 0.8726268553676216,
"grad_norm": 0.012939453125,
"learning_rate": 2.67425320056899e-06,
"loss": 0.0001,
"step": 632
},
{
"epoch": 0.8740075940628236,
"grad_norm": 0.0135498046875,
"learning_rate": 2.6458036984352773e-06,
"loss": 0.0001,
"step": 633
},
{
"epoch": 0.8753883327580255,
"grad_norm": 0.01300048828125,
"learning_rate": 2.617354196301565e-06,
"loss": 0.0001,
"step": 634
},
{
"epoch": 0.8767690714532275,
"grad_norm": 0.16015625,
"learning_rate": 2.5889046941678525e-06,
"loss": 0.0004,
"step": 635
},
{
"epoch": 0.8781498101484294,
"grad_norm": 0.01336669921875,
"learning_rate": 2.5604551920341396e-06,
"loss": 0.0001,
"step": 636
},
{
"epoch": 0.8795305488436314,
"grad_norm": 0.091796875,
"learning_rate": 2.5320056899004268e-06,
"loss": 0.0006,
"step": 637
},
{
"epoch": 0.8809112875388333,
"grad_norm": 0.0189208984375,
"learning_rate": 2.5035561877667144e-06,
"loss": 0.0002,
"step": 638
},
{
"epoch": 0.8822920262340352,
"grad_norm": 0.06494140625,
"learning_rate": 2.4751066856330015e-06,
"loss": 0.0003,
"step": 639
},
{
"epoch": 0.8836727649292372,
"grad_norm": 0.01324462890625,
"learning_rate": 2.4466571834992887e-06,
"loss": 0.0001,
"step": 640
},
{
"epoch": 0.885053503624439,
"grad_norm": 0.19921875,
"learning_rate": 2.4182076813655762e-06,
"loss": 0.0013,
"step": 641
},
{
"epoch": 0.886434242319641,
"grad_norm": 0.027099609375,
"learning_rate": 2.3897581792318634e-06,
"loss": 0.0002,
"step": 642
},
{
"epoch": 0.8878149810148429,
"grad_norm": 0.1884765625,
"learning_rate": 2.361308677098151e-06,
"loss": 0.0003,
"step": 643
},
{
"epoch": 0.8891957197100449,
"grad_norm": 0.279296875,
"learning_rate": 2.332859174964438e-06,
"loss": 0.0014,
"step": 644
},
{
"epoch": 0.8905764584052468,
"grad_norm": 0.0732421875,
"learning_rate": 2.3044096728307257e-06,
"loss": 0.0002,
"step": 645
},
{
"epoch": 0.8919571971004487,
"grad_norm": 0.01361083984375,
"learning_rate": 2.275960170697013e-06,
"loss": 0.0001,
"step": 646
},
{
"epoch": 0.8933379357956507,
"grad_norm": 0.01318359375,
"learning_rate": 2.2475106685633004e-06,
"loss": 0.0001,
"step": 647
},
{
"epoch": 0.8947186744908526,
"grad_norm": 0.10302734375,
"learning_rate": 2.2190611664295876e-06,
"loss": 0.0005,
"step": 648
},
{
"epoch": 0.8947186744908526,
"eval_loss": 0.00023728572705294937,
"eval_runtime": 582.6042,
"eval_samples_per_second": 2.211,
"eval_steps_per_second": 2.211,
"step": 648
},
{
"epoch": 0.8960994131860546,
"grad_norm": 0.0244140625,
"learning_rate": 2.190611664295875e-06,
"loss": 0.0002,
"step": 649
},
{
"epoch": 0.8974801518812565,
"grad_norm": 0.0218505859375,
"learning_rate": 2.1621621621621623e-06,
"loss": 0.0002,
"step": 650
},
{
"epoch": 0.8988608905764585,
"grad_norm": 0.04736328125,
"learning_rate": 2.1337126600284495e-06,
"loss": 0.0004,
"step": 651
},
{
"epoch": 0.9002416292716603,
"grad_norm": 0.0137939453125,
"learning_rate": 2.105263157894737e-06,
"loss": 0.0001,
"step": 652
},
{
"epoch": 0.9016223679668622,
"grad_norm": 0.0196533203125,
"learning_rate": 2.0768136557610242e-06,
"loss": 0.0002,
"step": 653
},
{
"epoch": 0.9030031066620642,
"grad_norm": 0.0133056640625,
"learning_rate": 2.048364153627312e-06,
"loss": 0.0001,
"step": 654
},
{
"epoch": 0.9043838453572661,
"grad_norm": 0.01422119140625,
"learning_rate": 2.019914651493599e-06,
"loss": 0.0001,
"step": 655
},
{
"epoch": 0.9057645840524681,
"grad_norm": 0.1435546875,
"learning_rate": 1.9914651493598865e-06,
"loss": 0.0007,
"step": 656
},
{
"epoch": 0.90714532274767,
"grad_norm": 0.01531982421875,
"learning_rate": 1.9630156472261737e-06,
"loss": 0.0001,
"step": 657
},
{
"epoch": 0.908526061442872,
"grad_norm": 0.07470703125,
"learning_rate": 1.9345661450924613e-06,
"loss": 0.0003,
"step": 658
},
{
"epoch": 0.9099068001380739,
"grad_norm": 0.023681640625,
"learning_rate": 1.9061166429587482e-06,
"loss": 0.0002,
"step": 659
},
{
"epoch": 0.9112875388332758,
"grad_norm": 0.052001953125,
"learning_rate": 1.8776671408250358e-06,
"loss": 0.0002,
"step": 660
},
{
"epoch": 0.9126682775284778,
"grad_norm": 0.045654296875,
"learning_rate": 1.849217638691323e-06,
"loss": 0.0003,
"step": 661
},
{
"epoch": 0.9140490162236796,
"grad_norm": 0.1142578125,
"learning_rate": 1.8207681365576105e-06,
"loss": 0.0004,
"step": 662
},
{
"epoch": 0.9154297549188816,
"grad_norm": 0.01324462890625,
"learning_rate": 1.7923186344238977e-06,
"loss": 0.0001,
"step": 663
},
{
"epoch": 0.9168104936140835,
"grad_norm": 0.01348876953125,
"learning_rate": 1.7638691322901852e-06,
"loss": 0.0001,
"step": 664
},
{
"epoch": 0.9181912323092855,
"grad_norm": 0.0133056640625,
"learning_rate": 1.7354196301564724e-06,
"loss": 0.0001,
"step": 665
},
{
"epoch": 0.9195719710044874,
"grad_norm": 0.01544189453125,
"learning_rate": 1.7069701280227595e-06,
"loss": 0.0001,
"step": 666
},
{
"epoch": 0.9209527096996893,
"grad_norm": 0.01507568359375,
"learning_rate": 1.6785206258890471e-06,
"loss": 0.0001,
"step": 667
},
{
"epoch": 0.9223334483948913,
"grad_norm": 0.0135498046875,
"learning_rate": 1.6500711237553343e-06,
"loss": 0.0001,
"step": 668
},
{
"epoch": 0.9237141870900932,
"grad_norm": 0.0142822265625,
"learning_rate": 1.6216216216216219e-06,
"loss": 0.0001,
"step": 669
},
{
"epoch": 0.9250949257852952,
"grad_norm": 0.126953125,
"learning_rate": 1.593172119487909e-06,
"loss": 0.0012,
"step": 670
},
{
"epoch": 0.9264756644804971,
"grad_norm": 0.126953125,
"learning_rate": 1.5647226173541966e-06,
"loss": 0.0003,
"step": 671
},
{
"epoch": 0.927856403175699,
"grad_norm": 0.01312255859375,
"learning_rate": 1.5362731152204837e-06,
"loss": 0.0001,
"step": 672
},
{
"epoch": 0.9292371418709009,
"grad_norm": 0.037109375,
"learning_rate": 1.507823613086771e-06,
"loss": 0.0002,
"step": 673
},
{
"epoch": 0.9306178805661028,
"grad_norm": 0.0225830078125,
"learning_rate": 1.4793741109530585e-06,
"loss": 0.0001,
"step": 674
},
{
"epoch": 0.9319986192613048,
"grad_norm": 0.01324462890625,
"learning_rate": 1.4509246088193456e-06,
"loss": 0.0001,
"step": 675
},
{
"epoch": 0.9333793579565067,
"grad_norm": 0.01416015625,
"learning_rate": 1.4224751066856332e-06,
"loss": 0.0001,
"step": 676
},
{
"epoch": 0.9347600966517087,
"grad_norm": 0.01611328125,
"learning_rate": 1.3940256045519204e-06,
"loss": 0.0001,
"step": 677
},
{
"epoch": 0.9361408353469106,
"grad_norm": 0.3828125,
"learning_rate": 1.365576102418208e-06,
"loss": 0.0012,
"step": 678
},
{
"epoch": 0.9375215740421126,
"grad_norm": 0.0157470703125,
"learning_rate": 1.337126600284495e-06,
"loss": 0.0001,
"step": 679
},
{
"epoch": 0.9389023127373145,
"grad_norm": 0.25390625,
"learning_rate": 1.3086770981507825e-06,
"loss": 0.0017,
"step": 680
},
{
"epoch": 0.9402830514325164,
"grad_norm": 0.130859375,
"learning_rate": 1.2802275960170698e-06,
"loss": 0.0004,
"step": 681
},
{
"epoch": 0.9416637901277183,
"grad_norm": 0.0166015625,
"learning_rate": 1.2517780938833572e-06,
"loss": 0.0001,
"step": 682
},
{
"epoch": 0.9430445288229202,
"grad_norm": 0.01458740234375,
"learning_rate": 1.2233285917496443e-06,
"loss": 0.0001,
"step": 683
},
{
"epoch": 0.9444252675181222,
"grad_norm": 0.01373291015625,
"learning_rate": 1.1948790896159317e-06,
"loss": 0.0001,
"step": 684
},
{
"epoch": 0.9458060062133241,
"grad_norm": 0.01336669921875,
"learning_rate": 1.166429587482219e-06,
"loss": 0.0001,
"step": 685
},
{
"epoch": 0.9471867449085261,
"grad_norm": 0.0213623046875,
"learning_rate": 1.1379800853485064e-06,
"loss": 0.0001,
"step": 686
},
{
"epoch": 0.948567483603728,
"grad_norm": 0.038818359375,
"learning_rate": 1.1095305832147938e-06,
"loss": 0.0002,
"step": 687
},
{
"epoch": 0.94994822229893,
"grad_norm": 0.0157470703125,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.0001,
"step": 688
},
{
"epoch": 0.9513289609941319,
"grad_norm": 0.01373291015625,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.0001,
"step": 689
},
{
"epoch": 0.9527096996893338,
"grad_norm": 0.0137939453125,
"learning_rate": 1.024182076813656e-06,
"loss": 0.0001,
"step": 690
},
{
"epoch": 0.9540904383845358,
"grad_norm": 0.05078125,
"learning_rate": 9.957325746799433e-07,
"loss": 0.0002,
"step": 691
},
{
"epoch": 0.9554711770797376,
"grad_norm": 0.015869140625,
"learning_rate": 9.672830725462306e-07,
"loss": 0.0002,
"step": 692
},
{
"epoch": 0.9568519157749396,
"grad_norm": 0.042236328125,
"learning_rate": 9.388335704125179e-07,
"loss": 0.0002,
"step": 693
},
{
"epoch": 0.9582326544701415,
"grad_norm": 0.01287841796875,
"learning_rate": 9.103840682788053e-07,
"loss": 0.0001,
"step": 694
},
{
"epoch": 0.9596133931653434,
"grad_norm": 0.01422119140625,
"learning_rate": 8.819345661450926e-07,
"loss": 0.0001,
"step": 695
},
{
"epoch": 0.9609941318605454,
"grad_norm": 0.01513671875,
"learning_rate": 8.534850640113798e-07,
"loss": 0.0001,
"step": 696
},
{
"epoch": 0.9623748705557473,
"grad_norm": 0.0771484375,
"learning_rate": 8.250355618776671e-07,
"loss": 0.0003,
"step": 697
},
{
"epoch": 0.9637556092509493,
"grad_norm": 0.01300048828125,
"learning_rate": 7.965860597439545e-07,
"loss": 0.0001,
"step": 698
},
{
"epoch": 0.9651363479461512,
"grad_norm": 0.01324462890625,
"learning_rate": 7.681365576102419e-07,
"loss": 0.0001,
"step": 699
},
{
"epoch": 0.9665170866413532,
"grad_norm": 0.01336669921875,
"learning_rate": 7.396870554765292e-07,
"loss": 0.0001,
"step": 700
},
{
"epoch": 0.9678978253365551,
"grad_norm": 0.04736328125,
"learning_rate": 7.112375533428166e-07,
"loss": 0.0002,
"step": 701
},
{
"epoch": 0.9692785640317569,
"grad_norm": 0.020751953125,
"learning_rate": 6.82788051209104e-07,
"loss": 0.0002,
"step": 702
},
{
"epoch": 0.9706593027269589,
"grad_norm": 0.037841796875,
"learning_rate": 6.543385490753912e-07,
"loss": 0.0001,
"step": 703
},
{
"epoch": 0.9720400414221608,
"grad_norm": 0.01483154296875,
"learning_rate": 6.258890469416786e-07,
"loss": 0.0001,
"step": 704
},
{
"epoch": 0.9734207801173628,
"grad_norm": 0.0208740234375,
"learning_rate": 5.974395448079659e-07,
"loss": 0.0001,
"step": 705
},
{
"epoch": 0.9748015188125647,
"grad_norm": 0.01300048828125,
"learning_rate": 5.689900426742532e-07,
"loss": 0.0001,
"step": 706
},
{
"epoch": 0.9761822575077667,
"grad_norm": 0.01409912109375,
"learning_rate": 5.405405405405406e-07,
"loss": 0.0001,
"step": 707
},
{
"epoch": 0.9775629962029686,
"grad_norm": 0.0137939453125,
"learning_rate": 5.12091038406828e-07,
"loss": 0.0001,
"step": 708
},
{
"epoch": 0.9789437348981705,
"grad_norm": 0.044921875,
"learning_rate": 4.836415362731153e-07,
"loss": 0.0002,
"step": 709
},
{
"epoch": 0.9803244735933725,
"grad_norm": 0.1357421875,
"learning_rate": 4.551920341394026e-07,
"loss": 0.0003,
"step": 710
},
{
"epoch": 0.9817052122885744,
"grad_norm": 0.01422119140625,
"learning_rate": 4.267425320056899e-07,
"loss": 0.0001,
"step": 711
},
{
"epoch": 0.9830859509837763,
"grad_norm": 0.040283203125,
"learning_rate": 3.9829302987197725e-07,
"loss": 0.0003,
"step": 712
},
{
"epoch": 0.9844666896789782,
"grad_norm": 0.02734375,
"learning_rate": 3.698435277382646e-07,
"loss": 0.0002,
"step": 713
},
{
"epoch": 0.9858474283741802,
"grad_norm": 0.01422119140625,
"learning_rate": 3.41394025604552e-07,
"loss": 0.0001,
"step": 714
},
{
"epoch": 0.9872281670693821,
"grad_norm": 0.091796875,
"learning_rate": 3.129445234708393e-07,
"loss": 0.0006,
"step": 715
},
{
"epoch": 0.988608905764584,
"grad_norm": 0.01611328125,
"learning_rate": 2.844950213371266e-07,
"loss": 0.0001,
"step": 716
},
{
"epoch": 0.989989644459786,
"grad_norm": 0.054443359375,
"learning_rate": 2.56045519203414e-07,
"loss": 0.0003,
"step": 717
},
{
"epoch": 0.9913703831549879,
"grad_norm": 0.014892578125,
"learning_rate": 2.275960170697013e-07,
"loss": 0.0001,
"step": 718
},
{
"epoch": 0.9927511218501899,
"grad_norm": 0.0137939453125,
"learning_rate": 1.9914651493598863e-07,
"loss": 0.0001,
"step": 719
},
{
"epoch": 0.9941318605453918,
"grad_norm": 0.01361083984375,
"learning_rate": 1.70697012802276e-07,
"loss": 0.0001,
"step": 720
},
{
"epoch": 0.9941318605453918,
"eval_loss": 0.00024354300694540143,
"eval_runtime": 582.0573,
"eval_samples_per_second": 2.213,
"eval_steps_per_second": 2.213,
"step": 720
},
{
"epoch": 0.9955125992405938,
"grad_norm": 0.013671875,
"learning_rate": 1.422475106685633e-07,
"loss": 0.0001,
"step": 721
},
{
"epoch": 0.9968933379357956,
"grad_norm": 0.0133056640625,
"learning_rate": 1.1379800853485066e-07,
"loss": 0.0001,
"step": 722
},
{
"epoch": 0.9982740766309975,
"grad_norm": 0.01318359375,
"learning_rate": 8.5348506401138e-08,
"loss": 0.0001,
"step": 723
},
{
"epoch": 0.9996548153261995,
"grad_norm": 0.0211181640625,
"learning_rate": 5.689900426742533e-08,
"loss": 0.0001,
"step": 724
},
{
"epoch": 1.0,
"grad_norm": 0.003204345703125,
"learning_rate": 2.8449502133712664e-08,
"loss": 0.0,
"step": 725
}
],
"logging_steps": 1,
"max_steps": 725,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 72,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.298593420780503e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}