{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997860047078965,
"eval_steps": 117,
"global_step": 1168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008559811684142949,
"grad_norm": 4.09375,
"learning_rate": 2.5e-06,
"loss": 1.4683,
"step": 1
},
{
"epoch": 0.0008559811684142949,
"eval_loss": 1.6825672388076782,
"eval_runtime": 22.7989,
"eval_samples_per_second": 17.106,
"eval_steps_per_second": 17.106,
"step": 1
},
{
"epoch": 0.0017119623368285898,
"grad_norm": 2.90625,
"learning_rate": 5e-06,
"loss": 1.6305,
"step": 2
},
{
"epoch": 0.0025679435052428845,
"grad_norm": 3.0,
"learning_rate": 7.5e-06,
"loss": 1.6191,
"step": 3
},
{
"epoch": 0.0034239246736571796,
"grad_norm": 2.78125,
"learning_rate": 1e-05,
"loss": 1.6011,
"step": 4
},
{
"epoch": 0.004279905842071475,
"grad_norm": 2.671875,
"learning_rate": 1.25e-05,
"loss": 1.6021,
"step": 5
},
{
"epoch": 0.005135887010485769,
"grad_norm": 2.71875,
"learning_rate": 1.5e-05,
"loss": 1.4842,
"step": 6
},
{
"epoch": 0.005991868178900064,
"grad_norm": 2.3125,
"learning_rate": 1.75e-05,
"loss": 1.718,
"step": 7
},
{
"epoch": 0.006847849347314359,
"grad_norm": 2.328125,
"learning_rate": 2e-05,
"loss": 1.621,
"step": 8
},
{
"epoch": 0.007703830515728654,
"grad_norm": 2.203125,
"learning_rate": 2.25e-05,
"loss": 1.648,
"step": 9
},
{
"epoch": 0.00855981168414295,
"grad_norm": 2.078125,
"learning_rate": 2.5e-05,
"loss": 1.5684,
"step": 10
},
{
"epoch": 0.009415792852557245,
"grad_norm": 2.15625,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.6588,
"step": 11
},
{
"epoch": 0.010271774020971538,
"grad_norm": 2.15625,
"learning_rate": 3e-05,
"loss": 1.5649,
"step": 12
},
{
"epoch": 0.011127755189385833,
"grad_norm": 2.21875,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.5527,
"step": 13
},
{
"epoch": 0.011983736357800128,
"grad_norm": 2.0625,
"learning_rate": 3.5e-05,
"loss": 1.5464,
"step": 14
},
{
"epoch": 0.012839717526214423,
"grad_norm": 2.125,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.7606,
"step": 15
},
{
"epoch": 0.013695698694628718,
"grad_norm": 2.21875,
"learning_rate": 4e-05,
"loss": 1.5089,
"step": 16
},
{
"epoch": 0.014551679863043013,
"grad_norm": 2.109375,
"learning_rate": 4.25e-05,
"loss": 1.5609,
"step": 17
},
{
"epoch": 0.015407661031457309,
"grad_norm": 2.265625,
"learning_rate": 4.5e-05,
"loss": 1.688,
"step": 18
},
{
"epoch": 0.016263642199871604,
"grad_norm": 2.140625,
"learning_rate": 4.75e-05,
"loss": 1.4745,
"step": 19
},
{
"epoch": 0.0171196233682859,
"grad_norm": 2.234375,
"learning_rate": 5e-05,
"loss": 1.6253,
"step": 20
},
{
"epoch": 0.017975604536700194,
"grad_norm": 2.34375,
"learning_rate": 4.999990638925292e-05,
"loss": 1.5921,
"step": 21
},
{
"epoch": 0.01883158570511449,
"grad_norm": 2.109375,
"learning_rate": 4.999962555771271e-05,
"loss": 1.443,
"step": 22
},
{
"epoch": 0.01968756687352878,
"grad_norm": 2.328125,
"learning_rate": 4.999915750748249e-05,
"loss": 1.609,
"step": 23
},
{
"epoch": 0.020543548041943076,
"grad_norm": 2.234375,
"learning_rate": 4.999850224206741e-05,
"loss": 1.6203,
"step": 24
},
{
"epoch": 0.02139952921035737,
"grad_norm": 2.1875,
"learning_rate": 4.999765976637467e-05,
"loss": 1.4801,
"step": 25
},
{
"epoch": 0.022255510378771666,
"grad_norm": 2.265625,
"learning_rate": 4.999663008671344e-05,
"loss": 1.6311,
"step": 26
},
{
"epoch": 0.02311149154718596,
"grad_norm": 2.0625,
"learning_rate": 4.9995413210794864e-05,
"loss": 1.586,
"step": 27
},
{
"epoch": 0.023967472715600256,
"grad_norm": 2.03125,
"learning_rate": 4.999400914773193e-05,
"loss": 1.5281,
"step": 28
},
{
"epoch": 0.02482345388401455,
"grad_norm": 2.0,
"learning_rate": 4.99924179080395e-05,
"loss": 1.589,
"step": 29
},
{
"epoch": 0.025679435052428846,
"grad_norm": 2.09375,
"learning_rate": 4.999063950363413e-05,
"loss": 1.6053,
"step": 30
},
{
"epoch": 0.02653541622084314,
"grad_norm": 2.046875,
"learning_rate": 4.998867394783404e-05,
"loss": 1.4413,
"step": 31
},
{
"epoch": 0.027391397389257437,
"grad_norm": 1.953125,
"learning_rate": 4.9986521255359004e-05,
"loss": 1.5999,
"step": 32
},
{
"epoch": 0.028247378557671732,
"grad_norm": 2.03125,
"learning_rate": 4.998418144233023e-05,
"loss": 1.6345,
"step": 33
},
{
"epoch": 0.029103359726086027,
"grad_norm": 2.3125,
"learning_rate": 4.998165452627025e-05,
"loss": 1.665,
"step": 34
},
{
"epoch": 0.029959340894500322,
"grad_norm": 2.0,
"learning_rate": 4.997894052610279e-05,
"loss": 1.5723,
"step": 35
},
{
"epoch": 0.030815322062914617,
"grad_norm": 2.0,
"learning_rate": 4.997603946215262e-05,
"loss": 1.4505,
"step": 36
},
{
"epoch": 0.03167130323132891,
"grad_norm": 2.015625,
"learning_rate": 4.997295135614539e-05,
"loss": 1.5724,
"step": 37
},
{
"epoch": 0.03252728439974321,
"grad_norm": 2.140625,
"learning_rate": 4.9969676231207494e-05,
"loss": 1.6605,
"step": 38
},
{
"epoch": 0.0333832655681575,
"grad_norm": 1.984375,
"learning_rate": 4.996621411186589e-05,
"loss": 1.5345,
"step": 39
},
{
"epoch": 0.0342392467365718,
"grad_norm": 1.9453125,
"learning_rate": 4.99625650240479e-05,
"loss": 1.6665,
"step": 40
},
{
"epoch": 0.03509522790498609,
"grad_norm": 2.0625,
"learning_rate": 4.995872899508103e-05,
"loss": 1.5707,
"step": 41
},
{
"epoch": 0.03595120907340039,
"grad_norm": 2.140625,
"learning_rate": 4.995470605369277e-05,
"loss": 1.652,
"step": 42
},
{
"epoch": 0.03680719024181468,
"grad_norm": 1.9453125,
"learning_rate": 4.995049623001036e-05,
"loss": 1.3974,
"step": 43
},
{
"epoch": 0.03766317141022898,
"grad_norm": 1.9296875,
"learning_rate": 4.9946099555560565e-05,
"loss": 1.613,
"step": 44
},
{
"epoch": 0.03851915257864327,
"grad_norm": 1.9921875,
"learning_rate": 4.994151606326949e-05,
"loss": 1.5067,
"step": 45
},
{
"epoch": 0.03937513374705756,
"grad_norm": 1.953125,
"learning_rate": 4.993674578746225e-05,
"loss": 1.5115,
"step": 46
},
{
"epoch": 0.040231114915471856,
"grad_norm": 2.234375,
"learning_rate": 4.993178876386278e-05,
"loss": 1.6309,
"step": 47
},
{
"epoch": 0.04108709608388615,
"grad_norm": 2.046875,
"learning_rate": 4.992664502959351e-05,
"loss": 1.6605,
"step": 48
},
{
"epoch": 0.04194307725230045,
"grad_norm": 2.0625,
"learning_rate": 4.9921314623175174e-05,
"loss": 1.5052,
"step": 49
},
{
"epoch": 0.04279905842071474,
"grad_norm": 2.0625,
"learning_rate": 4.991579758452644e-05,
"loss": 1.5388,
"step": 50
},
{
"epoch": 0.04365503958912904,
"grad_norm": 1.9140625,
"learning_rate": 4.99100939549636e-05,
"loss": 1.6761,
"step": 51
},
{
"epoch": 0.04451102075754333,
"grad_norm": 2.109375,
"learning_rate": 4.990420377720038e-05,
"loss": 1.6295,
"step": 52
},
{
"epoch": 0.04536700192595763,
"grad_norm": 1.953125,
"learning_rate": 4.9898127095347466e-05,
"loss": 1.5579,
"step": 53
},
{
"epoch": 0.04622298309437192,
"grad_norm": 1.8984375,
"learning_rate": 4.989186395491229e-05,
"loss": 1.4967,
"step": 54
},
{
"epoch": 0.04707896426278622,
"grad_norm": 1.921875,
"learning_rate": 4.9885414402798624e-05,
"loss": 1.4205,
"step": 55
},
{
"epoch": 0.04793494543120051,
"grad_norm": 2.03125,
"learning_rate": 4.987877848730627e-05,
"loss": 1.6522,
"step": 56
},
{
"epoch": 0.04879092659961481,
"grad_norm": 1.90625,
"learning_rate": 4.987195625813066e-05,
"loss": 1.5241,
"step": 57
},
{
"epoch": 0.0496469077680291,
"grad_norm": 2.09375,
"learning_rate": 4.9864947766362505e-05,
"loss": 1.548,
"step": 58
},
{
"epoch": 0.0505028889364434,
"grad_norm": 1.8984375,
"learning_rate": 4.985775306448743e-05,
"loss": 1.5058,
"step": 59
},
{
"epoch": 0.05135887010485769,
"grad_norm": 1.9765625,
"learning_rate": 4.985037220638555e-05,
"loss": 1.6028,
"step": 60
},
{
"epoch": 0.05221485127327199,
"grad_norm": 1.984375,
"learning_rate": 4.984280524733107e-05,
"loss": 1.5308,
"step": 61
},
{
"epoch": 0.05307083244168628,
"grad_norm": 2.0,
"learning_rate": 4.9835052243991874e-05,
"loss": 1.5042,
"step": 62
},
{
"epoch": 0.05392681361010058,
"grad_norm": 2.15625,
"learning_rate": 4.982711325442914e-05,
"loss": 1.5008,
"step": 63
},
{
"epoch": 0.05478279477851487,
"grad_norm": 2.109375,
"learning_rate": 4.981898833809683e-05,
"loss": 1.6986,
"step": 64
},
{
"epoch": 0.05563877594692917,
"grad_norm": 1.890625,
"learning_rate": 4.9810677555841314e-05,
"loss": 1.651,
"step": 65
},
{
"epoch": 0.056494757115343464,
"grad_norm": 1.9921875,
"learning_rate": 4.980218096990087e-05,
"loss": 1.5315,
"step": 66
},
{
"epoch": 0.05735073828375776,
"grad_norm": 1.84375,
"learning_rate": 4.9793498643905236e-05,
"loss": 1.5917,
"step": 67
},
{
"epoch": 0.058206719452172054,
"grad_norm": 1.9140625,
"learning_rate": 4.978463064287513e-05,
"loss": 1.5897,
"step": 68
},
{
"epoch": 0.05906270062058635,
"grad_norm": 2.453125,
"learning_rate": 4.977557703322178e-05,
"loss": 1.5924,
"step": 69
},
{
"epoch": 0.059918681789000644,
"grad_norm": 2.0625,
"learning_rate": 4.97663378827464e-05,
"loss": 1.5634,
"step": 70
},
{
"epoch": 0.06077466295741494,
"grad_norm": 2.171875,
"learning_rate": 4.9756913260639675e-05,
"loss": 1.5397,
"step": 71
},
{
"epoch": 0.061630644125829234,
"grad_norm": 1.90625,
"learning_rate": 4.974730323748129e-05,
"loss": 1.6735,
"step": 72
},
{
"epoch": 0.06248662529424353,
"grad_norm": 1.921875,
"learning_rate": 4.9737507885239366e-05,
"loss": 1.4538,
"step": 73
},
{
"epoch": 0.06334260646265782,
"grad_norm": 1.90625,
"learning_rate": 4.9727527277269915e-05,
"loss": 1.4092,
"step": 74
},
{
"epoch": 0.06419858763107211,
"grad_norm": 1.890625,
"learning_rate": 4.97173614883163e-05,
"loss": 1.5579,
"step": 75
},
{
"epoch": 0.06505456879948641,
"grad_norm": 1.8515625,
"learning_rate": 4.970701059450872e-05,
"loss": 1.6395,
"step": 76
},
{
"epoch": 0.0659105499679007,
"grad_norm": 1.8125,
"learning_rate": 4.9696474673363536e-05,
"loss": 1.4457,
"step": 77
},
{
"epoch": 0.066766531136315,
"grad_norm": 2.109375,
"learning_rate": 4.96857538037828e-05,
"loss": 1.5416,
"step": 78
},
{
"epoch": 0.0676225123047293,
"grad_norm": 1.921875,
"learning_rate": 4.9674848066053586e-05,
"loss": 1.4792,
"step": 79
},
{
"epoch": 0.0684784934731436,
"grad_norm": 1.9296875,
"learning_rate": 4.966375754184746e-05,
"loss": 1.467,
"step": 80
},
{
"epoch": 0.06933447464155788,
"grad_norm": 2.03125,
"learning_rate": 4.965248231421977e-05,
"loss": 1.6674,
"step": 81
},
{
"epoch": 0.07019045580997219,
"grad_norm": 2.015625,
"learning_rate": 4.964102246760914e-05,
"loss": 1.473,
"step": 82
},
{
"epoch": 0.07104643697838647,
"grad_norm": 2.03125,
"learning_rate": 4.962937808783675e-05,
"loss": 1.61,
"step": 83
},
{
"epoch": 0.07190241814680078,
"grad_norm": 1.875,
"learning_rate": 4.9617549262105724e-05,
"loss": 1.5847,
"step": 84
},
{
"epoch": 0.07275839931521506,
"grad_norm": 2.15625,
"learning_rate": 4.9605536079000476e-05,
"loss": 1.7443,
"step": 85
},
{
"epoch": 0.07361438048362937,
"grad_norm": 1.8359375,
"learning_rate": 4.9593338628486055e-05,
"loss": 1.5063,
"step": 86
},
{
"epoch": 0.07447036165204365,
"grad_norm": 1.9765625,
"learning_rate": 4.9580957001907445e-05,
"loss": 1.6636,
"step": 87
},
{
"epoch": 0.07532634282045796,
"grad_norm": 1.9921875,
"learning_rate": 4.9568391291988927e-05,
"loss": 1.6315,
"step": 88
},
{
"epoch": 0.07618232398887224,
"grad_norm": 1.8828125,
"learning_rate": 4.9555641592833334e-05,
"loss": 1.5544,
"step": 89
},
{
"epoch": 0.07703830515728655,
"grad_norm": 1.8203125,
"learning_rate": 4.954270799992138e-05,
"loss": 1.4513,
"step": 90
},
{
"epoch": 0.07789428632570083,
"grad_norm": 2.25,
"learning_rate": 4.9529590610110914e-05,
"loss": 1.5529,
"step": 91
},
{
"epoch": 0.07875026749411512,
"grad_norm": 1.796875,
"learning_rate": 4.9516289521636244e-05,
"loss": 1.3935,
"step": 92
},
{
"epoch": 0.07960624866252942,
"grad_norm": 1.9140625,
"learning_rate": 4.9502804834107354e-05,
"loss": 1.5309,
"step": 93
},
{
"epoch": 0.08046222983094371,
"grad_norm": 1.953125,
"learning_rate": 4.948913664850917e-05,
"loss": 1.5814,
"step": 94
},
{
"epoch": 0.08131821099935801,
"grad_norm": 1.8671875,
"learning_rate": 4.947528506720082e-05,
"loss": 1.5933,
"step": 95
},
{
"epoch": 0.0821741921677723,
"grad_norm": 1.8046875,
"learning_rate": 4.946125019391486e-05,
"loss": 1.4894,
"step": 96
},
{
"epoch": 0.0830301733361866,
"grad_norm": 1.828125,
"learning_rate": 4.944703213375648e-05,
"loss": 1.5702,
"step": 97
},
{
"epoch": 0.0838861545046009,
"grad_norm": 1.8828125,
"learning_rate": 4.943263099320275e-05,
"loss": 1.6595,
"step": 98
},
{
"epoch": 0.0847421356730152,
"grad_norm": 1.9375,
"learning_rate": 4.941804688010178e-05,
"loss": 1.5197,
"step": 99
},
{
"epoch": 0.08559811684142948,
"grad_norm": 2.015625,
"learning_rate": 4.940327990367195e-05,
"loss": 1.6567,
"step": 100
},
{
"epoch": 0.08645409800984379,
"grad_norm": 2.125,
"learning_rate": 4.938833017450108e-05,
"loss": 1.6511,
"step": 101
},
{
"epoch": 0.08731007917825807,
"grad_norm": 1.90625,
"learning_rate": 4.937319780454559e-05,
"loss": 1.6058,
"step": 102
},
{
"epoch": 0.08816606034667238,
"grad_norm": 1.984375,
"learning_rate": 4.9357882907129685e-05,
"loss": 1.5673,
"step": 103
},
{
"epoch": 0.08902204151508666,
"grad_norm": 1.96875,
"learning_rate": 4.934238559694448e-05,
"loss": 1.5504,
"step": 104
},
{
"epoch": 0.08987802268350097,
"grad_norm": 2.015625,
"learning_rate": 4.932670599004715e-05,
"loss": 1.5693,
"step": 105
},
{
"epoch": 0.09073400385191525,
"grad_norm": 2.484375,
"learning_rate": 4.9310844203860084e-05,
"loss": 1.5945,
"step": 106
},
{
"epoch": 0.09158998502032956,
"grad_norm": 1.96875,
"learning_rate": 4.929480035716997e-05,
"loss": 1.6466,
"step": 107
},
{
"epoch": 0.09244596618874384,
"grad_norm": 1.921875,
"learning_rate": 4.927857457012692e-05,
"loss": 1.6873,
"step": 108
},
{
"epoch": 0.09330194735715815,
"grad_norm": 1.9375,
"learning_rate": 4.9262166964243596e-05,
"loss": 1.7084,
"step": 109
},
{
"epoch": 0.09415792852557243,
"grad_norm": 1.7421875,
"learning_rate": 4.924557766239423e-05,
"loss": 1.4966,
"step": 110
},
{
"epoch": 0.09501390969398674,
"grad_norm": 1.8984375,
"learning_rate": 4.92288067888138e-05,
"loss": 1.5114,
"step": 111
},
{
"epoch": 0.09586989086240102,
"grad_norm": 2.046875,
"learning_rate": 4.921185446909702e-05,
"loss": 1.5532,
"step": 112
},
{
"epoch": 0.09672587203081533,
"grad_norm": 1.9609375,
"learning_rate": 4.919472083019743e-05,
"loss": 1.6787,
"step": 113
},
{
"epoch": 0.09758185319922962,
"grad_norm": 1.8359375,
"learning_rate": 4.917740600042645e-05,
"loss": 1.4609,
"step": 114
},
{
"epoch": 0.09843783436764392,
"grad_norm": 1.8203125,
"learning_rate": 4.915991010945241e-05,
"loss": 1.4925,
"step": 115
},
{
"epoch": 0.0992938155360582,
"grad_norm": 1.84375,
"learning_rate": 4.914223328829959e-05,
"loss": 1.5845,
"step": 116
},
{
"epoch": 0.10014979670447251,
"grad_norm": 2.09375,
"learning_rate": 4.912437566934723e-05,
"loss": 1.7777,
"step": 117
},
{
"epoch": 0.10014979670447251,
"eval_loss": 1.627388596534729,
"eval_runtime": 21.3696,
"eval_samples_per_second": 18.25,
"eval_steps_per_second": 18.25,
"step": 117
},
{
"epoch": 0.1010057778728868,
"grad_norm": 1.75,
"learning_rate": 4.9106337386328524e-05,
"loss": 1.6118,
"step": 118
},
{
"epoch": 0.1018617590413011,
"grad_norm": 1.859375,
"learning_rate": 4.908811857432965e-05,
"loss": 1.5514,
"step": 119
},
{
"epoch": 0.10271774020971539,
"grad_norm": 1.7734375,
"learning_rate": 4.9069719369788734e-05,
"loss": 1.5689,
"step": 120
},
{
"epoch": 0.10357372137812969,
"grad_norm": 1.90625,
"learning_rate": 4.905113991049484e-05,
"loss": 1.564,
"step": 121
},
{
"epoch": 0.10442970254654398,
"grad_norm": 1.96875,
"learning_rate": 4.903238033558692e-05,
"loss": 1.6917,
"step": 122
},
{
"epoch": 0.10528568371495826,
"grad_norm": 1.8046875,
"learning_rate": 4.901344078555282e-05,
"loss": 1.4474,
"step": 123
},
{
"epoch": 0.10614166488337257,
"grad_norm": 1.9921875,
"learning_rate": 4.899432140222816e-05,
"loss": 1.6063,
"step": 124
},
{
"epoch": 0.10699764605178685,
"grad_norm": 1.6328125,
"learning_rate": 4.8975022328795325e-05,
"loss": 1.5834,
"step": 125
},
{
"epoch": 0.10785362722020116,
"grad_norm": 1.921875,
"learning_rate": 4.895554370978238e-05,
"loss": 1.6613,
"step": 126
},
{
"epoch": 0.10870960838861544,
"grad_norm": 1.9140625,
"learning_rate": 4.893588569106195e-05,
"loss": 1.6858,
"step": 127
},
{
"epoch": 0.10956558955702975,
"grad_norm": 1.9609375,
"learning_rate": 4.89160484198502e-05,
"loss": 1.6384,
"step": 128
},
{
"epoch": 0.11042157072544403,
"grad_norm": 1.8671875,
"learning_rate": 4.8896032044705655e-05,
"loss": 1.5923,
"step": 129
},
{
"epoch": 0.11127755189385834,
"grad_norm": 2.0625,
"learning_rate": 4.887583671552816e-05,
"loss": 1.5658,
"step": 130
},
{
"epoch": 0.11213353306227263,
"grad_norm": 1.7109375,
"learning_rate": 4.885546258355769e-05,
"loss": 1.4684,
"step": 131
},
{
"epoch": 0.11298951423068693,
"grad_norm": 1.8203125,
"learning_rate": 4.8834909801373264e-05,
"loss": 1.5512,
"step": 132
},
{
"epoch": 0.11384549539910122,
"grad_norm": 1.7734375,
"learning_rate": 4.881417852289179e-05,
"loss": 1.5687,
"step": 133
},
{
"epoch": 0.11470147656751552,
"grad_norm": 2.109375,
"learning_rate": 4.8793268903366905e-05,
"loss": 1.6813,
"step": 134
},
{
"epoch": 0.1155574577359298,
"grad_norm": 1.75,
"learning_rate": 4.877218109938781e-05,
"loss": 1.4457,
"step": 135
},
{
"epoch": 0.11641343890434411,
"grad_norm": 1.828125,
"learning_rate": 4.875091526887813e-05,
"loss": 1.6283,
"step": 136
},
{
"epoch": 0.1172694200727584,
"grad_norm": 1.8203125,
"learning_rate": 4.872947157109467e-05,
"loss": 1.5411,
"step": 137
},
{
"epoch": 0.1181254012411727,
"grad_norm": 1.796875,
"learning_rate": 4.8707850166626266e-05,
"loss": 1.5107,
"step": 138
},
{
"epoch": 0.11898138240958699,
"grad_norm": 1.78125,
"learning_rate": 4.8686051217392606e-05,
"loss": 1.404,
"step": 139
},
{
"epoch": 0.11983736357800129,
"grad_norm": 2.15625,
"learning_rate": 4.866407488664296e-05,
"loss": 1.5754,
"step": 140
},
{
"epoch": 0.12069334474641558,
"grad_norm": 1.8984375,
"learning_rate": 4.864192133895498e-05,
"loss": 1.5735,
"step": 141
},
{
"epoch": 0.12154932591482988,
"grad_norm": 2.03125,
"learning_rate": 4.861959074023348e-05,
"loss": 1.5884,
"step": 142
},
{
"epoch": 0.12240530708324417,
"grad_norm": 1.9375,
"learning_rate": 4.8597083257709194e-05,
"loss": 1.5551,
"step": 143
},
{
"epoch": 0.12326128825165847,
"grad_norm": 1.8359375,
"learning_rate": 4.857439905993748e-05,
"loss": 1.4693,
"step": 144
},
{
"epoch": 0.12411726942007276,
"grad_norm": 1.859375,
"learning_rate": 4.855153831679713e-05,
"loss": 1.6085,
"step": 145
},
{
"epoch": 0.12497325058848706,
"grad_norm": 1.8203125,
"learning_rate": 4.852850119948904e-05,
"loss": 1.4771,
"step": 146
},
{
"epoch": 0.12582923175690136,
"grad_norm": 1.7578125,
"learning_rate": 4.850528788053495e-05,
"loss": 1.4144,
"step": 147
},
{
"epoch": 0.12668521292531565,
"grad_norm": 1.8203125,
"learning_rate": 4.848189853377615e-05,
"loss": 1.3908,
"step": 148
},
{
"epoch": 0.12754119409372994,
"grad_norm": 2.0,
"learning_rate": 4.8458333334372185e-05,
"loss": 1.6438,
"step": 149
},
{
"epoch": 0.12839717526214423,
"grad_norm": 1.7421875,
"learning_rate": 4.843459245879951e-05,
"loss": 1.5459,
"step": 150
},
{
"epoch": 0.12925315643055854,
"grad_norm": 1.78125,
"learning_rate": 4.841067608485024e-05,
"loss": 1.4941,
"step": 151
},
{
"epoch": 0.13010913759897283,
"grad_norm": 1.6953125,
"learning_rate": 4.8386584391630716e-05,
"loss": 1.3663,
"step": 152
},
{
"epoch": 0.13096511876738712,
"grad_norm": 1.7734375,
"learning_rate": 4.8362317559560274e-05,
"loss": 1.4986,
"step": 153
},
{
"epoch": 0.1318210999358014,
"grad_norm": 1.65625,
"learning_rate": 4.833787577036981e-05,
"loss": 1.4611,
"step": 154
},
{
"epoch": 0.1326770811042157,
"grad_norm": 1.9375,
"learning_rate": 4.831325920710045e-05,
"loss": 1.6472,
"step": 155
},
{
"epoch": 0.13353306227263,
"grad_norm": 1.7265625,
"learning_rate": 4.8288468054102186e-05,
"loss": 1.5,
"step": 156
},
{
"epoch": 0.1343890434410443,
"grad_norm": 1.8125,
"learning_rate": 4.8263502497032484e-05,
"loss": 1.4545,
"step": 157
},
{
"epoch": 0.1352450246094586,
"grad_norm": 1.703125,
"learning_rate": 4.823836272285491e-05,
"loss": 1.5297,
"step": 158
},
{
"epoch": 0.13610100577787287,
"grad_norm": 2.078125,
"learning_rate": 4.82130489198377e-05,
"loss": 1.5259,
"step": 159
},
{
"epoch": 0.1369569869462872,
"grad_norm": 1.8984375,
"learning_rate": 4.8187561277552374e-05,
"loss": 1.554,
"step": 160
},
{
"epoch": 0.13781296811470148,
"grad_norm": 1.8984375,
"learning_rate": 4.816189998687231e-05,
"loss": 1.6408,
"step": 161
},
{
"epoch": 0.13866894928311577,
"grad_norm": 1.96875,
"learning_rate": 4.813606523997132e-05,
"loss": 1.5234,
"step": 162
},
{
"epoch": 0.13952493045153005,
"grad_norm": 1.7890625,
"learning_rate": 4.811005723032219e-05,
"loss": 1.4525,
"step": 163
},
{
"epoch": 0.14038091161994437,
"grad_norm": 1.7578125,
"learning_rate": 4.808387615269528e-05,
"loss": 1.5951,
"step": 164
},
{
"epoch": 0.14123689278835866,
"grad_norm": 1.7421875,
"learning_rate": 4.805752220315699e-05,
"loss": 1.3059,
"step": 165
},
{
"epoch": 0.14209287395677295,
"grad_norm": 1.9140625,
"learning_rate": 4.8030995579068356e-05,
"loss": 1.5359,
"step": 166
},
{
"epoch": 0.14294885512518724,
"grad_norm": 1.734375,
"learning_rate": 4.800429647908354e-05,
"loss": 1.5332,
"step": 167
},
{
"epoch": 0.14380483629360155,
"grad_norm": 1.796875,
"learning_rate": 4.797742510314838e-05,
"loss": 1.5602,
"step": 168
},
{
"epoch": 0.14466081746201584,
"grad_norm": 1.7734375,
"learning_rate": 4.7950381652498816e-05,
"loss": 1.5634,
"step": 169
},
{
"epoch": 0.14551679863043013,
"grad_norm": 1.734375,
"learning_rate": 4.7923166329659466e-05,
"loss": 1.5805,
"step": 170
},
{
"epoch": 0.14637277979884442,
"grad_norm": 1.875,
"learning_rate": 4.7895779338442076e-05,
"loss": 1.5187,
"step": 171
},
{
"epoch": 0.14722876096725873,
"grad_norm": 1.8515625,
"learning_rate": 4.786822088394397e-05,
"loss": 1.664,
"step": 172
},
{
"epoch": 0.14808474213567302,
"grad_norm": 1.9765625,
"learning_rate": 4.784049117254656e-05,
"loss": 1.6186,
"step": 173
},
{
"epoch": 0.1489407233040873,
"grad_norm": 1.65625,
"learning_rate": 4.781259041191375e-05,
"loss": 1.4065,
"step": 174
},
{
"epoch": 0.1497967044725016,
"grad_norm": 1.8828125,
"learning_rate": 4.778451881099044e-05,
"loss": 1.652,
"step": 175
},
{
"epoch": 0.1506526856409159,
"grad_norm": 1.734375,
"learning_rate": 4.775627658000091e-05,
"loss": 1.4527,
"step": 176
},
{
"epoch": 0.1515086668093302,
"grad_norm": 1.8828125,
"learning_rate": 4.772786393044726e-05,
"loss": 1.4748,
"step": 177
},
{
"epoch": 0.1523646479777445,
"grad_norm": 1.9609375,
"learning_rate": 4.7699281075107835e-05,
"loss": 1.6003,
"step": 178
},
{
"epoch": 0.15322062914615878,
"grad_norm": 1.8359375,
"learning_rate": 4.767052822803565e-05,
"loss": 1.6305,
"step": 179
},
{
"epoch": 0.1540766103145731,
"grad_norm": 1.765625,
"learning_rate": 4.764160560455673e-05,
"loss": 1.3937,
"step": 180
},
{
"epoch": 0.15493259148298738,
"grad_norm": 1.8125,
"learning_rate": 4.7612513421268544e-05,
"loss": 1.4548,
"step": 181
},
{
"epoch": 0.15578857265140167,
"grad_norm": 1.8828125,
"learning_rate": 4.7583251896038386e-05,
"loss": 1.4323,
"step": 182
},
{
"epoch": 0.15664455381981596,
"grad_norm": 1.7890625,
"learning_rate": 4.7553821248001695e-05,
"loss": 1.4816,
"step": 183
},
{
"epoch": 0.15750053498823025,
"grad_norm": 1.75,
"learning_rate": 4.752422169756048e-05,
"loss": 1.4022,
"step": 184
},
{
"epoch": 0.15835651615664456,
"grad_norm": 1.7890625,
"learning_rate": 4.749445346638163e-05,
"loss": 1.5193,
"step": 185
},
{
"epoch": 0.15921249732505885,
"grad_norm": 1.9921875,
"learning_rate": 4.7464516777395234e-05,
"loss": 1.589,
"step": 186
},
{
"epoch": 0.16006847849347314,
"grad_norm": 1.703125,
"learning_rate": 4.743441185479297e-05,
"loss": 1.4739,
"step": 187
},
{
"epoch": 0.16092445966188743,
"grad_norm": 1.765625,
"learning_rate": 4.740413892402639e-05,
"loss": 1.4312,
"step": 188
},
{
"epoch": 0.16178044083030174,
"grad_norm": 1.984375,
"learning_rate": 4.7373698211805215e-05,
"loss": 1.677,
"step": 189
},
{
"epoch": 0.16263642199871603,
"grad_norm": 1.6953125,
"learning_rate": 4.7343089946095674e-05,
"loss": 1.6992,
"step": 190
},
{
"epoch": 0.16349240316713032,
"grad_norm": 1.8125,
"learning_rate": 4.7312314356118776e-05,
"loss": 1.5619,
"step": 191
},
{
"epoch": 0.1643483843355446,
"grad_norm": 1.671875,
"learning_rate": 4.7281371672348595e-05,
"loss": 1.6068,
"step": 192
},
{
"epoch": 0.16520436550395892,
"grad_norm": 1.78125,
"learning_rate": 4.725026212651056e-05,
"loss": 1.6795,
"step": 193
},
{
"epoch": 0.1660603466723732,
"grad_norm": 1.671875,
"learning_rate": 4.7218985951579685e-05,
"loss": 1.6281,
"step": 194
},
{
"epoch": 0.1669163278407875,
"grad_norm": 1.6171875,
"learning_rate": 4.7187543381778864e-05,
"loss": 1.4485,
"step": 195
},
{
"epoch": 0.1677723090092018,
"grad_norm": 1.875,
"learning_rate": 4.715593465257709e-05,
"loss": 1.5356,
"step": 196
},
{
"epoch": 0.1686282901776161,
"grad_norm": 2.0,
"learning_rate": 4.712416000068771e-05,
"loss": 1.6105,
"step": 197
},
{
"epoch": 0.1694842713460304,
"grad_norm": 2.171875,
"learning_rate": 4.7092219664066636e-05,
"loss": 1.7753,
"step": 198
},
{
"epoch": 0.17034025251444468,
"grad_norm": 1.8671875,
"learning_rate": 4.706011388191057e-05,
"loss": 1.6989,
"step": 199
},
{
"epoch": 0.17119623368285897,
"grad_norm": 1.625,
"learning_rate": 4.7027842894655205e-05,
"loss": 1.5058,
"step": 200
},
{
"epoch": 0.17205221485127328,
"grad_norm": 1.78125,
"learning_rate": 4.699540694397343e-05,
"loss": 1.6399,
"step": 201
},
{
"epoch": 0.17290819601968757,
"grad_norm": 1.828125,
"learning_rate": 4.6962806272773564e-05,
"loss": 1.491,
"step": 202
},
{
"epoch": 0.17376417718810186,
"grad_norm": 1.8984375,
"learning_rate": 4.693004112519743e-05,
"loss": 1.5155,
"step": 203
},
{
"epoch": 0.17462015835651615,
"grad_norm": 1.7421875,
"learning_rate": 4.689711174661864e-05,
"loss": 1.4796,
"step": 204
},
{
"epoch": 0.17547613952493046,
"grad_norm": 2.125,
"learning_rate": 4.686401838364068e-05,
"loss": 1.5699,
"step": 205
},
{
"epoch": 0.17633212069334475,
"grad_norm": 1.765625,
"learning_rate": 4.683076128409512e-05,
"loss": 1.5628,
"step": 206
},
{
"epoch": 0.17718810186175904,
"grad_norm": 1.703125,
"learning_rate": 4.6797340697039705e-05,
"loss": 1.5281,
"step": 207
},
{
"epoch": 0.17804408303017333,
"grad_norm": 1.6875,
"learning_rate": 4.6763756872756525e-05,
"loss": 1.5223,
"step": 208
},
{
"epoch": 0.17890006419858764,
"grad_norm": 1.6953125,
"learning_rate": 4.6730010062750134e-05,
"loss": 1.5561,
"step": 209
},
{
"epoch": 0.17975604536700193,
"grad_norm": 1.78125,
"learning_rate": 4.669610051974566e-05,
"loss": 1.3003,
"step": 210
},
{
"epoch": 0.18061202653541622,
"grad_norm": 1.859375,
"learning_rate": 4.6662028497686905e-05,
"loss": 1.5831,
"step": 211
},
{
"epoch": 0.1814680077038305,
"grad_norm": 1.9765625,
"learning_rate": 4.662779425173448e-05,
"loss": 1.4068,
"step": 212
},
{
"epoch": 0.18232398887224482,
"grad_norm": 1.7890625,
"learning_rate": 4.659339803826384e-05,
"loss": 1.2956,
"step": 213
},
{
"epoch": 0.1831799700406591,
"grad_norm": 1.734375,
"learning_rate": 4.655884011486341e-05,
"loss": 1.4742,
"step": 214
},
{
"epoch": 0.1840359512090734,
"grad_norm": 1.734375,
"learning_rate": 4.652412074033263e-05,
"loss": 1.4319,
"step": 215
},
{
"epoch": 0.1848919323774877,
"grad_norm": 1.765625,
"learning_rate": 4.648924017468003e-05,
"loss": 1.4521,
"step": 216
},
{
"epoch": 0.18574791354590198,
"grad_norm": 1.84375,
"learning_rate": 4.645419867912128e-05,
"loss": 1.5488,
"step": 217
},
{
"epoch": 0.1866038947143163,
"grad_norm": 1.8359375,
"learning_rate": 4.6418996516077205e-05,
"loss": 1.6545,
"step": 218
},
{
"epoch": 0.18745987588273058,
"grad_norm": 1.75,
"learning_rate": 4.6383633949171884e-05,
"loss": 1.5419,
"step": 219
},
{
"epoch": 0.18831585705114487,
"grad_norm": 1.640625,
"learning_rate": 4.634811124323062e-05,
"loss": 1.4832,
"step": 220
},
{
"epoch": 0.18917183821955916,
"grad_norm": 1.734375,
"learning_rate": 4.6312428664277976e-05,
"loss": 1.6318,
"step": 221
},
{
"epoch": 0.19002781938797347,
"grad_norm": 1.796875,
"learning_rate": 4.627658647953579e-05,
"loss": 1.4994,
"step": 222
},
{
"epoch": 0.19088380055638776,
"grad_norm": 1.8828125,
"learning_rate": 4.624058495742114e-05,
"loss": 1.5991,
"step": 223
},
{
"epoch": 0.19173978172480205,
"grad_norm": 1.6484375,
"learning_rate": 4.620442436754438e-05,
"loss": 1.4461,
"step": 224
},
{
"epoch": 0.19259576289321634,
"grad_norm": 1.71875,
"learning_rate": 4.6168104980707107e-05,
"loss": 1.5396,
"step": 225
},
{
"epoch": 0.19345174406163065,
"grad_norm": 1.71875,
"learning_rate": 4.613162706890011e-05,
"loss": 1.4974,
"step": 226
},
{
"epoch": 0.19430772523004494,
"grad_norm": 1.9140625,
"learning_rate": 4.609499090530136e-05,
"loss": 1.6796,
"step": 227
},
{
"epoch": 0.19516370639845923,
"grad_norm": 1.6953125,
"learning_rate": 4.605819676427393e-05,
"loss": 1.4685,
"step": 228
},
{
"epoch": 0.19601968756687352,
"grad_norm": 1.8671875,
"learning_rate": 4.602124492136401e-05,
"loss": 1.5252,
"step": 229
},
{
"epoch": 0.19687566873528783,
"grad_norm": 1.765625,
"learning_rate": 4.598413565329875e-05,
"loss": 1.5882,
"step": 230
},
{
"epoch": 0.19773164990370212,
"grad_norm": 1.8203125,
"learning_rate": 4.594686923798426e-05,
"loss": 1.5452,
"step": 231
},
{
"epoch": 0.1985876310721164,
"grad_norm": 2.03125,
"learning_rate": 4.5909445954503506e-05,
"loss": 1.5358,
"step": 232
},
{
"epoch": 0.1994436122405307,
"grad_norm": 1.8203125,
"learning_rate": 4.5871866083114204e-05,
"loss": 1.6252,
"step": 233
},
{
"epoch": 0.20029959340894501,
"grad_norm": 1.8203125,
"learning_rate": 4.5834129905246725e-05,
"loss": 1.4701,
"step": 234
},
{
"epoch": 0.20029959340894501,
"eval_loss": 1.6031174659729004,
"eval_runtime": 21.3555,
"eval_samples_per_second": 18.262,
"eval_steps_per_second": 18.262,
"step": 234
},
{
"epoch": 0.2011555745773593,
"grad_norm": 1.734375,
"learning_rate": 4.5796237703502044e-05,
"loss": 1.6016,
"step": 235
},
{
"epoch": 0.2020115557457736,
"grad_norm": 1.578125,
"learning_rate": 4.5758189761649514e-05,
"loss": 1.5205,
"step": 236
},
{
"epoch": 0.20286753691418788,
"grad_norm": 1.7109375,
"learning_rate": 4.5719986364624866e-05,
"loss": 1.4364,
"step": 237
},
{
"epoch": 0.2037235180826022,
"grad_norm": 2.171875,
"learning_rate": 4.5681627798527965e-05,
"loss": 1.254,
"step": 238
},
{
"epoch": 0.20457949925101648,
"grad_norm": 1.9296875,
"learning_rate": 4.564311435062074e-05,
"loss": 1.5015,
"step": 239
},
{
"epoch": 0.20543548041943077,
"grad_norm": 1.828125,
"learning_rate": 4.5604446309324986e-05,
"loss": 1.3402,
"step": 240
},
{
"epoch": 0.20629146158784506,
"grad_norm": 1.71875,
"learning_rate": 4.5565623964220266e-05,
"loss": 1.436,
"step": 241
},
{
"epoch": 0.20714744275625938,
"grad_norm": 1.7734375,
"learning_rate": 4.5526647606041666e-05,
"loss": 1.6074,
"step": 242
},
{
"epoch": 0.20800342392467366,
"grad_norm": 1.921875,
"learning_rate": 4.548751752667767e-05,
"loss": 1.5374,
"step": 243
},
{
"epoch": 0.20885940509308795,
"grad_norm": 1.828125,
"learning_rate": 4.5448234019167945e-05,
"loss": 1.4411,
"step": 244
},
{
"epoch": 0.20971538626150224,
"grad_norm": 1.6640625,
"learning_rate": 4.5408797377701176e-05,
"loss": 1.4943,
"step": 245
},
{
"epoch": 0.21057136742991653,
"grad_norm": 1.7890625,
"learning_rate": 4.5369207897612854e-05,
"loss": 1.567,
"step": 246
},
{
"epoch": 0.21142734859833084,
"grad_norm": 1.75,
"learning_rate": 4.532946587538302e-05,
"loss": 1.587,
"step": 247
},
{
"epoch": 0.21228332976674513,
"grad_norm": 1.6484375,
"learning_rate": 4.5289571608634116e-05,
"loss": 1.4585,
"step": 248
},
{
"epoch": 0.21313931093515942,
"grad_norm": 1.6484375,
"learning_rate": 4.524952539612872e-05,
"loss": 1.5406,
"step": 249
},
{
"epoch": 0.2139952921035737,
"grad_norm": 1.8671875,
"learning_rate": 4.5209327537767295e-05,
"loss": 1.4958,
"step": 250
},
{
"epoch": 0.21485127327198802,
"grad_norm": 2.015625,
"learning_rate": 4.5168978334585956e-05,
"loss": 1.6202,
"step": 251
},
{
"epoch": 0.2157072544404023,
"grad_norm": 1.7578125,
"learning_rate": 4.512847808875424e-05,
"loss": 1.5408,
"step": 252
},
{
"epoch": 0.2165632356088166,
"grad_norm": 1.953125,
"learning_rate": 4.5087827103572796e-05,
"loss": 1.6394,
"step": 253
},
{
"epoch": 0.2174192167772309,
"grad_norm": 1.6484375,
"learning_rate": 4.504702568347117e-05,
"loss": 1.3343,
"step": 254
},
{
"epoch": 0.2182751979456452,
"grad_norm": 1.7265625,
"learning_rate": 4.500607413400546e-05,
"loss": 1.5471,
"step": 255
},
{
"epoch": 0.2191311791140595,
"grad_norm": 1.7734375,
"learning_rate": 4.4964972761856084e-05,
"loss": 1.4912,
"step": 256
},
{
"epoch": 0.21998716028247378,
"grad_norm": 1.5703125,
"learning_rate": 4.492372187482545e-05,
"loss": 1.2951,
"step": 257
},
{
"epoch": 0.22084314145088807,
"grad_norm": 1.6484375,
"learning_rate": 4.488232178183567e-05,
"loss": 1.4651,
"step": 258
},
{
"epoch": 0.22169912261930239,
"grad_norm": 1.7109375,
"learning_rate": 4.484077279292622e-05,
"loss": 1.3435,
"step": 259
},
{
"epoch": 0.22255510378771667,
"grad_norm": 1.828125,
"learning_rate": 4.479907521925168e-05,
"loss": 1.5813,
"step": 260
},
{
"epoch": 0.22341108495613096,
"grad_norm": 1.875,
"learning_rate": 4.4757229373079306e-05,
"loss": 1.3951,
"step": 261
},
{
"epoch": 0.22426706612454525,
"grad_norm": 1.7890625,
"learning_rate": 4.471523556778679e-05,
"loss": 1.4809,
"step": 262
},
{
"epoch": 0.22512304729295957,
"grad_norm": 1.5390625,
"learning_rate": 4.467309411785984e-05,
"loss": 1.4175,
"step": 263
},
{
"epoch": 0.22597902846137385,
"grad_norm": 1.8828125,
"learning_rate": 4.4630805338889866e-05,
"loss": 1.587,
"step": 264
},
{
"epoch": 0.22683500962978814,
"grad_norm": 1.71875,
"learning_rate": 4.458836954757161e-05,
"loss": 1.3758,
"step": 265
},
{
"epoch": 0.22769099079820243,
"grad_norm": 1.859375,
"learning_rate": 4.454578706170075e-05,
"loss": 1.4746,
"step": 266
},
{
"epoch": 0.22854697196661675,
"grad_norm": 1.703125,
"learning_rate": 4.450305820017156e-05,
"loss": 1.5459,
"step": 267
},
{
"epoch": 0.22940295313503103,
"grad_norm": 1.671875,
"learning_rate": 4.446018328297449e-05,
"loss": 1.361,
"step": 268
},
{
"epoch": 0.23025893430344532,
"grad_norm": 1.875,
"learning_rate": 4.441716263119379e-05,
"loss": 1.6767,
"step": 269
},
{
"epoch": 0.2311149154718596,
"grad_norm": 1.78125,
"learning_rate": 4.437399656700507e-05,
"loss": 1.4742,
"step": 270
},
{
"epoch": 0.23197089664027393,
"grad_norm": 1.6875,
"learning_rate": 4.433068541367295e-05,
"loss": 1.5136,
"step": 271
},
{
"epoch": 0.23282687780868822,
"grad_norm": 1.65625,
"learning_rate": 4.428722949554857e-05,
"loss": 1.4492,
"step": 272
},
{
"epoch": 0.2336828589771025,
"grad_norm": 1.6484375,
"learning_rate": 4.424362913806722e-05,
"loss": 1.4585,
"step": 273
},
{
"epoch": 0.2345388401455168,
"grad_norm": 1.578125,
"learning_rate": 4.419988466774586e-05,
"loss": 1.3074,
"step": 274
},
{
"epoch": 0.23539482131393108,
"grad_norm": 1.5390625,
"learning_rate": 4.415599641218068e-05,
"loss": 1.2787,
"step": 275
},
{
"epoch": 0.2362508024823454,
"grad_norm": 1.515625,
"learning_rate": 4.4111964700044686e-05,
"loss": 1.489,
"step": 276
},
{
"epoch": 0.23710678365075968,
"grad_norm": 1.578125,
"learning_rate": 4.4067789861085185e-05,
"loss": 1.4373,
"step": 277
},
{
"epoch": 0.23796276481917397,
"grad_norm": 1.703125,
"learning_rate": 4.402347222612137e-05,
"loss": 1.4773,
"step": 278
},
{
"epoch": 0.23881874598758826,
"grad_norm": 2.0,
"learning_rate": 4.397901212704176e-05,
"loss": 1.4799,
"step": 279
},
{
"epoch": 0.23967472715600258,
"grad_norm": 1.7421875,
"learning_rate": 4.393440989680184e-05,
"loss": 1.4964,
"step": 280
},
{
"epoch": 0.24053070832441686,
"grad_norm": 1.5859375,
"learning_rate": 4.3889665869421436e-05,
"loss": 1.3405,
"step": 281
},
{
"epoch": 0.24138668949283115,
"grad_norm": 1.5703125,
"learning_rate": 4.3844780379982296e-05,
"loss": 1.4144,
"step": 282
},
{
"epoch": 0.24224267066124544,
"grad_norm": 1.8359375,
"learning_rate": 4.3799753764625564e-05,
"loss": 1.4202,
"step": 283
},
{
"epoch": 0.24309865182965976,
"grad_norm": 1.7578125,
"learning_rate": 4.375458636054924e-05,
"loss": 1.6295,
"step": 284
},
{
"epoch": 0.24395463299807404,
"grad_norm": 1.84375,
"learning_rate": 4.370927850600569e-05,
"loss": 1.5213,
"step": 285
},
{
"epoch": 0.24481061416648833,
"grad_norm": 1.8125,
"learning_rate": 4.366383054029906e-05,
"loss": 1.4423,
"step": 286
},
{
"epoch": 0.24566659533490262,
"grad_norm": 1.7265625,
"learning_rate": 4.3618242803782825e-05,
"loss": 1.6341,
"step": 287
},
{
"epoch": 0.24652257650331694,
"grad_norm": 1.7890625,
"learning_rate": 4.357251563785713e-05,
"loss": 1.5936,
"step": 288
},
{
"epoch": 0.24737855767173123,
"grad_norm": 1.640625,
"learning_rate": 4.352664938496631e-05,
"loss": 1.5026,
"step": 289
},
{
"epoch": 0.2482345388401455,
"grad_norm": 1.765625,
"learning_rate": 4.348064438859629e-05,
"loss": 1.6062,
"step": 290
},
{
"epoch": 0.2490905200085598,
"grad_norm": 1.734375,
"learning_rate": 4.3434500993272066e-05,
"loss": 1.5012,
"step": 291
},
{
"epoch": 0.24994650117697412,
"grad_norm": 1.6328125,
"learning_rate": 4.338821954455503e-05,
"loss": 1.5942,
"step": 292
},
{
"epoch": 0.2508024823453884,
"grad_norm": 1.578125,
"learning_rate": 4.334180038904046e-05,
"loss": 1.5837,
"step": 293
},
{
"epoch": 0.2516584635138027,
"grad_norm": 1.9921875,
"learning_rate": 4.3295243874354926e-05,
"loss": 1.6746,
"step": 294
},
{
"epoch": 0.252514444682217,
"grad_norm": 1.7265625,
"learning_rate": 4.3248550349153616e-05,
"loss": 1.467,
"step": 295
},
{
"epoch": 0.2533704258506313,
"grad_norm": 1.671875,
"learning_rate": 4.3201720163117795e-05,
"loss": 1.497,
"step": 296
},
{
"epoch": 0.2542264070190456,
"grad_norm": 1.5390625,
"learning_rate": 4.315475366695217e-05,
"loss": 1.2926,
"step": 297
},
{
"epoch": 0.2550823881874599,
"grad_norm": 1.859375,
"learning_rate": 4.3107651212382236e-05,
"loss": 1.6157,
"step": 298
},
{
"epoch": 0.25593836935587416,
"grad_norm": 1.8359375,
"learning_rate": 4.306041315215167e-05,
"loss": 1.538,
"step": 299
},
{
"epoch": 0.25679435052428845,
"grad_norm": 1.6796875,
"learning_rate": 4.301303984001967e-05,
"loss": 1.4402,
"step": 300
},
{
"epoch": 0.25765033169270274,
"grad_norm": 1.859375,
"learning_rate": 4.296553163075836e-05,
"loss": 1.6127,
"step": 301
},
{
"epoch": 0.2585063128611171,
"grad_norm": 1.7890625,
"learning_rate": 4.291788888015002e-05,
"loss": 1.5769,
"step": 302
},
{
"epoch": 0.25936229402953137,
"grad_norm": 1.953125,
"learning_rate": 4.287011194498456e-05,
"loss": 1.2251,
"step": 303
},
{
"epoch": 0.26021827519794566,
"grad_norm": 1.5078125,
"learning_rate": 4.282220118305672e-05,
"loss": 1.4914,
"step": 304
},
{
"epoch": 0.26107425636635995,
"grad_norm": 1.640625,
"learning_rate": 4.277415695316349e-05,
"loss": 1.5531,
"step": 305
},
{
"epoch": 0.26193023753477424,
"grad_norm": 1.640625,
"learning_rate": 4.272597961510137e-05,
"loss": 1.3468,
"step": 306
},
{
"epoch": 0.2627862187031885,
"grad_norm": 1.7265625,
"learning_rate": 4.267766952966369e-05,
"loss": 1.5566,
"step": 307
},
{
"epoch": 0.2636421998716028,
"grad_norm": 1.7734375,
"learning_rate": 4.2629227058637904e-05,
"loss": 1.4052,
"step": 308
},
{
"epoch": 0.2644981810400171,
"grad_norm": 1.8046875,
"learning_rate": 4.258065256480288e-05,
"loss": 1.4669,
"step": 309
},
{
"epoch": 0.2653541622084314,
"grad_norm": 1.578125,
"learning_rate": 4.253194641192621e-05,
"loss": 1.3902,
"step": 310
},
{
"epoch": 0.26621014337684573,
"grad_norm": 1.8515625,
"learning_rate": 4.24831089647614e-05,
"loss": 1.4913,
"step": 311
},
{
"epoch": 0.26706612454526,
"grad_norm": 1.765625,
"learning_rate": 4.243414058904528e-05,
"loss": 1.4332,
"step": 312
},
{
"epoch": 0.2679221057136743,
"grad_norm": 1.53125,
"learning_rate": 4.238504165149515e-05,
"loss": 1.3904,
"step": 313
},
{
"epoch": 0.2687780868820886,
"grad_norm": 1.90625,
"learning_rate": 4.233581251980604e-05,
"loss": 1.5778,
"step": 314
},
{
"epoch": 0.2696340680505029,
"grad_norm": 1.765625,
"learning_rate": 4.2286453562648046e-05,
"loss": 1.6316,
"step": 315
},
{
"epoch": 0.2704900492189172,
"grad_norm": 1.8125,
"learning_rate": 4.223696514966346e-05,
"loss": 1.5792,
"step": 316
},
{
"epoch": 0.27134603038733146,
"grad_norm": 1.578125,
"learning_rate": 4.2187347651464055e-05,
"loss": 1.4227,
"step": 317
},
{
"epoch": 0.27220201155574575,
"grad_norm": 1.703125,
"learning_rate": 4.213760143962834e-05,
"loss": 1.3087,
"step": 318
},
{
"epoch": 0.2730579927241601,
"grad_norm": 1.734375,
"learning_rate": 4.20877268866987e-05,
"loss": 1.5096,
"step": 319
},
{
"epoch": 0.2739139738925744,
"grad_norm": 1.640625,
"learning_rate": 4.203772436617868e-05,
"loss": 1.3995,
"step": 320
},
{
"epoch": 0.27476995506098867,
"grad_norm": 1.875,
"learning_rate": 4.198759425253014e-05,
"loss": 1.6112,
"step": 321
},
{
"epoch": 0.27562593622940296,
"grad_norm": 1.6796875,
"learning_rate": 4.1937336921170476e-05,
"loss": 1.6356,
"step": 322
},
{
"epoch": 0.27648191739781725,
"grad_norm": 1.5078125,
"learning_rate": 4.188695274846979e-05,
"loss": 1.3759,
"step": 323
},
{
"epoch": 0.27733789856623153,
"grad_norm": 1.7734375,
"learning_rate": 4.183644211174809e-05,
"loss": 1.4551,
"step": 324
},
{
"epoch": 0.2781938797346458,
"grad_norm": 1.5859375,
"learning_rate": 4.1785805389272445e-05,
"loss": 1.4036,
"step": 325
},
{
"epoch": 0.2790498609030601,
"grad_norm": 1.6484375,
"learning_rate": 4.173504296025417e-05,
"loss": 1.3411,
"step": 326
},
{
"epoch": 0.27990584207147445,
"grad_norm": 1.609375,
"learning_rate": 4.1684155204845974e-05,
"loss": 1.5365,
"step": 327
},
{
"epoch": 0.28076182323988874,
"grad_norm": 2.109375,
"learning_rate": 4.163314250413913e-05,
"loss": 1.6192,
"step": 328
},
{
"epoch": 0.28161780440830303,
"grad_norm": 1.8125,
"learning_rate": 4.15820052401606e-05,
"loss": 1.6317,
"step": 329
},
{
"epoch": 0.2824737855767173,
"grad_norm": 1.7578125,
"learning_rate": 4.153074379587018e-05,
"loss": 1.5873,
"step": 330
},
{
"epoch": 0.2833297667451316,
"grad_norm": 1.578125,
"learning_rate": 4.147935855515763e-05,
"loss": 1.4148,
"step": 331
},
{
"epoch": 0.2841857479135459,
"grad_norm": 1.8671875,
"learning_rate": 4.142784990283982e-05,
"loss": 1.5794,
"step": 332
},
{
"epoch": 0.2850417290819602,
"grad_norm": 1.5625,
"learning_rate": 4.1376218224657825e-05,
"loss": 1.4822,
"step": 333
},
{
"epoch": 0.28589771025037447,
"grad_norm": 1.984375,
"learning_rate": 4.132446390727404e-05,
"loss": 1.4558,
"step": 334
},
{
"epoch": 0.2867536914187888,
"grad_norm": 1.8359375,
"learning_rate": 4.127258733826929e-05,
"loss": 1.61,
"step": 335
},
{
"epoch": 0.2876096725872031,
"grad_norm": 1.8203125,
"learning_rate": 4.122058890613991e-05,
"loss": 1.3766,
"step": 336
},
{
"epoch": 0.2884656537556174,
"grad_norm": 1.6015625,
"learning_rate": 4.1168469000294895e-05,
"loss": 1.4012,
"step": 337
},
{
"epoch": 0.2893216349240317,
"grad_norm": 1.8046875,
"learning_rate": 4.11162280110529e-05,
"loss": 1.5115,
"step": 338
},
{
"epoch": 0.29017761609244597,
"grad_norm": 1.609375,
"learning_rate": 4.106386632963936e-05,
"loss": 1.5486,
"step": 339
},
{
"epoch": 0.29103359726086026,
"grad_norm": 1.6328125,
"learning_rate": 4.101138434818357e-05,
"loss": 1.4817,
"step": 340
},
{
"epoch": 0.29188957842927454,
"grad_norm": 1.6796875,
"learning_rate": 4.095878245971573e-05,
"loss": 1.5482,
"step": 341
},
{
"epoch": 0.29274555959768883,
"grad_norm": 1.5859375,
"learning_rate": 4.0906061058163995e-05,
"loss": 1.48,
"step": 342
},
{
"epoch": 0.2936015407661031,
"grad_norm": 1.8671875,
"learning_rate": 4.085322053835157e-05,
"loss": 1.4816,
"step": 343
},
{
"epoch": 0.29445752193451746,
"grad_norm": 1.6484375,
"learning_rate": 4.080026129599368e-05,
"loss": 1.4987,
"step": 344
},
{
"epoch": 0.29531350310293175,
"grad_norm": 1.625,
"learning_rate": 4.0747183727694674e-05,
"loss": 1.6119,
"step": 345
},
{
"epoch": 0.29616948427134604,
"grad_norm": 1.65625,
"learning_rate": 4.0693988230945004e-05,
"loss": 1.5121,
"step": 346
},
{
"epoch": 0.29702546543976033,
"grad_norm": 1.8359375,
"learning_rate": 4.064067520411831e-05,
"loss": 1.5578,
"step": 347
},
{
"epoch": 0.2978814466081746,
"grad_norm": 2.078125,
"learning_rate": 4.058724504646834e-05,
"loss": 1.4593,
"step": 348
},
{
"epoch": 0.2987374277765889,
"grad_norm": 4.53125,
"learning_rate": 4.0533698158126085e-05,
"loss": 1.3536,
"step": 349
},
{
"epoch": 0.2995934089450032,
"grad_norm": 1.6953125,
"learning_rate": 4.048003494009666e-05,
"loss": 1.4781,
"step": 350
},
{
"epoch": 0.3004493901134175,
"grad_norm": 1.625,
"learning_rate": 4.042625579425639e-05,
"loss": 1.6591,
"step": 351
},
{
"epoch": 0.3004493901134175,
"eval_loss": 1.5815147161483765,
"eval_runtime": 21.3462,
"eval_samples_per_second": 18.27,
"eval_steps_per_second": 18.27,
"step": 351
},
{
"epoch": 0.3013053712818318,
"grad_norm": 1.5390625,
"learning_rate": 4.0372361123349756e-05,
"loss": 1.3439,
"step": 352
},
{
"epoch": 0.3021613524502461,
"grad_norm": 1.7890625,
"learning_rate": 4.031835133098639e-05,
"loss": 1.5028,
"step": 353
},
{
"epoch": 0.3030173336186604,
"grad_norm": 1.8515625,
"learning_rate": 4.026422682163804e-05,
"loss": 1.5099,
"step": 354
},
{
"epoch": 0.3038733147870747,
"grad_norm": 1.671875,
"learning_rate": 4.020998800063559e-05,
"loss": 1.4798,
"step": 355
},
{
"epoch": 0.304729295955489,
"grad_norm": 1.8125,
"learning_rate": 4.015563527416595e-05,
"loss": 1.4064,
"step": 356
},
{
"epoch": 0.30558527712390327,
"grad_norm": 1.9375,
"learning_rate": 4.010116904926907e-05,
"loss": 1.5338,
"step": 357
},
{
"epoch": 0.30644125829231755,
"grad_norm": 1.8046875,
"learning_rate": 4.0046589733834875e-05,
"loss": 1.5153,
"step": 358
},
{
"epoch": 0.30729723946073184,
"grad_norm": 1.8828125,
"learning_rate": 3.9991897736600184e-05,
"loss": 1.4596,
"step": 359
},
{
"epoch": 0.3081532206291462,
"grad_norm": 1.625,
"learning_rate": 3.9937093467145726e-05,
"loss": 1.5873,
"step": 360
},
{
"epoch": 0.3090092017975605,
"grad_norm": 1.65625,
"learning_rate": 3.988217733589296e-05,
"loss": 1.5941,
"step": 361
},
{
"epoch": 0.30986518296597476,
"grad_norm": 1.8203125,
"learning_rate": 3.982714975410111e-05,
"loss": 1.4578,
"step": 362
},
{
"epoch": 0.31072116413438905,
"grad_norm": 1.5234375,
"learning_rate": 3.977201113386402e-05,
"loss": 1.4387,
"step": 363
},
{
"epoch": 0.31157714530280334,
"grad_norm": 1.546875,
"learning_rate": 3.971676188810707e-05,
"loss": 1.496,
"step": 364
},
{
"epoch": 0.3124331264712176,
"grad_norm": 1.609375,
"learning_rate": 3.966140243058413e-05,
"loss": 1.3948,
"step": 365
},
{
"epoch": 0.3132891076396319,
"grad_norm": 1.7578125,
"learning_rate": 3.96059331758744e-05,
"loss": 1.5101,
"step": 366
},
{
"epoch": 0.3141450888080462,
"grad_norm": 1.8125,
"learning_rate": 3.955035453937935e-05,
"loss": 1.5071,
"step": 367
},
{
"epoch": 0.3150010699764605,
"grad_norm": 1.6171875,
"learning_rate": 3.949466693731962e-05,
"loss": 1.4645,
"step": 368
},
{
"epoch": 0.31585705114487483,
"grad_norm": 1.703125,
"learning_rate": 3.9438870786731815e-05,
"loss": 1.522,
"step": 369
},
{
"epoch": 0.3167130323132891,
"grad_norm": 1.5859375,
"learning_rate": 3.938296650546552e-05,
"loss": 1.4065,
"step": 370
},
{
"epoch": 0.3175690134817034,
"grad_norm": 1.6875,
"learning_rate": 3.9326954512180026e-05,
"loss": 1.4124,
"step": 371
},
{
"epoch": 0.3184249946501177,
"grad_norm": 1.703125,
"learning_rate": 3.927083522634132e-05,
"loss": 1.4137,
"step": 372
},
{
"epoch": 0.319280975818532,
"grad_norm": 1.828125,
"learning_rate": 3.9214609068218834e-05,
"loss": 1.482,
"step": 373
},
{
"epoch": 0.3201369569869463,
"grad_norm": 2.03125,
"learning_rate": 3.915827645888241e-05,
"loss": 1.3655,
"step": 374
},
{
"epoch": 0.32099293815536056,
"grad_norm": 1.6875,
"learning_rate": 3.910183782019905e-05,
"loss": 1.3776,
"step": 375
},
{
"epoch": 0.32184891932377485,
"grad_norm": 1.7421875,
"learning_rate": 3.9045293574829814e-05,
"loss": 1.5067,
"step": 376
},
{
"epoch": 0.3227049004921892,
"grad_norm": 1.8125,
"learning_rate": 3.8988644146226606e-05,
"loss": 1.4391,
"step": 377
},
{
"epoch": 0.3235608816606035,
"grad_norm": 1.8671875,
"learning_rate": 3.8931889958629066e-05,
"loss": 1.4054,
"step": 378
},
{
"epoch": 0.32441686282901777,
"grad_norm": 1.734375,
"learning_rate": 3.887503143706134e-05,
"loss": 1.721,
"step": 379
},
{
"epoch": 0.32527284399743206,
"grad_norm": 1.7578125,
"learning_rate": 3.881806900732893e-05,
"loss": 1.5304,
"step": 380
},
{
"epoch": 0.32612882516584635,
"grad_norm": 1.671875,
"learning_rate": 3.8761003096015466e-05,
"loss": 1.4313,
"step": 381
},
{
"epoch": 0.32698480633426064,
"grad_norm": 1.828125,
"learning_rate": 3.870383413047959e-05,
"loss": 1.4311,
"step": 382
},
{
"epoch": 0.3278407875026749,
"grad_norm": 1.828125,
"learning_rate": 3.864656253885163e-05,
"loss": 1.5491,
"step": 383
},
{
"epoch": 0.3286967686710892,
"grad_norm": 1.765625,
"learning_rate": 3.858918875003053e-05,
"loss": 1.5921,
"step": 384
},
{
"epoch": 0.32955274983950356,
"grad_norm": 1.8203125,
"learning_rate": 3.853171319368054e-05,
"loss": 1.3189,
"step": 385
},
{
"epoch": 0.33040873100791784,
"grad_norm": 1.578125,
"learning_rate": 3.847413630022804e-05,
"loss": 1.5709,
"step": 386
},
{
"epoch": 0.33126471217633213,
"grad_norm": 1.7578125,
"learning_rate": 3.841645850085831e-05,
"loss": 1.5226,
"step": 387
},
{
"epoch": 0.3321206933447464,
"grad_norm": 1.6171875,
"learning_rate": 3.835868022751231e-05,
"loss": 1.6103,
"step": 388
},
{
"epoch": 0.3329766745131607,
"grad_norm": 1.71875,
"learning_rate": 3.830080191288342e-05,
"loss": 1.4644,
"step": 389
},
{
"epoch": 0.333832655681575,
"grad_norm": 1.6015625,
"learning_rate": 3.8242823990414214e-05,
"loss": 1.5841,
"step": 390
},
{
"epoch": 0.3346886368499893,
"grad_norm": 1.7421875,
"learning_rate": 3.818474689429323e-05,
"loss": 1.4086,
"step": 391
},
{
"epoch": 0.3355446180184036,
"grad_norm": 1.8046875,
"learning_rate": 3.812657105945171e-05,
"loss": 1.4696,
"step": 392
},
{
"epoch": 0.3364005991868179,
"grad_norm": 1.5703125,
"learning_rate": 3.806829692156031e-05,
"loss": 1.3922,
"step": 393
},
{
"epoch": 0.3372565803552322,
"grad_norm": 1.6796875,
"learning_rate": 3.8009924917025864e-05,
"loss": 1.4289,
"step": 394
},
{
"epoch": 0.3381125615236465,
"grad_norm": 1.8046875,
"learning_rate": 3.795145548298815e-05,
"loss": 1.435,
"step": 395
},
{
"epoch": 0.3389685426920608,
"grad_norm": 1.625,
"learning_rate": 3.789288905731655e-05,
"loss": 1.4943,
"step": 396
},
{
"epoch": 0.33982452386047507,
"grad_norm": 1.75,
"learning_rate": 3.783422607860681e-05,
"loss": 1.5017,
"step": 397
},
{
"epoch": 0.34068050502888936,
"grad_norm": 4.375,
"learning_rate": 3.777546698617776e-05,
"loss": 1.5723,
"step": 398
},
{
"epoch": 0.34153648619730365,
"grad_norm": 1.609375,
"learning_rate": 3.7716612220068006e-05,
"loss": 1.4734,
"step": 399
},
{
"epoch": 0.34239246736571793,
"grad_norm": 1.6796875,
"learning_rate": 3.765766222103262e-05,
"loss": 1.5986,
"step": 400
},
{
"epoch": 0.3432484485341322,
"grad_norm": 1.7421875,
"learning_rate": 3.7598617430539884e-05,
"loss": 1.4154,
"step": 401
},
{
"epoch": 0.34410442970254657,
"grad_norm": 1.8046875,
"learning_rate": 3.753947829076797e-05,
"loss": 1.6668,
"step": 402
},
{
"epoch": 0.34496041087096085,
"grad_norm": 1.5703125,
"learning_rate": 3.7480245244601584e-05,
"loss": 1.4141,
"step": 403
},
{
"epoch": 0.34581639203937514,
"grad_norm": 1.484375,
"learning_rate": 3.742091873562871e-05,
"loss": 1.3079,
"step": 404
},
{
"epoch": 0.34667237320778943,
"grad_norm": 1.4609375,
"learning_rate": 3.7361499208137254e-05,
"loss": 1.5055,
"step": 405
},
{
"epoch": 0.3475283543762037,
"grad_norm": 1.6875,
"learning_rate": 3.730198710711173e-05,
"loss": 1.457,
"step": 406
},
{
"epoch": 0.348384335544618,
"grad_norm": 1.75,
"learning_rate": 3.724238287822991e-05,
"loss": 1.4187,
"step": 407
},
{
"epoch": 0.3492403167130323,
"grad_norm": 1.640625,
"learning_rate": 3.71826869678595e-05,
"loss": 1.4398,
"step": 408
},
{
"epoch": 0.3500962978814466,
"grad_norm": 1.8125,
"learning_rate": 3.7122899823054814e-05,
"loss": 1.4736,
"step": 409
},
{
"epoch": 0.3509522790498609,
"grad_norm": 2.03125,
"learning_rate": 3.706302189155338e-05,
"loss": 1.4837,
"step": 410
},
{
"epoch": 0.3518082602182752,
"grad_norm": 1.6171875,
"learning_rate": 3.7003053621772656e-05,
"loss": 1.4027,
"step": 411
},
{
"epoch": 0.3526642413866895,
"grad_norm": 1.796875,
"learning_rate": 3.694299546280657e-05,
"loss": 1.6534,
"step": 412
},
{
"epoch": 0.3535202225551038,
"grad_norm": 1.703125,
"learning_rate": 3.688284786442229e-05,
"loss": 1.5668,
"step": 413
},
{
"epoch": 0.3543762037235181,
"grad_norm": 1.65625,
"learning_rate": 3.682261127705671e-05,
"loss": 1.3467,
"step": 414
},
{
"epoch": 0.35523218489193237,
"grad_norm": 1.828125,
"learning_rate": 3.676228615181321e-05,
"loss": 1.4635,
"step": 415
},
{
"epoch": 0.35608816606034666,
"grad_norm": 1.5625,
"learning_rate": 3.6701872940458186e-05,
"loss": 1.3886,
"step": 416
},
{
"epoch": 0.35694414722876094,
"grad_norm": 1.609375,
"learning_rate": 3.66413720954177e-05,
"loss": 1.5161,
"step": 417
},
{
"epoch": 0.3578001283971753,
"grad_norm": 1.625,
"learning_rate": 3.6580784069774105e-05,
"loss": 1.5301,
"step": 418
},
{
"epoch": 0.3586561095655896,
"grad_norm": 1.7421875,
"learning_rate": 3.652010931726262e-05,
"loss": 1.3991,
"step": 419
},
{
"epoch": 0.35951209073400386,
"grad_norm": 1.65625,
"learning_rate": 3.645934829226797e-05,
"loss": 1.4226,
"step": 420
},
{
"epoch": 0.36036807190241815,
"grad_norm": 1.6796875,
"learning_rate": 3.6398501449820936e-05,
"loss": 1.5157,
"step": 421
},
{
"epoch": 0.36122405307083244,
"grad_norm": 1.6328125,
"learning_rate": 3.6337569245595005e-05,
"loss": 1.5619,
"step": 422
},
{
"epoch": 0.36208003423924673,
"grad_norm": 1.6015625,
"learning_rate": 3.62765521359029e-05,
"loss": 1.4751,
"step": 423
},
{
"epoch": 0.362936015407661,
"grad_norm": 3.21875,
"learning_rate": 3.6215450577693196e-05,
"loss": 1.4708,
"step": 424
},
{
"epoch": 0.3637919965760753,
"grad_norm": 1.671875,
"learning_rate": 3.615426502854689e-05,
"loss": 1.4924,
"step": 425
},
{
"epoch": 0.36464797774448965,
"grad_norm": 1.6015625,
"learning_rate": 3.6092995946673994e-05,
"loss": 1.5001,
"step": 426
},
{
"epoch": 0.36550395891290394,
"grad_norm": 1.484375,
"learning_rate": 3.603164379091006e-05,
"loss": 1.3498,
"step": 427
},
{
"epoch": 0.3663599400813182,
"grad_norm": 1.5703125,
"learning_rate": 3.597020902071278e-05,
"loss": 1.378,
"step": 428
},
{
"epoch": 0.3672159212497325,
"grad_norm": 2.015625,
"learning_rate": 3.590869209615854e-05,
"loss": 1.7722,
"step": 429
},
{
"epoch": 0.3680719024181468,
"grad_norm": 1.7421875,
"learning_rate": 3.5847093477938956e-05,
"loss": 1.5215,
"step": 430
},
{
"epoch": 0.3689278835865611,
"grad_norm": 1.6875,
"learning_rate": 3.578541362735744e-05,
"loss": 1.5693,
"step": 431
},
{
"epoch": 0.3697838647549754,
"grad_norm": 1.5703125,
"learning_rate": 3.572365300632574e-05,
"loss": 1.5959,
"step": 432
},
{
"epoch": 0.37063984592338967,
"grad_norm": 1.7734375,
"learning_rate": 3.56618120773605e-05,
"loss": 1.6924,
"step": 433
},
{
"epoch": 0.37149582709180395,
"grad_norm": 1.703125,
"learning_rate": 3.5599891303579746e-05,
"loss": 1.6631,
"step": 434
},
{
"epoch": 0.3723518082602183,
"grad_norm": 1.828125,
"learning_rate": 3.553789114869947e-05,
"loss": 1.4271,
"step": 435
},
{
"epoch": 0.3732077894286326,
"grad_norm": 1.5546875,
"learning_rate": 3.547581207703017e-05,
"loss": 1.4559,
"step": 436
},
{
"epoch": 0.3740637705970469,
"grad_norm": 1.6875,
"learning_rate": 3.541365455347327e-05,
"loss": 1.3832,
"step": 437
},
{
"epoch": 0.37491975176546116,
"grad_norm": 1.8203125,
"learning_rate": 3.535141904351776e-05,
"loss": 1.5994,
"step": 438
},
{
"epoch": 0.37577573293387545,
"grad_norm": 1.5546875,
"learning_rate": 3.528910601323666e-05,
"loss": 1.4947,
"step": 439
},
{
"epoch": 0.37663171410228974,
"grad_norm": 1.8671875,
"learning_rate": 3.5226715929283506e-05,
"loss": 1.3042,
"step": 440
},
{
"epoch": 0.377487695270704,
"grad_norm": 1.671875,
"learning_rate": 3.516424925888887e-05,
"loss": 1.4926,
"step": 441
},
{
"epoch": 0.3783436764391183,
"grad_norm": 1.5859375,
"learning_rate": 3.510170646985691e-05,
"loss": 1.4419,
"step": 442
},
{
"epoch": 0.37919965760753266,
"grad_norm": 1.5625,
"learning_rate": 3.50390880305618e-05,
"loss": 1.4541,
"step": 443
},
{
"epoch": 0.38005563877594695,
"grad_norm": 1.703125,
"learning_rate": 3.497639440994424e-05,
"loss": 1.5821,
"step": 444
},
{
"epoch": 0.38091161994436123,
"grad_norm": 1.625,
"learning_rate": 3.491362607750796e-05,
"loss": 1.4526,
"step": 445
},
{
"epoch": 0.3817676011127755,
"grad_norm": 1.546875,
"learning_rate": 3.485078350331622e-05,
"loss": 1.5525,
"step": 446
},
{
"epoch": 0.3826235822811898,
"grad_norm": 1.5703125,
"learning_rate": 3.478786715798823e-05,
"loss": 1.3649,
"step": 447
},
{
"epoch": 0.3834795634496041,
"grad_norm": 1.8125,
"learning_rate": 3.4724877512695674e-05,
"loss": 1.6517,
"step": 448
},
{
"epoch": 0.3843355446180184,
"grad_norm": 1.6953125,
"learning_rate": 3.466181503915918e-05,
"loss": 1.441,
"step": 449
},
{
"epoch": 0.3851915257864327,
"grad_norm": 1.84375,
"learning_rate": 3.459868020964478e-05,
"loss": 1.6027,
"step": 450
},
{
"epoch": 0.386047506954847,
"grad_norm": 1.453125,
"learning_rate": 3.453547349696033e-05,
"loss": 1.3575,
"step": 451
},
{
"epoch": 0.3869034881232613,
"grad_norm": 1.53125,
"learning_rate": 3.447219537445207e-05,
"loss": 1.4457,
"step": 452
},
{
"epoch": 0.3877594692916756,
"grad_norm": 1.6171875,
"learning_rate": 3.4408846316000956e-05,
"loss": 1.4387,
"step": 453
},
{
"epoch": 0.3886154504600899,
"grad_norm": 1.84375,
"learning_rate": 3.434542679601922e-05,
"loss": 1.5498,
"step": 454
},
{
"epoch": 0.38947143162850417,
"grad_norm": 1.78125,
"learning_rate": 3.428193728944675e-05,
"loss": 1.3684,
"step": 455
},
{
"epoch": 0.39032741279691846,
"grad_norm": 1.546875,
"learning_rate": 3.421837827174757e-05,
"loss": 1.5111,
"step": 456
},
{
"epoch": 0.39118339396533275,
"grad_norm": 1.703125,
"learning_rate": 3.415475021890622e-05,
"loss": 1.5642,
"step": 457
},
{
"epoch": 0.39203937513374704,
"grad_norm": 1.640625,
"learning_rate": 3.4091053607424295e-05,
"loss": 1.4413,
"step": 458
},
{
"epoch": 0.3928953563021613,
"grad_norm": 1.5546875,
"learning_rate": 3.402728891431677e-05,
"loss": 1.3544,
"step": 459
},
{
"epoch": 0.39375133747057567,
"grad_norm": 1.6953125,
"learning_rate": 3.396345661710849e-05,
"loss": 1.4379,
"step": 460
},
{
"epoch": 0.39460731863898996,
"grad_norm": 1.84375,
"learning_rate": 3.389955719383058e-05,
"loss": 1.7564,
"step": 461
},
{
"epoch": 0.39546329980740424,
"grad_norm": 1.53125,
"learning_rate": 3.3835591123016865e-05,
"loss": 1.5366,
"step": 462
},
{
"epoch": 0.39631928097581853,
"grad_norm": 1.6015625,
"learning_rate": 3.3771558883700284e-05,
"loss": 1.7521,
"step": 463
},
{
"epoch": 0.3971752621442328,
"grad_norm": 1.4375,
"learning_rate": 3.370746095540928e-05,
"loss": 1.4594,
"step": 464
},
{
"epoch": 0.3980312433126471,
"grad_norm": 1.5859375,
"learning_rate": 3.364329781816426e-05,
"loss": 1.4018,
"step": 465
},
{
"epoch": 0.3988872244810614,
"grad_norm": 1.7421875,
"learning_rate": 3.357906995247396e-05,
"loss": 1.5263,
"step": 466
},
{
"epoch": 0.3997432056494757,
"grad_norm": 1.7265625,
"learning_rate": 3.3514777839331856e-05,
"loss": 1.5457,
"step": 467
},
{
"epoch": 0.40059918681789003,
"grad_norm": 1.59375,
"learning_rate": 3.3450421960212566e-05,
"loss": 1.664,
"step": 468
},
{
"epoch": 0.40059918681789003,
"eval_loss": 1.5587416887283325,
"eval_runtime": 21.3401,
"eval_samples_per_second": 18.275,
"eval_steps_per_second": 18.275,
"step": 468
},
{
"epoch": 0.4014551679863043,
"grad_norm": 1.7578125,
"learning_rate": 3.338600279706826e-05,
"loss": 1.5381,
"step": 469
},
{
"epoch": 0.4023111491547186,
"grad_norm": 2.421875,
"learning_rate": 3.3321520832325e-05,
"loss": 1.4321,
"step": 470
},
{
"epoch": 0.4031671303231329,
"grad_norm": 1.6953125,
"learning_rate": 3.3256976548879184e-05,
"loss": 1.4431,
"step": 471
},
{
"epoch": 0.4040231114915472,
"grad_norm": 1.75,
"learning_rate": 3.319237043009389e-05,
"loss": 1.3993,
"step": 472
},
{
"epoch": 0.40487909265996147,
"grad_norm": 1.5234375,
"learning_rate": 3.3127702959795296e-05,
"loss": 1.3284,
"step": 473
},
{
"epoch": 0.40573507382837576,
"grad_norm": 1.6875,
"learning_rate": 3.306297462226901e-05,
"loss": 1.3601,
"step": 474
},
{
"epoch": 0.40659105499679005,
"grad_norm": 1.5546875,
"learning_rate": 3.299818590225647e-05,
"loss": 1.4164,
"step": 475
},
{
"epoch": 0.4074470361652044,
"grad_norm": 1.7109375,
"learning_rate": 3.2933337284951336e-05,
"loss": 1.4316,
"step": 476
},
{
"epoch": 0.4083030173336187,
"grad_norm": 1.5546875,
"learning_rate": 3.286842925599579e-05,
"loss": 1.5327,
"step": 477
},
{
"epoch": 0.40915899850203297,
"grad_norm": 1.6640625,
"learning_rate": 3.2803462301476964e-05,
"loss": 1.3832,
"step": 478
},
{
"epoch": 0.41001497967044725,
"grad_norm": 1.46875,
"learning_rate": 3.273843690792326e-05,
"loss": 1.2295,
"step": 479
},
{
"epoch": 0.41087096083886154,
"grad_norm": 1.8046875,
"learning_rate": 3.267335356230075e-05,
"loss": 1.4291,
"step": 480
},
{
"epoch": 0.41172694200727583,
"grad_norm": 1.640625,
"learning_rate": 3.260821275200947e-05,
"loss": 1.7269,
"step": 481
},
{
"epoch": 0.4125829231756901,
"grad_norm": 1.5234375,
"learning_rate": 3.2543014964879816e-05,
"loss": 1.3251,
"step": 482
},
{
"epoch": 0.4134389043441044,
"grad_norm": 1.828125,
"learning_rate": 3.247776068916887e-05,
"loss": 1.6163,
"step": 483
},
{
"epoch": 0.41429488551251875,
"grad_norm": 1.5546875,
"learning_rate": 3.241245041355675e-05,
"loss": 1.3584,
"step": 484
},
{
"epoch": 0.41515086668093304,
"grad_norm": 1.703125,
"learning_rate": 3.234708462714297e-05,
"loss": 1.4595,
"step": 485
},
{
"epoch": 0.4160068478493473,
"grad_norm": 2.078125,
"learning_rate": 3.228166381944272e-05,
"loss": 1.7641,
"step": 486
},
{
"epoch": 0.4168628290177616,
"grad_norm": 1.640625,
"learning_rate": 3.2216188480383256e-05,
"loss": 1.4908,
"step": 487
},
{
"epoch": 0.4177188101861759,
"grad_norm": 1.9140625,
"learning_rate": 3.215065910030021e-05,
"loss": 1.6466,
"step": 488
},
{
"epoch": 0.4185747913545902,
"grad_norm": 1.71875,
"learning_rate": 3.208507616993393e-05,
"loss": 1.4535,
"step": 489
},
{
"epoch": 0.4194307725230045,
"grad_norm": 1.5390625,
"learning_rate": 3.201944018042577e-05,
"loss": 1.4366,
"step": 490
},
{
"epoch": 0.42028675369141877,
"grad_norm": 1.6796875,
"learning_rate": 3.1953751623314475e-05,
"loss": 1.3296,
"step": 491
},
{
"epoch": 0.42114273485983306,
"grad_norm": 1.546875,
"learning_rate": 3.1888010990532415e-05,
"loss": 1.4605,
"step": 492
},
{
"epoch": 0.4219987160282474,
"grad_norm": 1.6484375,
"learning_rate": 3.182221877440198e-05,
"loss": 1.3257,
"step": 493
},
{
"epoch": 0.4228546971966617,
"grad_norm": 1.5703125,
"learning_rate": 3.175637546763183e-05,
"loss": 1.4084,
"step": 494
},
{
"epoch": 0.423710678365076,
"grad_norm": 1.5,
"learning_rate": 3.169048156331329e-05,
"loss": 1.5077,
"step": 495
},
{
"epoch": 0.42456665953349026,
"grad_norm": 1.96875,
"learning_rate": 3.162453755491655e-05,
"loss": 1.2778,
"step": 496
},
{
"epoch": 0.42542264070190455,
"grad_norm": 2.0,
"learning_rate": 3.1558543936287035e-05,
"loss": 1.3954,
"step": 497
},
{
"epoch": 0.42627862187031884,
"grad_norm": 1.671875,
"learning_rate": 3.149250120164171e-05,
"loss": 1.4434,
"step": 498
},
{
"epoch": 0.42713460303873313,
"grad_norm": 1.515625,
"learning_rate": 3.142640984556536e-05,
"loss": 1.5035,
"step": 499
},
{
"epoch": 0.4279905842071474,
"grad_norm": 1.7265625,
"learning_rate": 3.136027036300687e-05,
"loss": 1.5234,
"step": 500
},
{
"epoch": 0.42884656537556176,
"grad_norm": 1.5859375,
"learning_rate": 3.1294083249275545e-05,
"loss": 1.3764,
"step": 501
},
{
"epoch": 0.42970254654397605,
"grad_norm": 1.578125,
"learning_rate": 3.122784900003742e-05,
"loss": 1.4066,
"step": 502
},
{
"epoch": 0.43055852771239034,
"grad_norm": 1.828125,
"learning_rate": 3.116156811131148e-05,
"loss": 1.6143,
"step": 503
},
{
"epoch": 0.4314145088808046,
"grad_norm": 1.5703125,
"learning_rate": 3.109524107946602e-05,
"loss": 1.3665,
"step": 504
},
{
"epoch": 0.4322704900492189,
"grad_norm": 1.65625,
"learning_rate": 3.102886840121486e-05,
"loss": 1.3919,
"step": 505
},
{
"epoch": 0.4331264712176332,
"grad_norm": 1.8125,
"learning_rate": 3.0962450573613704e-05,
"loss": 1.6993,
"step": 506
},
{
"epoch": 0.4339824523860475,
"grad_norm": 1.65625,
"learning_rate": 3.089598809405633e-05,
"loss": 1.3292,
"step": 507
},
{
"epoch": 0.4348384335544618,
"grad_norm": 1.515625,
"learning_rate": 3.0829481460270936e-05,
"loss": 1.3597,
"step": 508
},
{
"epoch": 0.4356944147228761,
"grad_norm": 1.453125,
"learning_rate": 3.0762931170316385e-05,
"loss": 1.3326,
"step": 509
},
{
"epoch": 0.4365503958912904,
"grad_norm": 1.59375,
"learning_rate": 3.0696337722578444e-05,
"loss": 1.4273,
"step": 510
},
{
"epoch": 0.4374063770597047,
"grad_norm": 1.4375,
"learning_rate": 3.062970161576612e-05,
"loss": 1.4425,
"step": 511
},
{
"epoch": 0.438262358228119,
"grad_norm": 1.6015625,
"learning_rate": 3.056302334890786e-05,
"loss": 1.5967,
"step": 512
},
{
"epoch": 0.4391183393965333,
"grad_norm": 1.6484375,
"learning_rate": 3.0496303421347872e-05,
"loss": 1.5083,
"step": 513
},
{
"epoch": 0.43997432056494756,
"grad_norm": 1.5,
"learning_rate": 3.0429542332742323e-05,
"loss": 1.3709,
"step": 514
},
{
"epoch": 0.44083030173336185,
"grad_norm": 1.6328125,
"learning_rate": 3.036274058305565e-05,
"loss": 1.4481,
"step": 515
},
{
"epoch": 0.44168628290177614,
"grad_norm": 1.5625,
"learning_rate": 3.029589867255678e-05,
"loss": 1.4541,
"step": 516
},
{
"epoch": 0.4425422640701905,
"grad_norm": 1.625,
"learning_rate": 3.022901710181542e-05,
"loss": 1.6127,
"step": 517
},
{
"epoch": 0.44339824523860477,
"grad_norm": 1.640625,
"learning_rate": 3.0162096371698267e-05,
"loss": 1.2699,
"step": 518
},
{
"epoch": 0.44425422640701906,
"grad_norm": 1.5859375,
"learning_rate": 3.0095136983365286e-05,
"loss": 1.4119,
"step": 519
},
{
"epoch": 0.44511020757543335,
"grad_norm": 1.5078125,
"learning_rate": 3.0028139438265944e-05,
"loss": 1.4058,
"step": 520
},
{
"epoch": 0.44596618874384764,
"grad_norm": 1.703125,
"learning_rate": 2.9961104238135457e-05,
"loss": 1.6121,
"step": 521
},
{
"epoch": 0.4468221699122619,
"grad_norm": 1.6484375,
"learning_rate": 2.989403188499105e-05,
"loss": 1.5662,
"step": 522
},
{
"epoch": 0.4476781510806762,
"grad_norm": 1.453125,
"learning_rate": 2.9826922881128162e-05,
"loss": 1.5012,
"step": 523
},
{
"epoch": 0.4485341322490905,
"grad_norm": 1.515625,
"learning_rate": 2.975977772911671e-05,
"loss": 1.4917,
"step": 524
},
{
"epoch": 0.4493901134175048,
"grad_norm": 1.7109375,
"learning_rate": 2.969259693179733e-05,
"loss": 1.3906,
"step": 525
},
{
"epoch": 0.45024609458591913,
"grad_norm": 1.6953125,
"learning_rate": 2.9625380992277584e-05,
"loss": 1.583,
"step": 526
},
{
"epoch": 0.4511020757543334,
"grad_norm": 1.6875,
"learning_rate": 2.955813041392822e-05,
"loss": 1.4414,
"step": 527
},
{
"epoch": 0.4519580569227477,
"grad_norm": 1.453125,
"learning_rate": 2.949084570037939e-05,
"loss": 1.2735,
"step": 528
},
{
"epoch": 0.452814038091162,
"grad_norm": 1.46875,
"learning_rate": 2.9423527355516876e-05,
"loss": 1.3283,
"step": 529
},
{
"epoch": 0.4536700192595763,
"grad_norm": 1.671875,
"learning_rate": 2.9356175883478322e-05,
"loss": 1.5274,
"step": 530
},
{
"epoch": 0.4545260004279906,
"grad_norm": 1.5234375,
"learning_rate": 2.9288791788649462e-05,
"loss": 1.4455,
"step": 531
},
{
"epoch": 0.45538198159640486,
"grad_norm": 1.6796875,
"learning_rate": 2.922137557566032e-05,
"loss": 1.4383,
"step": 532
},
{
"epoch": 0.45623796276481915,
"grad_norm": 1.5859375,
"learning_rate": 2.9153927749381483e-05,
"loss": 1.4231,
"step": 533
},
{
"epoch": 0.4570939439332335,
"grad_norm": 1.4921875,
"learning_rate": 2.9086448814920242e-05,
"loss": 1.4336,
"step": 534
},
{
"epoch": 0.4579499251016478,
"grad_norm": 1.5078125,
"learning_rate": 2.9018939277616886e-05,
"loss": 1.3865,
"step": 535
},
{
"epoch": 0.45880590627006207,
"grad_norm": 1.71875,
"learning_rate": 2.8951399643040867e-05,
"loss": 1.3812,
"step": 536
},
{
"epoch": 0.45966188743847636,
"grad_norm": 1.4609375,
"learning_rate": 2.888383041698704e-05,
"loss": 1.4111,
"step": 537
},
{
"epoch": 0.46051786860689065,
"grad_norm": 1.5234375,
"learning_rate": 2.8816232105471863e-05,
"loss": 1.2808,
"step": 538
},
{
"epoch": 0.46137384977530493,
"grad_norm": 1.4765625,
"learning_rate": 2.874860521472962e-05,
"loss": 1.4054,
"step": 539
},
{
"epoch": 0.4622298309437192,
"grad_norm": 1.6015625,
"learning_rate": 2.8680950251208595e-05,
"loss": 1.4313,
"step": 540
},
{
"epoch": 0.4630858121121335,
"grad_norm": 1.671875,
"learning_rate": 2.8613267721567333e-05,
"loss": 1.3595,
"step": 541
},
{
"epoch": 0.46394179328054785,
"grad_norm": 1.6015625,
"learning_rate": 2.8545558132670803e-05,
"loss": 1.4876,
"step": 542
},
{
"epoch": 0.46479777444896214,
"grad_norm": 1.7734375,
"learning_rate": 2.847782199158663e-05,
"loss": 1.4332,
"step": 543
},
{
"epoch": 0.46565375561737643,
"grad_norm": 1.8046875,
"learning_rate": 2.8410059805581258e-05,
"loss": 1.4712,
"step": 544
},
{
"epoch": 0.4665097367857907,
"grad_norm": 1.9375,
"learning_rate": 2.834227208211621e-05,
"loss": 1.4455,
"step": 545
},
{
"epoch": 0.467365717954205,
"grad_norm": 1.6015625,
"learning_rate": 2.8274459328844248e-05,
"loss": 1.4987,
"step": 546
},
{
"epoch": 0.4682216991226193,
"grad_norm": 1.78125,
"learning_rate": 2.8206622053605553e-05,
"loss": 1.4329,
"step": 547
},
{
"epoch": 0.4690776802910336,
"grad_norm": 1.6015625,
"learning_rate": 2.813876076442397e-05,
"loss": 1.3499,
"step": 548
},
{
"epoch": 0.46993366145944787,
"grad_norm": 1.671875,
"learning_rate": 2.8070875969503192e-05,
"loss": 1.4936,
"step": 549
},
{
"epoch": 0.47078964262786216,
"grad_norm": 1.75,
"learning_rate": 2.8002968177222917e-05,
"loss": 1.4108,
"step": 550
},
{
"epoch": 0.4716456237962765,
"grad_norm": 1.4296875,
"learning_rate": 2.793503789613507e-05,
"loss": 1.4677,
"step": 551
},
{
"epoch": 0.4725016049646908,
"grad_norm": 1.5625,
"learning_rate": 2.7867085634960016e-05,
"loss": 1.6387,
"step": 552
},
{
"epoch": 0.4733575861331051,
"grad_norm": 1.546875,
"learning_rate": 2.7799111902582696e-05,
"loss": 1.4241,
"step": 553
},
{
"epoch": 0.47421356730151937,
"grad_norm": 1.6875,
"learning_rate": 2.7731117208048872e-05,
"loss": 1.5287,
"step": 554
},
{
"epoch": 0.47506954846993366,
"grad_norm": 1.6484375,
"learning_rate": 2.7663102060561275e-05,
"loss": 1.4029,
"step": 555
},
{
"epoch": 0.47592552963834794,
"grad_norm": 1.5234375,
"learning_rate": 2.75950669694758e-05,
"loss": 1.3678,
"step": 556
},
{
"epoch": 0.47678151080676223,
"grad_norm": 1.5,
"learning_rate": 2.7527012444297707e-05,
"loss": 1.3775,
"step": 557
},
{
"epoch": 0.4776374919751765,
"grad_norm": 1.5625,
"learning_rate": 2.7458938994677786e-05,
"loss": 1.6167,
"step": 558
},
{
"epoch": 0.47849347314359086,
"grad_norm": 1.625,
"learning_rate": 2.739084713040856e-05,
"loss": 1.4628,
"step": 559
},
{
"epoch": 0.47934945431200515,
"grad_norm": 1.59375,
"learning_rate": 2.7322737361420454e-05,
"loss": 1.5349,
"step": 560
},
{
"epoch": 0.48020543548041944,
"grad_norm": 1.6328125,
"learning_rate": 2.725461019777797e-05,
"loss": 1.4614,
"step": 561
},
{
"epoch": 0.48106141664883373,
"grad_norm": 1.609375,
"learning_rate": 2.7186466149675887e-05,
"loss": 1.6509,
"step": 562
},
{
"epoch": 0.481917397817248,
"grad_norm": 1.8046875,
"learning_rate": 2.7118305727435434e-05,
"loss": 1.4552,
"step": 563
},
{
"epoch": 0.4827733789856623,
"grad_norm": 1.765625,
"learning_rate": 2.7050129441500436e-05,
"loss": 1.6248,
"step": 564
},
{
"epoch": 0.4836293601540766,
"grad_norm": 1.515625,
"learning_rate": 2.698193780243355e-05,
"loss": 1.4198,
"step": 565
},
{
"epoch": 0.4844853413224909,
"grad_norm": 1.9140625,
"learning_rate": 2.69137313209124e-05,
"loss": 1.4712,
"step": 566
},
{
"epoch": 0.4853413224909052,
"grad_norm": 1.5703125,
"learning_rate": 2.6845510507725745e-05,
"loss": 1.3251,
"step": 567
},
{
"epoch": 0.4861973036593195,
"grad_norm": 1.5390625,
"learning_rate": 2.67772758737697e-05,
"loss": 1.3109,
"step": 568
},
{
"epoch": 0.4870532848277338,
"grad_norm": 1.4765625,
"learning_rate": 2.670902793004389e-05,
"loss": 1.4285,
"step": 569
},
{
"epoch": 0.4879092659961481,
"grad_norm": 1.5390625,
"learning_rate": 2.664076718764756e-05,
"loss": 1.4363,
"step": 570
},
{
"epoch": 0.4887652471645624,
"grad_norm": 1.578125,
"learning_rate": 2.657249415777585e-05,
"loss": 1.2128,
"step": 571
},
{
"epoch": 0.48962122833297667,
"grad_norm": 1.546875,
"learning_rate": 2.6504209351715914e-05,
"loss": 1.472,
"step": 572
},
{
"epoch": 0.49047720950139095,
"grad_norm": 2.53125,
"learning_rate": 2.643591328084309e-05,
"loss": 1.3816,
"step": 573
},
{
"epoch": 0.49133319066980524,
"grad_norm": 1.8984375,
"learning_rate": 2.6367606456617055e-05,
"loss": 1.5654,
"step": 574
},
{
"epoch": 0.4921891718382196,
"grad_norm": 1.8046875,
"learning_rate": 2.6299289390578053e-05,
"loss": 1.5554,
"step": 575
},
{
"epoch": 0.4930451530066339,
"grad_norm": 1.6875,
"learning_rate": 2.623096259434302e-05,
"loss": 1.5279,
"step": 576
},
{
"epoch": 0.49390113417504816,
"grad_norm": 1.84375,
"learning_rate": 2.616262657960173e-05,
"loss": 1.4617,
"step": 577
},
{
"epoch": 0.49475711534346245,
"grad_norm": 1.640625,
"learning_rate": 2.6094281858113022e-05,
"loss": 1.4409,
"step": 578
},
{
"epoch": 0.49561309651187674,
"grad_norm": 1.5,
"learning_rate": 2.6025928941700945e-05,
"loss": 1.38,
"step": 579
},
{
"epoch": 0.496469077680291,
"grad_norm": 1.6328125,
"learning_rate": 2.595756834225089e-05,
"loss": 1.4494,
"step": 580
},
{
"epoch": 0.4973250588487053,
"grad_norm": 1.6484375,
"learning_rate": 2.5889200571705795e-05,
"loss": 1.4874,
"step": 581
},
{
"epoch": 0.4981810400171196,
"grad_norm": 1.6484375,
"learning_rate": 2.5820826142062323e-05,
"loss": 1.5417,
"step": 582
},
{
"epoch": 0.4990370211855339,
"grad_norm": 1.6328125,
"learning_rate": 2.575244556536697e-05,
"loss": 1.4868,
"step": 583
},
{
"epoch": 0.49989300235394823,
"grad_norm": 1.6796875,
"learning_rate": 2.5684059353712307e-05,
"loss": 1.3093,
"step": 584
},
{
"epoch": 0.5007489835223625,
"grad_norm": 1.921875,
"learning_rate": 2.5615668019233064e-05,
"loss": 1.5308,
"step": 585
},
{
"epoch": 0.5007489835223625,
"eval_loss": 1.5403519868850708,
"eval_runtime": 21.3271,
"eval_samples_per_second": 18.287,
"eval_steps_per_second": 18.287,
"step": 585
},
{
"epoch": 0.5016049646907768,
"grad_norm": 1.4453125,
"learning_rate": 2.5547272074102374e-05,
"loss": 1.339,
"step": 586
},
{
"epoch": 0.5024609458591911,
"grad_norm": 1.5625,
"learning_rate": 2.5478872030527855e-05,
"loss": 1.413,
"step": 587
},
{
"epoch": 0.5033169270276054,
"grad_norm": 1.6640625,
"learning_rate": 2.5410468400747854e-05,
"loss": 1.399,
"step": 588
},
{
"epoch": 0.5041729081960197,
"grad_norm": 1.546875,
"learning_rate": 2.534206169702757e-05,
"loss": 1.5245,
"step": 589
},
{
"epoch": 0.505028889364434,
"grad_norm": 1.7578125,
"learning_rate": 2.5273652431655204e-05,
"loss": 1.418,
"step": 590
},
{
"epoch": 0.5058848705328483,
"grad_norm": 1.5078125,
"learning_rate": 2.520524111693814e-05,
"loss": 1.4231,
"step": 591
},
{
"epoch": 0.5067408517012626,
"grad_norm": 1.5390625,
"learning_rate": 2.513682826519914e-05,
"loss": 1.3967,
"step": 592
},
{
"epoch": 0.5075968328696768,
"grad_norm": 1.453125,
"learning_rate": 2.5068414388772453e-05,
"loss": 1.3799,
"step": 593
},
{
"epoch": 0.5084528140380912,
"grad_norm": 1.84375,
"learning_rate": 2.5e-05,
"loss": 1.4701,
"step": 594
},
{
"epoch": 0.5093087952065054,
"grad_norm": 1.5078125,
"learning_rate": 2.4931585611227543e-05,
"loss": 1.3946,
"step": 595
},
{
"epoch": 0.5101647763749197,
"grad_norm": 1.765625,
"learning_rate": 2.4863171734800865e-05,
"loss": 1.5882,
"step": 596
},
{
"epoch": 0.5110207575433341,
"grad_norm": 1.5625,
"learning_rate": 2.479475888306186e-05,
"loss": 1.4909,
"step": 597
},
{
"epoch": 0.5118767387117483,
"grad_norm": 1.453125,
"learning_rate": 2.472634756834481e-05,
"loss": 1.2668,
"step": 598
},
{
"epoch": 0.5127327198801627,
"grad_norm": 1.5,
"learning_rate": 2.4657938302972437e-05,
"loss": 1.2743,
"step": 599
},
{
"epoch": 0.5135887010485769,
"grad_norm": 1.453125,
"learning_rate": 2.458953159925215e-05,
"loss": 1.3327,
"step": 600
},
{
"epoch": 0.5144446822169912,
"grad_norm": 1.625,
"learning_rate": 2.4521127969472148e-05,
"loss": 1.5656,
"step": 601
},
{
"epoch": 0.5153006633854055,
"grad_norm": 1.546875,
"learning_rate": 2.4452727925897635e-05,
"loss": 1.2883,
"step": 602
},
{
"epoch": 0.5161566445538198,
"grad_norm": 1.546875,
"learning_rate": 2.438433198076694e-05,
"loss": 1.4471,
"step": 603
},
{
"epoch": 0.5170126257222342,
"grad_norm": 1.5703125,
"learning_rate": 2.4315940646287695e-05,
"loss": 1.3376,
"step": 604
},
{
"epoch": 0.5178686068906484,
"grad_norm": 1.6796875,
"learning_rate": 2.424755443463303e-05,
"loss": 1.4541,
"step": 605
},
{
"epoch": 0.5187245880590627,
"grad_norm": 1.4453125,
"learning_rate": 2.4179173857937683e-05,
"loss": 1.2946,
"step": 606
},
{
"epoch": 0.519580569227477,
"grad_norm": 1.6484375,
"learning_rate": 2.411079942829421e-05,
"loss": 1.4473,
"step": 607
},
{
"epoch": 0.5204365503958913,
"grad_norm": 1.7578125,
"learning_rate": 2.4042431657749117e-05,
"loss": 1.5921,
"step": 608
},
{
"epoch": 0.5212925315643056,
"grad_norm": 1.59375,
"learning_rate": 2.3974071058299064e-05,
"loss": 1.3892,
"step": 609
},
{
"epoch": 0.5221485127327199,
"grad_norm": 1.546875,
"learning_rate": 2.390571814188698e-05,
"loss": 1.4598,
"step": 610
},
{
"epoch": 0.5230044939011341,
"grad_norm": 1.625,
"learning_rate": 2.383737342039827e-05,
"loss": 1.4553,
"step": 611
},
{
"epoch": 0.5238604750695485,
"grad_norm": 1.8046875,
"learning_rate": 2.3769037405656987e-05,
"loss": 1.5219,
"step": 612
},
{
"epoch": 0.5247164562379628,
"grad_norm": 1.4765625,
"learning_rate": 2.3700710609421946e-05,
"loss": 1.255,
"step": 613
},
{
"epoch": 0.525572437406377,
"grad_norm": 1.6875,
"learning_rate": 2.3632393543382954e-05,
"loss": 1.4204,
"step": 614
},
{
"epoch": 0.5264284185747914,
"grad_norm": 1.546875,
"learning_rate": 2.356408671915692e-05,
"loss": 1.4509,
"step": 615
},
{
"epoch": 0.5272843997432056,
"grad_norm": 1.4921875,
"learning_rate": 2.3495790648284092e-05,
"loss": 1.3018,
"step": 616
},
{
"epoch": 0.52814038091162,
"grad_norm": 1.546875,
"learning_rate": 2.3427505842224154e-05,
"loss": 1.5016,
"step": 617
},
{
"epoch": 0.5289963620800342,
"grad_norm": 1.5234375,
"learning_rate": 2.3359232812352443e-05,
"loss": 1.3029,
"step": 618
},
{
"epoch": 0.5298523432484485,
"grad_norm": 1.7421875,
"learning_rate": 2.3290972069956117e-05,
"loss": 1.4533,
"step": 619
},
{
"epoch": 0.5307083244168628,
"grad_norm": 1.59375,
"learning_rate": 2.3222724126230294e-05,
"loss": 1.36,
"step": 620
},
{
"epoch": 0.5315643055852771,
"grad_norm": 1.875,
"learning_rate": 2.315448949227426e-05,
"loss": 1.5453,
"step": 621
},
{
"epoch": 0.5324202867536915,
"grad_norm": 1.6328125,
"learning_rate": 2.3086268679087607e-05,
"loss": 1.3677,
"step": 622
},
{
"epoch": 0.5332762679221057,
"grad_norm": 1.7578125,
"learning_rate": 2.3018062197566462e-05,
"loss": 1.5106,
"step": 623
},
{
"epoch": 0.53413224909052,
"grad_norm": 1.765625,
"learning_rate": 2.294987055849957e-05,
"loss": 1.6279,
"step": 624
},
{
"epoch": 0.5349882302589343,
"grad_norm": 1.5234375,
"learning_rate": 2.288169427256458e-05,
"loss": 1.4241,
"step": 625
},
{
"epoch": 0.5358442114273486,
"grad_norm": 1.484375,
"learning_rate": 2.281353385032412e-05,
"loss": 1.4114,
"step": 626
},
{
"epoch": 0.5367001925957628,
"grad_norm": 1.5703125,
"learning_rate": 2.2745389802222032e-05,
"loss": 1.4671,
"step": 627
},
{
"epoch": 0.5375561737641772,
"grad_norm": 1.6875,
"learning_rate": 2.2677262638579555e-05,
"loss": 1.5669,
"step": 628
},
{
"epoch": 0.5384121549325915,
"grad_norm": 1.5703125,
"learning_rate": 2.2609152869591446e-05,
"loss": 1.4634,
"step": 629
},
{
"epoch": 0.5392681361010058,
"grad_norm": 1.671875,
"learning_rate": 2.2541061005322227e-05,
"loss": 1.4757,
"step": 630
},
{
"epoch": 0.5401241172694201,
"grad_norm": 1.6484375,
"learning_rate": 2.2472987555702302e-05,
"loss": 1.504,
"step": 631
},
{
"epoch": 0.5409800984378343,
"grad_norm": 1.6875,
"learning_rate": 2.240493303052421e-05,
"loss": 1.5711,
"step": 632
},
{
"epoch": 0.5418360796062487,
"grad_norm": 1.390625,
"learning_rate": 2.2336897939438734e-05,
"loss": 1.3183,
"step": 633
},
{
"epoch": 0.5426920607746629,
"grad_norm": 1.5078125,
"learning_rate": 2.2268882791951127e-05,
"loss": 1.4867,
"step": 634
},
{
"epoch": 0.5435480419430773,
"grad_norm": 1.390625,
"learning_rate": 2.2200888097417307e-05,
"loss": 1.2882,
"step": 635
},
{
"epoch": 0.5444040231114915,
"grad_norm": 1.609375,
"learning_rate": 2.2132914365039993e-05,
"loss": 1.4977,
"step": 636
},
{
"epoch": 0.5452600042799058,
"grad_norm": 1.4296875,
"learning_rate": 2.2064962103864937e-05,
"loss": 1.4808,
"step": 637
},
{
"epoch": 0.5461159854483202,
"grad_norm": 1.6484375,
"learning_rate": 2.1997031822777093e-05,
"loss": 1.3365,
"step": 638
},
{
"epoch": 0.5469719666167344,
"grad_norm": 1.375,
"learning_rate": 2.1929124030496817e-05,
"loss": 1.3079,
"step": 639
},
{
"epoch": 0.5478279477851488,
"grad_norm": 1.5546875,
"learning_rate": 2.186123923557603e-05,
"loss": 1.4077,
"step": 640
},
{
"epoch": 0.548683928953563,
"grad_norm": 1.578125,
"learning_rate": 2.1793377946394446e-05,
"loss": 1.5337,
"step": 641
},
{
"epoch": 0.5495399101219773,
"grad_norm": 1.5859375,
"learning_rate": 2.1725540671155758e-05,
"loss": 1.3779,
"step": 642
},
{
"epoch": 0.5503958912903916,
"grad_norm": 1.6484375,
"learning_rate": 2.165772791788379e-05,
"loss": 1.2943,
"step": 643
},
{
"epoch": 0.5512518724588059,
"grad_norm": 1.515625,
"learning_rate": 2.1589940194418748e-05,
"loss": 1.4558,
"step": 644
},
{
"epoch": 0.5521078536272201,
"grad_norm": 1.4921875,
"learning_rate": 2.1522178008413377e-05,
"loss": 1.3845,
"step": 645
},
{
"epoch": 0.5529638347956345,
"grad_norm": 1.6171875,
"learning_rate": 2.1454441867329203e-05,
"loss": 1.4121,
"step": 646
},
{
"epoch": 0.5538198159640488,
"grad_norm": 2.625,
"learning_rate": 2.1386732278432676e-05,
"loss": 1.3775,
"step": 647
},
{
"epoch": 0.5546757971324631,
"grad_norm": 1.734375,
"learning_rate": 2.1319049748791418e-05,
"loss": 1.3581,
"step": 648
},
{
"epoch": 0.5555317783008774,
"grad_norm": 1.8125,
"learning_rate": 2.1251394785270386e-05,
"loss": 1.5385,
"step": 649
},
{
"epoch": 0.5563877594692916,
"grad_norm": 1.578125,
"learning_rate": 2.1183767894528136e-05,
"loss": 1.4733,
"step": 650
},
{
"epoch": 0.557243740637706,
"grad_norm": 1.4921875,
"learning_rate": 2.1116169583012965e-05,
"loss": 1.3986,
"step": 651
},
{
"epoch": 0.5580997218061202,
"grad_norm": 1.4140625,
"learning_rate": 2.1048600356959132e-05,
"loss": 1.3114,
"step": 652
},
{
"epoch": 0.5589557029745346,
"grad_norm": 1.75,
"learning_rate": 2.0981060722383127e-05,
"loss": 1.33,
"step": 653
},
{
"epoch": 0.5598116841429489,
"grad_norm": 1.546875,
"learning_rate": 2.0913551185079764e-05,
"loss": 1.4388,
"step": 654
},
{
"epoch": 0.5606676653113631,
"grad_norm": 1.8046875,
"learning_rate": 2.084607225061853e-05,
"loss": 1.6617,
"step": 655
},
{
"epoch": 0.5615236464797775,
"grad_norm": 1.53125,
"learning_rate": 2.077862442433968e-05,
"loss": 1.3882,
"step": 656
},
{
"epoch": 0.5623796276481917,
"grad_norm": 1.5546875,
"learning_rate": 2.071120821135054e-05,
"loss": 1.2734,
"step": 657
},
{
"epoch": 0.5632356088166061,
"grad_norm": 1.4765625,
"learning_rate": 2.064382411652168e-05,
"loss": 1.4545,
"step": 658
},
{
"epoch": 0.5640915899850203,
"grad_norm": 1.6328125,
"learning_rate": 2.057647264448313e-05,
"loss": 1.4795,
"step": 659
},
{
"epoch": 0.5649475711534346,
"grad_norm": 1.65625,
"learning_rate": 2.050915429962062e-05,
"loss": 1.5849,
"step": 660
},
{
"epoch": 0.5658035523218489,
"grad_norm": 1.6796875,
"learning_rate": 2.0441869586071783e-05,
"loss": 1.5012,
"step": 661
},
{
"epoch": 0.5666595334902632,
"grad_norm": 1.6875,
"learning_rate": 2.037461900772242e-05,
"loss": 1.5541,
"step": 662
},
{
"epoch": 0.5675155146586776,
"grad_norm": 1.65625,
"learning_rate": 2.0307403068202676e-05,
"loss": 1.4741,
"step": 663
},
{
"epoch": 0.5683714958270918,
"grad_norm": 1.546875,
"learning_rate": 2.0240222270883288e-05,
"loss": 1.5354,
"step": 664
},
{
"epoch": 0.5692274769955061,
"grad_norm": 1.546875,
"learning_rate": 2.0173077118871844e-05,
"loss": 1.4909,
"step": 665
},
{
"epoch": 0.5700834581639204,
"grad_norm": 1.5390625,
"learning_rate": 2.0105968115008954e-05,
"loss": 1.5927,
"step": 666
},
{
"epoch": 0.5709394393323347,
"grad_norm": 1.6171875,
"learning_rate": 2.003889576186455e-05,
"loss": 1.4568,
"step": 667
},
{
"epoch": 0.5717954205007489,
"grad_norm": 1.8046875,
"learning_rate": 1.997186056173406e-05,
"loss": 1.4905,
"step": 668
},
{
"epoch": 0.5726514016691633,
"grad_norm": 1.5390625,
"learning_rate": 1.9904863016634723e-05,
"loss": 1.5317,
"step": 669
},
{
"epoch": 0.5735073828375776,
"grad_norm": 1.65625,
"learning_rate": 1.983790362830174e-05,
"loss": 1.4985,
"step": 670
},
{
"epoch": 0.5743633640059919,
"grad_norm": 1.421875,
"learning_rate": 1.977098289818459e-05,
"loss": 1.4502,
"step": 671
},
{
"epoch": 0.5752193451744062,
"grad_norm": 1.5625,
"learning_rate": 1.970410132744322e-05,
"loss": 1.3937,
"step": 672
},
{
"epoch": 0.5760753263428204,
"grad_norm": 1.484375,
"learning_rate": 1.9637259416944352e-05,
"loss": 1.3821,
"step": 673
},
{
"epoch": 0.5769313075112348,
"grad_norm": 1.5234375,
"learning_rate": 1.9570457667257686e-05,
"loss": 1.4048,
"step": 674
},
{
"epoch": 0.577787288679649,
"grad_norm": 1.5703125,
"learning_rate": 1.950369657865213e-05,
"loss": 1.3841,
"step": 675
},
{
"epoch": 0.5786432698480634,
"grad_norm": 1.53125,
"learning_rate": 1.9436976651092144e-05,
"loss": 1.4192,
"step": 676
},
{
"epoch": 0.5794992510164776,
"grad_norm": 1.5,
"learning_rate": 1.937029838423389e-05,
"loss": 1.2927,
"step": 677
},
{
"epoch": 0.5803552321848919,
"grad_norm": 1.515625,
"learning_rate": 1.9303662277421568e-05,
"loss": 1.5403,
"step": 678
},
{
"epoch": 0.5812112133533063,
"grad_norm": 1.390625,
"learning_rate": 1.923706882968362e-05,
"loss": 1.3693,
"step": 679
},
{
"epoch": 0.5820671945217205,
"grad_norm": 1.4453125,
"learning_rate": 1.917051853972906e-05,
"loss": 1.3371,
"step": 680
},
{
"epoch": 0.5829231756901349,
"grad_norm": 1.5703125,
"learning_rate": 1.910401190594367e-05,
"loss": 1.4528,
"step": 681
},
{
"epoch": 0.5837791568585491,
"grad_norm": 1.5703125,
"learning_rate": 1.9037549426386302e-05,
"loss": 1.4057,
"step": 682
},
{
"epoch": 0.5846351380269634,
"grad_norm": 1.78125,
"learning_rate": 1.8971131598785148e-05,
"loss": 1.4727,
"step": 683
},
{
"epoch": 0.5854911191953777,
"grad_norm": 1.5625,
"learning_rate": 1.8904758920533988e-05,
"loss": 1.4969,
"step": 684
},
{
"epoch": 0.586347100363792,
"grad_norm": 1.4140625,
"learning_rate": 1.8838431888688527e-05,
"loss": 1.3984,
"step": 685
},
{
"epoch": 0.5872030815322062,
"grad_norm": 1.703125,
"learning_rate": 1.8772150999962587e-05,
"loss": 1.4929,
"step": 686
},
{
"epoch": 0.5880590627006206,
"grad_norm": 1.46875,
"learning_rate": 1.870591675072446e-05,
"loss": 1.3202,
"step": 687
},
{
"epoch": 0.5889150438690349,
"grad_norm": 1.5,
"learning_rate": 1.863972963699314e-05,
"loss": 1.5529,
"step": 688
},
{
"epoch": 0.5897710250374492,
"grad_norm": 1.6640625,
"learning_rate": 1.857359015443465e-05,
"loss": 1.4185,
"step": 689
},
{
"epoch": 0.5906270062058635,
"grad_norm": 1.5234375,
"learning_rate": 1.8507498798358297e-05,
"loss": 1.4122,
"step": 690
},
{
"epoch": 0.5914829873742777,
"grad_norm": 1.8203125,
"learning_rate": 1.844145606371297e-05,
"loss": 1.3178,
"step": 691
},
{
"epoch": 0.5923389685426921,
"grad_norm": 1.640625,
"learning_rate": 1.8375462445083464e-05,
"loss": 1.4875,
"step": 692
},
{
"epoch": 0.5931949497111063,
"grad_norm": 1.5390625,
"learning_rate": 1.830951843668672e-05,
"loss": 1.443,
"step": 693
},
{
"epoch": 0.5940509308795207,
"grad_norm": 1.625,
"learning_rate": 1.8243624532368174e-05,
"loss": 1.4547,
"step": 694
},
{
"epoch": 0.594906912047935,
"grad_norm": 1.453125,
"learning_rate": 1.8177781225598032e-05,
"loss": 1.3457,
"step": 695
},
{
"epoch": 0.5957628932163492,
"grad_norm": 1.6171875,
"learning_rate": 1.811198900946759e-05,
"loss": 1.4981,
"step": 696
},
{
"epoch": 0.5966188743847636,
"grad_norm": 2.359375,
"learning_rate": 1.804624837668553e-05,
"loss": 1.5599,
"step": 697
},
{
"epoch": 0.5974748555531778,
"grad_norm": 1.4921875,
"learning_rate": 1.7980559819574223e-05,
"loss": 1.3979,
"step": 698
},
{
"epoch": 0.5983308367215922,
"grad_norm": 1.5625,
"learning_rate": 1.7914923830066074e-05,
"loss": 1.4061,
"step": 699
},
{
"epoch": 0.5991868178900064,
"grad_norm": 1.6015625,
"learning_rate": 1.784934089969979e-05,
"loss": 1.5827,
"step": 700
},
{
"epoch": 0.6000427990584207,
"grad_norm": 1.7734375,
"learning_rate": 1.7783811519616757e-05,
"loss": 1.4095,
"step": 701
},
{
"epoch": 0.600898780226835,
"grad_norm": 1.5,
"learning_rate": 1.7718336180557288e-05,
"loss": 1.3583,
"step": 702
},
{
"epoch": 0.600898780226835,
"eval_loss": 1.5267729759216309,
"eval_runtime": 21.3333,
"eval_samples_per_second": 18.281,
"eval_steps_per_second": 18.281,
"step": 702
},
{
"epoch": 0.6017547613952493,
"grad_norm": 1.4296875,
"learning_rate": 1.7652915372857035e-05,
"loss": 1.4024,
"step": 703
},
{
"epoch": 0.6026107425636636,
"grad_norm": 1.4375,
"learning_rate": 1.7587549586443252e-05,
"loss": 1.349,
"step": 704
},
{
"epoch": 0.6034667237320779,
"grad_norm": 1.6484375,
"learning_rate": 1.7522239310831134e-05,
"loss": 1.5471,
"step": 705
},
{
"epoch": 0.6043227049004922,
"grad_norm": 1.828125,
"learning_rate": 1.7456985035120193e-05,
"loss": 1.4457,
"step": 706
},
{
"epoch": 0.6051786860689065,
"grad_norm": 1.671875,
"learning_rate": 1.7391787247990538e-05,
"loss": 1.2629,
"step": 707
},
{
"epoch": 0.6060346672373208,
"grad_norm": 1.5546875,
"learning_rate": 1.732664643769926e-05,
"loss": 1.4819,
"step": 708
},
{
"epoch": 0.606890648405735,
"grad_norm": 1.9296875,
"learning_rate": 1.726156309207674e-05,
"loss": 1.4687,
"step": 709
},
{
"epoch": 0.6077466295741494,
"grad_norm": 1.5078125,
"learning_rate": 1.7196537698523052e-05,
"loss": 1.4168,
"step": 710
},
{
"epoch": 0.6086026107425636,
"grad_norm": 1.7734375,
"learning_rate": 1.7131570744004215e-05,
"loss": 1.4856,
"step": 711
},
{
"epoch": 0.609458591910978,
"grad_norm": 1.5078125,
"learning_rate": 1.7066662715048666e-05,
"loss": 1.4287,
"step": 712
},
{
"epoch": 0.6103145730793923,
"grad_norm": 1.578125,
"learning_rate": 1.7001814097743528e-05,
"loss": 1.5557,
"step": 713
},
{
"epoch": 0.6111705542478065,
"grad_norm": 1.6015625,
"learning_rate": 1.693702537773099e-05,
"loss": 1.4353,
"step": 714
},
{
"epoch": 0.6120265354162209,
"grad_norm": 1.6796875,
"learning_rate": 1.687229704020471e-05,
"loss": 1.5126,
"step": 715
},
{
"epoch": 0.6128825165846351,
"grad_norm": 1.6171875,
"learning_rate": 1.6807629569906112e-05,
"loss": 1.479,
"step": 716
},
{
"epoch": 0.6137384977530495,
"grad_norm": 1.78125,
"learning_rate": 1.6743023451120832e-05,
"loss": 1.4706,
"step": 717
},
{
"epoch": 0.6145944789214637,
"grad_norm": 1.609375,
"learning_rate": 1.6678479167675006e-05,
"loss": 1.6114,
"step": 718
},
{
"epoch": 0.615450460089878,
"grad_norm": 1.625,
"learning_rate": 1.6613997202931746e-05,
"loss": 1.4916,
"step": 719
},
{
"epoch": 0.6163064412582924,
"grad_norm": 1.5703125,
"learning_rate": 1.6549578039787436e-05,
"loss": 1.3918,
"step": 720
},
{
"epoch": 0.6171624224267066,
"grad_norm": 1.4296875,
"learning_rate": 1.6485222160668146e-05,
"loss": 1.3791,
"step": 721
},
{
"epoch": 0.618018403595121,
"grad_norm": 1.5390625,
"learning_rate": 1.642093004752605e-05,
"loss": 1.5026,
"step": 722
},
{
"epoch": 0.6188743847635352,
"grad_norm": 1.640625,
"learning_rate": 1.635670218183575e-05,
"loss": 1.4059,
"step": 723
},
{
"epoch": 0.6197303659319495,
"grad_norm": 1.4765625,
"learning_rate": 1.629253904459073e-05,
"loss": 1.4202,
"step": 724
},
{
"epoch": 0.6205863471003638,
"grad_norm": 1.5,
"learning_rate": 1.622844111629972e-05,
"loss": 1.4348,
"step": 725
},
{
"epoch": 0.6214423282687781,
"grad_norm": 1.4921875,
"learning_rate": 1.616440887698313e-05,
"loss": 1.4223,
"step": 726
},
{
"epoch": 0.6222983094371923,
"grad_norm": 1.6015625,
"learning_rate": 1.6100442806169422e-05,
"loss": 1.4637,
"step": 727
},
{
"epoch": 0.6231542906056067,
"grad_norm": 1.5390625,
"learning_rate": 1.6036543382891512e-05,
"loss": 1.3871,
"step": 728
},
{
"epoch": 0.624010271774021,
"grad_norm": 1.5,
"learning_rate": 1.597271108568324e-05,
"loss": 1.5021,
"step": 729
},
{
"epoch": 0.6248662529424353,
"grad_norm": 1.6953125,
"learning_rate": 1.5908946392575714e-05,
"loss": 1.628,
"step": 730
},
{
"epoch": 0.6257222341108496,
"grad_norm": 1.546875,
"learning_rate": 1.5845249781093786e-05,
"loss": 1.4596,
"step": 731
},
{
"epoch": 0.6265782152792638,
"grad_norm": 1.46875,
"learning_rate": 1.578162172825244e-05,
"loss": 1.1683,
"step": 732
},
{
"epoch": 0.6274341964476782,
"grad_norm": 1.7265625,
"learning_rate": 1.5718062710553253e-05,
"loss": 1.3545,
"step": 733
},
{
"epoch": 0.6282901776160924,
"grad_norm": 1.5546875,
"learning_rate": 1.5654573203980784e-05,
"loss": 1.3087,
"step": 734
},
{
"epoch": 0.6291461587845067,
"grad_norm": 1.515625,
"learning_rate": 1.5591153683999043e-05,
"loss": 1.3387,
"step": 735
},
{
"epoch": 0.630002139952921,
"grad_norm": 1.4296875,
"learning_rate": 1.5527804625547938e-05,
"loss": 1.3403,
"step": 736
},
{
"epoch": 0.6308581211213353,
"grad_norm": 1.421875,
"learning_rate": 1.5464526503039666e-05,
"loss": 1.4556,
"step": 737
},
{
"epoch": 0.6317141022897497,
"grad_norm": 2.109375,
"learning_rate": 1.540131979035523e-05,
"loss": 1.3776,
"step": 738
},
{
"epoch": 0.6325700834581639,
"grad_norm": 1.5703125,
"learning_rate": 1.5338184960840824e-05,
"loss": 1.3059,
"step": 739
},
{
"epoch": 0.6334260646265782,
"grad_norm": 1.703125,
"learning_rate": 1.5275122487304335e-05,
"loss": 1.5742,
"step": 740
},
{
"epoch": 0.6342820457949925,
"grad_norm": 1.6015625,
"learning_rate": 1.5212132842011779e-05,
"loss": 1.4275,
"step": 741
},
{
"epoch": 0.6351380269634068,
"grad_norm": 1.5234375,
"learning_rate": 1.5149216496683787e-05,
"loss": 1.489,
"step": 742
},
{
"epoch": 0.635994008131821,
"grad_norm": 1.4609375,
"learning_rate": 1.5086373922492048e-05,
"loss": 1.4186,
"step": 743
},
{
"epoch": 0.6368499893002354,
"grad_norm": 1.7265625,
"learning_rate": 1.5023605590055767e-05,
"loss": 1.4414,
"step": 744
},
{
"epoch": 0.6377059704686497,
"grad_norm": 1.65625,
"learning_rate": 1.4960911969438213e-05,
"loss": 1.3893,
"step": 745
},
{
"epoch": 0.638561951637064,
"grad_norm": 1.6640625,
"learning_rate": 1.4898293530143095e-05,
"loss": 1.4831,
"step": 746
},
{
"epoch": 0.6394179328054783,
"grad_norm": 1.484375,
"learning_rate": 1.4835750741111138e-05,
"loss": 1.4675,
"step": 747
},
{
"epoch": 0.6402739139738926,
"grad_norm": 1.65625,
"learning_rate": 1.4773284070716503e-05,
"loss": 1.4084,
"step": 748
},
{
"epoch": 0.6411298951423069,
"grad_norm": 1.484375,
"learning_rate": 1.4710893986763347e-05,
"loss": 1.4119,
"step": 749
},
{
"epoch": 0.6419858763107211,
"grad_norm": 1.484375,
"learning_rate": 1.464858095648224e-05,
"loss": 1.464,
"step": 750
},
{
"epoch": 0.6428418574791355,
"grad_norm": 1.6796875,
"learning_rate": 1.4586345446526733e-05,
"loss": 1.3932,
"step": 751
},
{
"epoch": 0.6436978386475497,
"grad_norm": 1.859375,
"learning_rate": 1.4524187922969839e-05,
"loss": 1.4852,
"step": 752
},
{
"epoch": 0.644553819815964,
"grad_norm": 1.6875,
"learning_rate": 1.4462108851300523e-05,
"loss": 1.3278,
"step": 753
},
{
"epoch": 0.6454098009843784,
"grad_norm": 1.359375,
"learning_rate": 1.4400108696420264e-05,
"loss": 1.3441,
"step": 754
},
{
"epoch": 0.6462657821527926,
"grad_norm": 1.5,
"learning_rate": 1.4338187922639507e-05,
"loss": 1.3425,
"step": 755
},
{
"epoch": 0.647121763321207,
"grad_norm": 1.5703125,
"learning_rate": 1.4276346993674266e-05,
"loss": 1.381,
"step": 756
},
{
"epoch": 0.6479777444896212,
"grad_norm": 1.6484375,
"learning_rate": 1.4214586372642563e-05,
"loss": 1.4587,
"step": 757
},
{
"epoch": 0.6488337256580355,
"grad_norm": 1.53125,
"learning_rate": 1.4152906522061048e-05,
"loss": 1.3396,
"step": 758
},
{
"epoch": 0.6496897068264498,
"grad_norm": 1.7578125,
"learning_rate": 1.4091307903841466e-05,
"loss": 1.3532,
"step": 759
},
{
"epoch": 0.6505456879948641,
"grad_norm": 1.5,
"learning_rate": 1.4029790979287216e-05,
"loss": 1.3586,
"step": 760
},
{
"epoch": 0.6514016691632785,
"grad_norm": 1.5390625,
"learning_rate": 1.3968356209089944e-05,
"loss": 1.4067,
"step": 761
},
{
"epoch": 0.6522576503316927,
"grad_norm": 1.5234375,
"learning_rate": 1.3907004053326006e-05,
"loss": 1.4696,
"step": 762
},
{
"epoch": 0.653113631500107,
"grad_norm": 1.46875,
"learning_rate": 1.3845734971453114e-05,
"loss": 1.4284,
"step": 763
},
{
"epoch": 0.6539696126685213,
"grad_norm": 1.46875,
"learning_rate": 1.3784549422306808e-05,
"loss": 1.4767,
"step": 764
},
{
"epoch": 0.6548255938369356,
"grad_norm": 1.7734375,
"learning_rate": 1.3723447864097105e-05,
"loss": 1.579,
"step": 765
},
{
"epoch": 0.6556815750053498,
"grad_norm": 1.453125,
"learning_rate": 1.3662430754405004e-05,
"loss": 1.3239,
"step": 766
},
{
"epoch": 0.6565375561737642,
"grad_norm": 1.4296875,
"learning_rate": 1.360149855017906e-05,
"loss": 1.1992,
"step": 767
},
{
"epoch": 0.6573935373421784,
"grad_norm": 1.5546875,
"learning_rate": 1.3540651707732035e-05,
"loss": 1.354,
"step": 768
},
{
"epoch": 0.6582495185105928,
"grad_norm": 1.4765625,
"learning_rate": 1.3479890682737379e-05,
"loss": 1.4431,
"step": 769
},
{
"epoch": 0.6591054996790071,
"grad_norm": 1.5703125,
"learning_rate": 1.3419215930225899e-05,
"loss": 1.3813,
"step": 770
},
{
"epoch": 0.6599614808474213,
"grad_norm": 1.5390625,
"learning_rate": 1.3358627904582307e-05,
"loss": 1.3639,
"step": 771
},
{
"epoch": 0.6608174620158357,
"grad_norm": 1.609375,
"learning_rate": 1.3298127059541828e-05,
"loss": 1.3568,
"step": 772
},
{
"epoch": 0.6616734431842499,
"grad_norm": 1.6328125,
"learning_rate": 1.3237713848186797e-05,
"loss": 1.4048,
"step": 773
},
{
"epoch": 0.6625294243526643,
"grad_norm": 1.546875,
"learning_rate": 1.317738872294329e-05,
"loss": 1.4503,
"step": 774
},
{
"epoch": 0.6633854055210785,
"grad_norm": 1.6640625,
"learning_rate": 1.311715213557772e-05,
"loss": 1.3446,
"step": 775
},
{
"epoch": 0.6642413866894928,
"grad_norm": 1.6796875,
"learning_rate": 1.3057004537193423e-05,
"loss": 1.2524,
"step": 776
},
{
"epoch": 0.6650973678579071,
"grad_norm": 1.5625,
"learning_rate": 1.2996946378227352e-05,
"loss": 1.5227,
"step": 777
},
{
"epoch": 0.6659533490263214,
"grad_norm": 1.515625,
"learning_rate": 1.2936978108446624e-05,
"loss": 1.4289,
"step": 778
},
{
"epoch": 0.6668093301947358,
"grad_norm": 1.75,
"learning_rate": 1.28771001769452e-05,
"loss": 1.6197,
"step": 779
},
{
"epoch": 0.66766531136315,
"grad_norm": 1.5078125,
"learning_rate": 1.2817313032140505e-05,
"loss": 1.4775,
"step": 780
},
{
"epoch": 0.6685212925315643,
"grad_norm": 1.4609375,
"learning_rate": 1.2757617121770093e-05,
"loss": 1.4731,
"step": 781
},
{
"epoch": 0.6693772736999786,
"grad_norm": 1.5703125,
"learning_rate": 1.2698012892888272e-05,
"loss": 1.4356,
"step": 782
},
{
"epoch": 0.6702332548683929,
"grad_norm": 1.453125,
"learning_rate": 1.263850079186274e-05,
"loss": 1.3329,
"step": 783
},
{
"epoch": 0.6710892360368071,
"grad_norm": 1.4453125,
"learning_rate": 1.257908126437129e-05,
"loss": 1.4069,
"step": 784
},
{
"epoch": 0.6719452172052215,
"grad_norm": 1.734375,
"learning_rate": 1.2519754755398422e-05,
"loss": 1.501,
"step": 785
},
{
"epoch": 0.6728011983736358,
"grad_norm": 1.7265625,
"learning_rate": 1.2460521709232043e-05,
"loss": 1.5482,
"step": 786
},
{
"epoch": 0.6736571795420501,
"grad_norm": 1.609375,
"learning_rate": 1.2401382569460119e-05,
"loss": 1.3473,
"step": 787
},
{
"epoch": 0.6745131607104644,
"grad_norm": 1.5546875,
"learning_rate": 1.2342337778967384e-05,
"loss": 1.4373,
"step": 788
},
{
"epoch": 0.6753691418788786,
"grad_norm": 1.4296875,
"learning_rate": 1.2283387779932005e-05,
"loss": 1.4588,
"step": 789
},
{
"epoch": 0.676225123047293,
"grad_norm": 1.6171875,
"learning_rate": 1.2224533013822238e-05,
"loss": 1.2549,
"step": 790
},
{
"epoch": 0.6770811042157072,
"grad_norm": 1.546875,
"learning_rate": 1.216577392139319e-05,
"loss": 1.4916,
"step": 791
},
{
"epoch": 0.6779370853841216,
"grad_norm": 1.5390625,
"learning_rate": 1.2107110942683459e-05,
"loss": 1.4571,
"step": 792
},
{
"epoch": 0.6787930665525358,
"grad_norm": 1.6953125,
"learning_rate": 1.2048544517011862e-05,
"loss": 1.4943,
"step": 793
},
{
"epoch": 0.6796490477209501,
"grad_norm": 1.5,
"learning_rate": 1.1990075082974139e-05,
"loss": 1.3433,
"step": 794
},
{
"epoch": 0.6805050288893645,
"grad_norm": 1.6484375,
"learning_rate": 1.1931703078439704e-05,
"loss": 1.5043,
"step": 795
},
{
"epoch": 0.6813610100577787,
"grad_norm": 1.7890625,
"learning_rate": 1.1873428940548292e-05,
"loss": 1.5344,
"step": 796
},
{
"epoch": 0.6822169912261931,
"grad_norm": 1.453125,
"learning_rate": 1.181525310570677e-05,
"loss": 1.4948,
"step": 797
},
{
"epoch": 0.6830729723946073,
"grad_norm": 1.3984375,
"learning_rate": 1.1757176009585793e-05,
"loss": 1.4303,
"step": 798
},
{
"epoch": 0.6839289535630216,
"grad_norm": 1.6875,
"learning_rate": 1.1699198087116589e-05,
"loss": 1.4565,
"step": 799
},
{
"epoch": 0.6847849347314359,
"grad_norm": 1.5078125,
"learning_rate": 1.1641319772487699e-05,
"loss": 1.5477,
"step": 800
},
{
"epoch": 0.6856409158998502,
"grad_norm": 1.515625,
"learning_rate": 1.158354149914169e-05,
"loss": 1.4628,
"step": 801
},
{
"epoch": 0.6864968970682644,
"grad_norm": 1.578125,
"learning_rate": 1.1525863699771966e-05,
"loss": 1.5269,
"step": 802
},
{
"epoch": 0.6873528782366788,
"grad_norm": 1.5078125,
"learning_rate": 1.1468286806319462e-05,
"loss": 1.355,
"step": 803
},
{
"epoch": 0.6882088594050931,
"grad_norm": 1.671875,
"learning_rate": 1.1410811249969475e-05,
"loss": 1.531,
"step": 804
},
{
"epoch": 0.6890648405735074,
"grad_norm": 1.609375,
"learning_rate": 1.1353437461148377e-05,
"loss": 1.596,
"step": 805
},
{
"epoch": 0.6899208217419217,
"grad_norm": 1.515625,
"learning_rate": 1.129616586952042e-05,
"loss": 1.2953,
"step": 806
},
{
"epoch": 0.6907768029103359,
"grad_norm": 1.4453125,
"learning_rate": 1.1238996903984537e-05,
"loss": 1.2693,
"step": 807
},
{
"epoch": 0.6916327840787503,
"grad_norm": 1.5078125,
"learning_rate": 1.1181930992671078e-05,
"loss": 1.176,
"step": 808
},
{
"epoch": 0.6924887652471645,
"grad_norm": 1.640625,
"learning_rate": 1.112496856293867e-05,
"loss": 1.3185,
"step": 809
},
{
"epoch": 0.6933447464155789,
"grad_norm": 1.46875,
"learning_rate": 1.1068110041370938e-05,
"loss": 1.4027,
"step": 810
},
{
"epoch": 0.6942007275839932,
"grad_norm": 1.453125,
"learning_rate": 1.10113558537734e-05,
"loss": 1.3788,
"step": 811
},
{
"epoch": 0.6950567087524074,
"grad_norm": 1.6015625,
"learning_rate": 1.0954706425170197e-05,
"loss": 1.4144,
"step": 812
},
{
"epoch": 0.6959126899208218,
"grad_norm": 1.625,
"learning_rate": 1.0898162179800947e-05,
"loss": 1.5627,
"step": 813
},
{
"epoch": 0.696768671089236,
"grad_norm": 1.703125,
"learning_rate": 1.0841723541117594e-05,
"loss": 1.5203,
"step": 814
},
{
"epoch": 0.6976246522576504,
"grad_norm": 1.5625,
"learning_rate": 1.0785390931781165e-05,
"loss": 1.5606,
"step": 815
},
{
"epoch": 0.6984806334260646,
"grad_norm": 1.75,
"learning_rate": 1.0729164773658693e-05,
"loss": 1.399,
"step": 816
},
{
"epoch": 0.6993366145944789,
"grad_norm": 1.5546875,
"learning_rate": 1.0673045487819975e-05,
"loss": 1.3372,
"step": 817
},
{
"epoch": 0.7001925957628932,
"grad_norm": 1.5078125,
"learning_rate": 1.0617033494534486e-05,
"loss": 1.3698,
"step": 818
},
{
"epoch": 0.7010485769313075,
"grad_norm": 1.46875,
"learning_rate": 1.0561129213268187e-05,
"loss": 1.4297,
"step": 819
},
{
"epoch": 0.7010485769313075,
"eval_loss": 1.5197569131851196,
"eval_runtime": 21.3185,
"eval_samples_per_second": 18.294,
"eval_steps_per_second": 18.294,
"step": 819
},
{
"epoch": 0.7019045580997219,
"grad_norm": 1.5390625,
"learning_rate": 1.0505333062680383e-05,
"loss": 1.4227,
"step": 820
},
{
"epoch": 0.7027605392681361,
"grad_norm": 1.5859375,
"learning_rate": 1.0449645460620649e-05,
"loss": 1.3861,
"step": 821
},
{
"epoch": 0.7036165204365504,
"grad_norm": 1.578125,
"learning_rate": 1.0394066824125603e-05,
"loss": 1.4062,
"step": 822
},
{
"epoch": 0.7044725016049647,
"grad_norm": 1.5546875,
"learning_rate": 1.0338597569415877e-05,
"loss": 1.3354,
"step": 823
},
{
"epoch": 0.705328482773379,
"grad_norm": 1.5625,
"learning_rate": 1.028323811189293e-05,
"loss": 1.4555,
"step": 824
},
{
"epoch": 0.7061844639417932,
"grad_norm": 1.484375,
"learning_rate": 1.0227988866135996e-05,
"loss": 1.2839,
"step": 825
},
{
"epoch": 0.7070404451102076,
"grad_norm": 1.5234375,
"learning_rate": 1.0172850245898893e-05,
"loss": 1.5304,
"step": 826
},
{
"epoch": 0.7078964262786218,
"grad_norm": 1.6875,
"learning_rate": 1.0117822664107038e-05,
"loss": 1.6997,
"step": 827
},
{
"epoch": 0.7087524074470362,
"grad_norm": 1.5234375,
"learning_rate": 1.0062906532854283e-05,
"loss": 1.3367,
"step": 828
},
{
"epoch": 0.7096083886154505,
"grad_norm": 1.5234375,
"learning_rate": 1.000810226339981e-05,
"loss": 1.3577,
"step": 829
},
{
"epoch": 0.7104643697838647,
"grad_norm": 1.7265625,
"learning_rate": 9.95341026616513e-06,
"loss": 1.5752,
"step": 830
},
{
"epoch": 0.7113203509522791,
"grad_norm": 1.6640625,
"learning_rate": 9.898830950730933e-06,
"loss": 1.5784,
"step": 831
},
{
"epoch": 0.7121763321206933,
"grad_norm": 1.546875,
"learning_rate": 9.844364725834057e-06,
"loss": 1.527,
"step": 832
},
{
"epoch": 0.7130323132891077,
"grad_norm": 1.5390625,
"learning_rate": 9.790011999364413e-06,
"loss": 1.5338,
"step": 833
},
{
"epoch": 0.7138882944575219,
"grad_norm": 1.6015625,
"learning_rate": 9.735773178361964e-06,
"loss": 1.3994,
"step": 834
},
{
"epoch": 0.7147442756259362,
"grad_norm": 1.6953125,
"learning_rate": 9.681648669013619e-06,
"loss": 1.4432,
"step": 835
},
{
"epoch": 0.7156002567943506,
"grad_norm": 1.40625,
"learning_rate": 9.627638876650243e-06,
"loss": 1.3741,
"step": 836
},
{
"epoch": 0.7164562379627648,
"grad_norm": 1.6171875,
"learning_rate": 9.573744205743612e-06,
"loss": 1.3791,
"step": 837
},
{
"epoch": 0.7173122191311792,
"grad_norm": 1.53125,
"learning_rate": 9.519965059903349e-06,
"loss": 1.4102,
"step": 838
},
{
"epoch": 0.7181682002995934,
"grad_norm": 1.421875,
"learning_rate": 9.46630184187393e-06,
"loss": 1.3081,
"step": 839
},
{
"epoch": 0.7190241814680077,
"grad_norm": 1.4765625,
"learning_rate": 9.412754953531663e-06,
"loss": 1.3067,
"step": 840
},
{
"epoch": 0.719880162636422,
"grad_norm": 1.5,
"learning_rate": 9.359324795881708e-06,
"loss": 1.3967,
"step": 841
},
{
"epoch": 0.7207361438048363,
"grad_norm": 1.4765625,
"learning_rate": 9.306011769054998e-06,
"loss": 1.3527,
"step": 842
},
{
"epoch": 0.7215921249732505,
"grad_norm": 1.5234375,
"learning_rate": 9.252816272305329e-06,
"loss": 1.4973,
"step": 843
},
{
"epoch": 0.7224481061416649,
"grad_norm": 1.5,
"learning_rate": 9.199738704006321e-06,
"loss": 1.3451,
"step": 844
},
{
"epoch": 0.7233040873100792,
"grad_norm": 1.453125,
"learning_rate": 9.146779461648436e-06,
"loss": 1.3985,
"step": 845
},
{
"epoch": 0.7241600684784935,
"grad_norm": 1.5625,
"learning_rate": 9.09393894183601e-06,
"loss": 1.5013,
"step": 846
},
{
"epoch": 0.7250160496469078,
"grad_norm": 1.6484375,
"learning_rate": 9.041217540284277e-06,
"loss": 1.4524,
"step": 847
},
{
"epoch": 0.725872030815322,
"grad_norm": 1.5703125,
"learning_rate": 8.98861565181644e-06,
"loss": 1.4127,
"step": 848
},
{
"epoch": 0.7267280119837364,
"grad_norm": 1.6796875,
"learning_rate": 8.936133670360644e-06,
"loss": 1.5011,
"step": 849
},
{
"epoch": 0.7275839931521506,
"grad_norm": 1.59375,
"learning_rate": 8.883771988947099e-06,
"loss": 1.4038,
"step": 850
},
{
"epoch": 0.728439974320565,
"grad_norm": 1.578125,
"learning_rate": 8.831530999705104e-06,
"loss": 1.4896,
"step": 851
},
{
"epoch": 0.7292959554889793,
"grad_norm": 1.6171875,
"learning_rate": 8.77941109386009e-06,
"loss": 1.3577,
"step": 852
},
{
"epoch": 0.7301519366573935,
"grad_norm": 1.5,
"learning_rate": 8.727412661730724e-06,
"loss": 1.3243,
"step": 853
},
{
"epoch": 0.7310079178258079,
"grad_norm": 1.6484375,
"learning_rate": 8.675536092725966e-06,
"loss": 1.482,
"step": 854
},
{
"epoch": 0.7318638989942221,
"grad_norm": 1.84375,
"learning_rate": 8.623781775342183e-06,
"loss": 1.5252,
"step": 855
},
{
"epoch": 0.7327198801626364,
"grad_norm": 1.71875,
"learning_rate": 8.572150097160179e-06,
"loss": 1.5078,
"step": 856
},
{
"epoch": 0.7335758613310507,
"grad_norm": 1.4296875,
"learning_rate": 8.520641444842373e-06,
"loss": 1.4596,
"step": 857
},
{
"epoch": 0.734431842499465,
"grad_norm": 1.6953125,
"learning_rate": 8.469256204129828e-06,
"loss": 1.4019,
"step": 858
},
{
"epoch": 0.7352878236678793,
"grad_norm": 1.5078125,
"learning_rate": 8.417994759839401e-06,
"loss": 1.3862,
"step": 859
},
{
"epoch": 0.7361438048362936,
"grad_norm": 1.78125,
"learning_rate": 8.36685749586087e-06,
"loss": 1.5438,
"step": 860
},
{
"epoch": 0.736999786004708,
"grad_norm": 1.5234375,
"learning_rate": 8.315844795154024e-06,
"loss": 1.1669,
"step": 861
},
{
"epoch": 0.7378557671731222,
"grad_norm": 1.5703125,
"learning_rate": 8.264957039745836e-06,
"loss": 1.2759,
"step": 862
},
{
"epoch": 0.7387117483415365,
"grad_norm": 1.578125,
"learning_rate": 8.214194610727557e-06,
"loss": 1.3324,
"step": 863
},
{
"epoch": 0.7395677295099508,
"grad_norm": 1.640625,
"learning_rate": 8.163557888251917e-06,
"loss": 1.4036,
"step": 864
},
{
"epoch": 0.7404237106783651,
"grad_norm": 1.4296875,
"learning_rate": 8.113047251530215e-06,
"loss": 1.4018,
"step": 865
},
{
"epoch": 0.7412796918467793,
"grad_norm": 1.703125,
"learning_rate": 8.062663078829525e-06,
"loss": 1.3247,
"step": 866
},
{
"epoch": 0.7421356730151937,
"grad_norm": 1.8671875,
"learning_rate": 8.012405747469862e-06,
"loss": 1.4302,
"step": 867
},
{
"epoch": 0.7429916541836079,
"grad_norm": 1.453125,
"learning_rate": 7.96227563382132e-06,
"loss": 1.5155,
"step": 868
},
{
"epoch": 0.7438476353520223,
"grad_norm": 1.421875,
"learning_rate": 7.912273113301306e-06,
"loss": 1.4633,
"step": 869
},
{
"epoch": 0.7447036165204366,
"grad_norm": 1.46875,
"learning_rate": 7.862398560371664e-06,
"loss": 1.3607,
"step": 870
},
{
"epoch": 0.7455595976888508,
"grad_norm": 1.484375,
"learning_rate": 7.812652348535948e-06,
"loss": 1.4725,
"step": 871
},
{
"epoch": 0.7464155788572652,
"grad_norm": 1.5859375,
"learning_rate": 7.763034850336553e-06,
"loss": 1.4298,
"step": 872
},
{
"epoch": 0.7472715600256794,
"grad_norm": 1.390625,
"learning_rate": 7.713546437351965e-06,
"loss": 1.3457,
"step": 873
},
{
"epoch": 0.7481275411940937,
"grad_norm": 1.7890625,
"learning_rate": 7.66418748019396e-06,
"loss": 1.265,
"step": 874
},
{
"epoch": 0.748983522362508,
"grad_norm": 1.7890625,
"learning_rate": 7.614958348504853e-06,
"loss": 1.5109,
"step": 875
},
{
"epoch": 0.7498395035309223,
"grad_norm": 1.5703125,
"learning_rate": 7.565859410954718e-06,
"loss": 1.457,
"step": 876
},
{
"epoch": 0.7506954846993367,
"grad_norm": 1.625,
"learning_rate": 7.516891035238596e-06,
"loss": 1.6443,
"step": 877
},
{
"epoch": 0.7515514658677509,
"grad_norm": 1.4375,
"learning_rate": 7.468053588073803e-06,
"loss": 1.4027,
"step": 878
},
{
"epoch": 0.7524074470361652,
"grad_norm": 1.4765625,
"learning_rate": 7.4193474351971245e-06,
"loss": 1.4607,
"step": 879
},
{
"epoch": 0.7532634282045795,
"grad_norm": 1.578125,
"learning_rate": 7.3707729413621055e-06,
"loss": 1.4838,
"step": 880
},
{
"epoch": 0.7541194093729938,
"grad_norm": 1.546875,
"learning_rate": 7.3223304703363135e-06,
"loss": 1.309,
"step": 881
},
{
"epoch": 0.754975390541408,
"grad_norm": 1.65625,
"learning_rate": 7.274020384898628e-06,
"loss": 1.3888,
"step": 882
},
{
"epoch": 0.7558313717098224,
"grad_norm": 1.5625,
"learning_rate": 7.225843046836514e-06,
"loss": 1.649,
"step": 883
},
{
"epoch": 0.7566873528782366,
"grad_norm": 1.640625,
"learning_rate": 7.177798816943287e-06,
"loss": 1.4189,
"step": 884
},
{
"epoch": 0.757543334046651,
"grad_norm": 1.515625,
"learning_rate": 7.129888055015455e-06,
"loss": 1.4323,
"step": 885
},
{
"epoch": 0.7583993152150653,
"grad_norm": 1.4609375,
"learning_rate": 7.0821111198499795e-06,
"loss": 1.4199,
"step": 886
},
{
"epoch": 0.7592552963834795,
"grad_norm": 1.84375,
"learning_rate": 7.034468369241651e-06,
"loss": 1.6481,
"step": 887
},
{
"epoch": 0.7601112775518939,
"grad_norm": 1.625,
"learning_rate": 6.986960159980327e-06,
"loss": 1.4306,
"step": 888
},
{
"epoch": 0.7609672587203081,
"grad_norm": 1.671875,
"learning_rate": 6.939586847848334e-06,
"loss": 1.4569,
"step": 889
},
{
"epoch": 0.7618232398887225,
"grad_norm": 1.5703125,
"learning_rate": 6.892348787617769e-06,
"loss": 1.4033,
"step": 890
},
{
"epoch": 0.7626792210571367,
"grad_norm": 1.578125,
"learning_rate": 6.845246333047836e-06,
"loss": 1.5268,
"step": 891
},
{
"epoch": 0.763535202225551,
"grad_norm": 1.5,
"learning_rate": 6.79827983688221e-06,
"loss": 1.4712,
"step": 892
},
{
"epoch": 0.7643911833939653,
"grad_norm": 1.5546875,
"learning_rate": 6.751449650846389e-06,
"loss": 1.3403,
"step": 893
},
{
"epoch": 0.7652471645623796,
"grad_norm": 1.59375,
"learning_rate": 6.704756125645081e-06,
"loss": 1.3823,
"step": 894
},
{
"epoch": 0.766103145730794,
"grad_norm": 1.515625,
"learning_rate": 6.658199610959537e-06,
"loss": 1.2359,
"step": 895
},
{
"epoch": 0.7669591268992082,
"grad_norm": 1.453125,
"learning_rate": 6.611780455444979e-06,
"loss": 1.3427,
"step": 896
},
{
"epoch": 0.7678151080676225,
"grad_norm": 1.6640625,
"learning_rate": 6.565499006727938e-06,
"loss": 1.4077,
"step": 897
},
{
"epoch": 0.7686710892360368,
"grad_norm": 1.7421875,
"learning_rate": 6.51935561140371e-06,
"loss": 1.5555,
"step": 898
},
{
"epoch": 0.7695270704044511,
"grad_norm": 1.6171875,
"learning_rate": 6.4733506150337016e-06,
"loss": 1.3699,
"step": 899
},
{
"epoch": 0.7703830515728654,
"grad_norm": 1.625,
"learning_rate": 6.427484362142877e-06,
"loss": 1.3224,
"step": 900
},
{
"epoch": 0.7712390327412797,
"grad_norm": 1.7890625,
"learning_rate": 6.381757196217181e-06,
"loss": 1.5472,
"step": 901
},
{
"epoch": 0.772095013909694,
"grad_norm": 1.5234375,
"learning_rate": 6.336169459700933e-06,
"loss": 1.5253,
"step": 902
},
{
"epoch": 0.7729509950781083,
"grad_norm": 1.3984375,
"learning_rate": 6.290721493994317e-06,
"loss": 1.3984,
"step": 903
},
{
"epoch": 0.7738069762465226,
"grad_norm": 1.4921875,
"learning_rate": 6.245413639450757e-06,
"loss": 1.3538,
"step": 904
},
{
"epoch": 0.7746629574149368,
"grad_norm": 1.671875,
"learning_rate": 6.200246235374438e-06,
"loss": 1.5044,
"step": 905
},
{
"epoch": 0.7755189385833512,
"grad_norm": 1.53125,
"learning_rate": 6.155219620017708e-06,
"loss": 1.4854,
"step": 906
},
{
"epoch": 0.7763749197517654,
"grad_norm": 1.671875,
"learning_rate": 6.1103341305785655e-06,
"loss": 1.6012,
"step": 907
},
{
"epoch": 0.7772309009201798,
"grad_norm": 1.625,
"learning_rate": 6.065590103198165e-06,
"loss": 1.4091,
"step": 908
},
{
"epoch": 0.778086882088594,
"grad_norm": 1.4765625,
"learning_rate": 6.020987872958236e-06,
"loss": 1.4079,
"step": 909
},
{
"epoch": 0.7789428632570083,
"grad_norm": 1.6328125,
"learning_rate": 5.97652777387864e-06,
"loss": 1.4951,
"step": 910
},
{
"epoch": 0.7797988444254227,
"grad_norm": 1.5078125,
"learning_rate": 5.932210138914821e-06,
"loss": 1.5049,
"step": 911
},
{
"epoch": 0.7806548255938369,
"grad_norm": 1.65625,
"learning_rate": 5.888035299955325e-06,
"loss": 1.3488,
"step": 912
},
{
"epoch": 0.7815108067622513,
"grad_norm": 1.5625,
"learning_rate": 5.844003587819327e-06,
"loss": 1.5192,
"step": 913
},
{
"epoch": 0.7823667879306655,
"grad_norm": 1.6953125,
"learning_rate": 5.800115332254144e-06,
"loss": 1.549,
"step": 914
},
{
"epoch": 0.7832227690990798,
"grad_norm": 1.453125,
"learning_rate": 5.75637086193278e-06,
"loss": 1.4354,
"step": 915
},
{
"epoch": 0.7840787502674941,
"grad_norm": 1.8046875,
"learning_rate": 5.712770504451426e-06,
"loss": 1.4676,
"step": 916
},
{
"epoch": 0.7849347314359084,
"grad_norm": 1.625,
"learning_rate": 5.669314586327054e-06,
"loss": 1.5199,
"step": 917
},
{
"epoch": 0.7857907126043226,
"grad_norm": 1.515625,
"learning_rate": 5.626003432994933e-06,
"loss": 1.4853,
"step": 918
},
{
"epoch": 0.786646693772737,
"grad_norm": 1.640625,
"learning_rate": 5.582837368806224e-06,
"loss": 1.3789,
"step": 919
},
{
"epoch": 0.7875026749411513,
"grad_norm": 1.5390625,
"learning_rate": 5.539816717025515e-06,
"loss": 1.5069,
"step": 920
},
{
"epoch": 0.7883586561095656,
"grad_norm": 1.4921875,
"learning_rate": 5.496941799828443e-06,
"loss": 1.364,
"step": 921
},
{
"epoch": 0.7892146372779799,
"grad_norm": 1.7890625,
"learning_rate": 5.454212938299255e-06,
"loss": 1.4134,
"step": 922
},
{
"epoch": 0.7900706184463941,
"grad_norm": 1.5078125,
"learning_rate": 5.411630452428395e-06,
"loss": 1.4641,
"step": 923
},
{
"epoch": 0.7909265996148085,
"grad_norm": 1.7421875,
"learning_rate": 5.369194661110138e-06,
"loss": 1.2542,
"step": 924
},
{
"epoch": 0.7917825807832227,
"grad_norm": 1.4921875,
"learning_rate": 5.326905882140168e-06,
"loss": 1.5729,
"step": 925
},
{
"epoch": 0.7926385619516371,
"grad_norm": 1.5703125,
"learning_rate": 5.284764432213221e-06,
"loss": 1.5403,
"step": 926
},
{
"epoch": 0.7934945431200514,
"grad_norm": 1.5234375,
"learning_rate": 5.242770626920695e-06,
"loss": 1.418,
"step": 927
},
{
"epoch": 0.7943505242884656,
"grad_norm": 1.5703125,
"learning_rate": 5.200924780748323e-06,
"loss": 1.4128,
"step": 928
},
{
"epoch": 0.79520650545688,
"grad_norm": 1.6796875,
"learning_rate": 5.15922720707378e-06,
"loss": 1.4845,
"step": 929
},
{
"epoch": 0.7960624866252942,
"grad_norm": 1.515625,
"learning_rate": 5.117678218164338e-06,
"loss": 1.5405,
"step": 930
},
{
"epoch": 0.7969184677937086,
"grad_norm": 1.6875,
"learning_rate": 5.076278125174555e-06,
"loss": 1.361,
"step": 931
},
{
"epoch": 0.7977744489621228,
"grad_norm": 1.7265625,
"learning_rate": 5.0350272381439244e-06,
"loss": 1.5649,
"step": 932
},
{
"epoch": 0.7986304301305371,
"grad_norm": 1.4609375,
"learning_rate": 4.993925865994548e-06,
"loss": 1.388,
"step": 933
},
{
"epoch": 0.7994864112989514,
"grad_norm": 1.6953125,
"learning_rate": 4.952974316528833e-06,
"loss": 1.5386,
"step": 934
},
{
"epoch": 0.8003423924673657,
"grad_norm": 1.40625,
"learning_rate": 4.912172896427205e-06,
"loss": 1.2794,
"step": 935
},
{
"epoch": 0.8011983736357801,
"grad_norm": 1.703125,
"learning_rate": 4.8715219112457635e-06,
"loss": 1.7561,
"step": 936
},
{
"epoch": 0.8011983736357801,
"eval_loss": 1.516785979270935,
"eval_runtime": 21.3507,
"eval_samples_per_second": 18.266,
"eval_steps_per_second": 18.266,
"step": 936
},
{
"epoch": 0.8020543548041943,
"grad_norm": 1.7109375,
"learning_rate": 4.8310216654140425e-06,
"loss": 1.5413,
"step": 937
},
{
"epoch": 0.8029103359726086,
"grad_norm": 1.3671875,
"learning_rate": 4.790672462232715e-06,
"loss": 1.3485,
"step": 938
},
{
"epoch": 0.8037663171410229,
"grad_norm": 1.515625,
"learning_rate": 4.750474603871283e-06,
"loss": 1.3616,
"step": 939
},
{
"epoch": 0.8046222983094372,
"grad_norm": 1.609375,
"learning_rate": 4.710428391365887e-06,
"loss": 1.5232,
"step": 940
},
{
"epoch": 0.8054782794778514,
"grad_norm": 1.625,
"learning_rate": 4.670534124616982e-06,
"loss": 1.5764,
"step": 941
},
{
"epoch": 0.8063342606462658,
"grad_norm": 1.53125,
"learning_rate": 4.630792102387155e-06,
"loss": 1.4513,
"step": 942
},
{
"epoch": 0.8071902418146801,
"grad_norm": 1.6953125,
"learning_rate": 4.591202622298824e-06,
"loss": 1.4137,
"step": 943
},
{
"epoch": 0.8080462229830944,
"grad_norm": 1.5390625,
"learning_rate": 4.551765980832059e-06,
"loss": 1.3718,
"step": 944
},
{
"epoch": 0.8089022041515087,
"grad_norm": 1.671875,
"learning_rate": 4.512482473322341e-06,
"loss": 1.4205,
"step": 945
},
{
"epoch": 0.8097581853199229,
"grad_norm": 1.5234375,
"learning_rate": 4.473352393958338e-06,
"loss": 1.5571,
"step": 946
},
{
"epoch": 0.8106141664883373,
"grad_norm": 1.4296875,
"learning_rate": 4.4343760357797386e-06,
"loss": 1.2288,
"step": 947
},
{
"epoch": 0.8114701476567515,
"grad_norm": 1.46875,
"learning_rate": 4.3955536906750135e-06,
"loss": 1.3646,
"step": 948
},
{
"epoch": 0.8123261288251659,
"grad_norm": 1.46875,
"learning_rate": 4.356885649379269e-06,
"loss": 1.4272,
"step": 949
},
{
"epoch": 0.8131821099935801,
"grad_norm": 1.640625,
"learning_rate": 4.318372201472037e-06,
"loss": 1.4271,
"step": 950
},
{
"epoch": 0.8140380911619944,
"grad_norm": 1.46875,
"learning_rate": 4.280013635375138e-06,
"loss": 1.5182,
"step": 951
},
{
"epoch": 0.8148940723304088,
"grad_norm": 1.5390625,
"learning_rate": 4.2418102383504885e-06,
"loss": 1.3662,
"step": 952
},
{
"epoch": 0.815750053498823,
"grad_norm": 1.7265625,
"learning_rate": 4.203762296497965e-06,
"loss": 1.5375,
"step": 953
},
{
"epoch": 0.8166060346672374,
"grad_norm": 1.53125,
"learning_rate": 4.1658700947532795e-06,
"loss": 1.4522,
"step": 954
},
{
"epoch": 0.8174620158356516,
"grad_norm": 1.6015625,
"learning_rate": 4.128133916885804e-06,
"loss": 1.4576,
"step": 955
},
{
"epoch": 0.8183179970040659,
"grad_norm": 1.453125,
"learning_rate": 4.0905540454965006e-06,
"loss": 1.3513,
"step": 956
},
{
"epoch": 0.8191739781724802,
"grad_norm": 1.65625,
"learning_rate": 4.053130762015736e-06,
"loss": 1.4043,
"step": 957
},
{
"epoch": 0.8200299593408945,
"grad_norm": 1.3828125,
"learning_rate": 4.015864346701251e-06,
"loss": 1.3098,
"step": 958
},
{
"epoch": 0.8208859405093087,
"grad_norm": 1.6484375,
"learning_rate": 3.978755078635995e-06,
"loss": 1.5399,
"step": 959
},
{
"epoch": 0.8217419216777231,
"grad_norm": 1.671875,
"learning_rate": 3.941803235726069e-06,
"loss": 1.6757,
"step": 960
},
{
"epoch": 0.8225979028461374,
"grad_norm": 1.6171875,
"learning_rate": 3.90500909469865e-06,
"loss": 1.3494,
"step": 961
},
{
"epoch": 0.8234538840145517,
"grad_norm": 1.46875,
"learning_rate": 3.8683729310998926e-06,
"loss": 1.3379,
"step": 962
},
{
"epoch": 0.824309865182966,
"grad_norm": 1.625,
"learning_rate": 3.831895019292897e-06,
"loss": 1.5126,
"step": 963
},
{
"epoch": 0.8251658463513802,
"grad_norm": 1.734375,
"learning_rate": 3.7955756324556197e-06,
"loss": 1.507,
"step": 964
},
{
"epoch": 0.8260218275197946,
"grad_norm": 1.5078125,
"learning_rate": 3.7594150425788675e-06,
"loss": 1.3546,
"step": 965
},
{
"epoch": 0.8268778086882088,
"grad_norm": 1.6640625,
"learning_rate": 3.7234135204642195e-06,
"loss": 1.466,
"step": 966
},
{
"epoch": 0.8277337898566232,
"grad_norm": 1.6484375,
"learning_rate": 3.687571335722023e-06,
"loss": 1.388,
"step": 967
},
{
"epoch": 0.8285897710250375,
"grad_norm": 1.796875,
"learning_rate": 3.651888756769381e-06,
"loss": 1.4069,
"step": 968
},
{
"epoch": 0.8294457521934517,
"grad_norm": 1.6953125,
"learning_rate": 3.6163660508281154e-06,
"loss": 1.451,
"step": 969
},
{
"epoch": 0.8303017333618661,
"grad_norm": 1.59375,
"learning_rate": 3.5810034839228015e-06,
"loss": 1.4336,
"step": 970
},
{
"epoch": 0.8311577145302803,
"grad_norm": 1.765625,
"learning_rate": 3.5458013208787333e-06,
"loss": 1.4418,
"step": 971
},
{
"epoch": 0.8320136956986947,
"grad_norm": 1.5546875,
"learning_rate": 3.5107598253199758e-06,
"loss": 1.4126,
"step": 972
},
{
"epoch": 0.8328696768671089,
"grad_norm": 1.5078125,
"learning_rate": 3.4758792596673725e-06,
"loss": 1.3229,
"step": 973
},
{
"epoch": 0.8337256580355232,
"grad_norm": 1.5703125,
"learning_rate": 3.4411598851365966e-06,
"loss": 1.2822,
"step": 974
},
{
"epoch": 0.8345816392039375,
"grad_norm": 1.6328125,
"learning_rate": 3.406601961736164e-06,
"loss": 1.607,
"step": 975
},
{
"epoch": 0.8354376203723518,
"grad_norm": 1.4609375,
"learning_rate": 3.372205748265522e-06,
"loss": 1.5054,
"step": 976
},
{
"epoch": 0.8362936015407662,
"grad_norm": 1.5625,
"learning_rate": 3.337971502313095e-06,
"loss": 1.4882,
"step": 977
},
{
"epoch": 0.8371495827091804,
"grad_norm": 1.625,
"learning_rate": 3.3038994802543467e-06,
"loss": 1.5285,
"step": 978
},
{
"epoch": 0.8380055638775947,
"grad_norm": 1.671875,
"learning_rate": 3.2699899372498733e-06,
"loss": 1.4404,
"step": 979
},
{
"epoch": 0.838861545046009,
"grad_norm": 1.5625,
"learning_rate": 3.236243127243477e-06,
"loss": 1.5433,
"step": 980
},
{
"epoch": 0.8397175262144233,
"grad_norm": 1.5625,
"learning_rate": 3.202659302960301e-06,
"loss": 1.207,
"step": 981
},
{
"epoch": 0.8405735073828375,
"grad_norm": 1.40625,
"learning_rate": 3.169238715904882e-06,
"loss": 1.4336,
"step": 982
},
{
"epoch": 0.8414294885512519,
"grad_norm": 1.859375,
"learning_rate": 3.135981616359315e-06,
"loss": 1.4036,
"step": 983
},
{
"epoch": 0.8422854697196661,
"grad_norm": 1.65625,
"learning_rate": 3.1028882533813643e-06,
"loss": 1.4233,
"step": 984
},
{
"epoch": 0.8431414508880805,
"grad_norm": 1.421875,
"learning_rate": 3.0699588748025755e-06,
"loss": 1.3475,
"step": 985
},
{
"epoch": 0.8439974320564948,
"grad_norm": 1.7109375,
"learning_rate": 3.037193727226445e-06,
"loss": 1.3735,
"step": 986
},
{
"epoch": 0.844853413224909,
"grad_norm": 1.640625,
"learning_rate": 3.0045930560265666e-06,
"loss": 1.49,
"step": 987
},
{
"epoch": 0.8457093943933234,
"grad_norm": 1.6171875,
"learning_rate": 2.9721571053448053e-06,
"loss": 1.5413,
"step": 988
},
{
"epoch": 0.8465653755617376,
"grad_norm": 1.4453125,
"learning_rate": 2.9398861180894355e-06,
"loss": 1.4234,
"step": 989
},
{
"epoch": 0.847421356730152,
"grad_norm": 1.6328125,
"learning_rate": 2.9077803359333607e-06,
"loss": 1.542,
"step": 990
},
{
"epoch": 0.8482773378985662,
"grad_norm": 1.4765625,
"learning_rate": 2.8758399993122854e-06,
"loss": 1.4682,
"step": 991
},
{
"epoch": 0.8491333190669805,
"grad_norm": 1.5234375,
"learning_rate": 2.8440653474229085e-06,
"loss": 1.3124,
"step": 992
},
{
"epoch": 0.8499893002353949,
"grad_norm": 1.703125,
"learning_rate": 2.812456618221143e-06,
"loss": 1.5196,
"step": 993
},
{
"epoch": 0.8508452814038091,
"grad_norm": 1.5,
"learning_rate": 2.7810140484203188e-06,
"loss": 1.4316,
"step": 994
},
{
"epoch": 0.8517012625722234,
"grad_norm": 1.71875,
"learning_rate": 2.7497378734894497e-06,
"loss": 1.408,
"step": 995
},
{
"epoch": 0.8525572437406377,
"grad_norm": 1.5234375,
"learning_rate": 2.718628327651407e-06,
"loss": 1.4881,
"step": 996
},
{
"epoch": 0.853413224909052,
"grad_norm": 1.5546875,
"learning_rate": 2.6876856438812296e-06,
"loss": 1.4838,
"step": 997
},
{
"epoch": 0.8542692060774663,
"grad_norm": 1.5703125,
"learning_rate": 2.6569100539043325e-06,
"loss": 1.4737,
"step": 998
},
{
"epoch": 0.8551251872458806,
"grad_norm": 1.75,
"learning_rate": 2.626301788194785e-06,
"loss": 1.319,
"step": 999
},
{
"epoch": 0.8559811684142948,
"grad_norm": 1.3515625,
"learning_rate": 2.595861075973613e-06,
"loss": 1.2919,
"step": 1000
},
{
"epoch": 0.8568371495827092,
"grad_norm": 1.6796875,
"learning_rate": 2.5655881452070264e-06,
"loss": 1.5445,
"step": 1001
},
{
"epoch": 0.8576931307511235,
"grad_norm": 1.4375,
"learning_rate": 2.5354832226047705e-06,
"loss": 1.2502,
"step": 1002
},
{
"epoch": 0.8585491119195378,
"grad_norm": 1.4453125,
"learning_rate": 2.5055465336183774e-06,
"loss": 1.3512,
"step": 1003
},
{
"epoch": 0.8594050930879521,
"grad_norm": 1.5625,
"learning_rate": 2.475778302439524e-06,
"loss": 1.4999,
"step": 1004
},
{
"epoch": 0.8602610742563663,
"grad_norm": 1.4765625,
"learning_rate": 2.4461787519983127e-06,
"loss": 1.2433,
"step": 1005
},
{
"epoch": 0.8611170554247807,
"grad_norm": 1.46875,
"learning_rate": 2.416748103961625e-06,
"loss": 1.4821,
"step": 1006
},
{
"epoch": 0.8619730365931949,
"grad_norm": 1.4375,
"learning_rate": 2.3874865787314598e-06,
"loss": 1.4389,
"step": 1007
},
{
"epoch": 0.8628290177616093,
"grad_norm": 1.8671875,
"learning_rate": 2.3583943954432725e-06,
"loss": 1.6777,
"step": 1008
},
{
"epoch": 0.8636849989300235,
"grad_norm": 1.5859375,
"learning_rate": 2.3294717719643534e-06,
"loss": 1.5674,
"step": 1009
},
{
"epoch": 0.8645409800984378,
"grad_norm": 1.6953125,
"learning_rate": 2.300718924892159e-06,
"loss": 1.3248,
"step": 1010
},
{
"epoch": 0.8653969612668522,
"grad_norm": 1.6484375,
"learning_rate": 2.2721360695527437e-06,
"loss": 1.3882,
"step": 1011
},
{
"epoch": 0.8662529424352664,
"grad_norm": 1.59375,
"learning_rate": 2.243723419999097e-06,
"loss": 1.4108,
"step": 1012
},
{
"epoch": 0.8671089236036807,
"grad_norm": 1.4296875,
"learning_rate": 2.2154811890095605e-06,
"loss": 1.2796,
"step": 1013
},
{
"epoch": 0.867964904772095,
"grad_norm": 1.578125,
"learning_rate": 2.1874095880862505e-06,
"loss": 1.4911,
"step": 1014
},
{
"epoch": 0.8688208859405093,
"grad_norm": 1.5859375,
"learning_rate": 2.1595088274534436e-06,
"loss": 1.4234,
"step": 1015
},
{
"epoch": 0.8696768671089236,
"grad_norm": 1.6328125,
"learning_rate": 2.1317791160560318e-06,
"loss": 1.4759,
"step": 1016
},
{
"epoch": 0.8705328482773379,
"grad_norm": 1.5625,
"learning_rate": 2.1042206615579237e-06,
"loss": 1.5507,
"step": 1017
},
{
"epoch": 0.8713888294457522,
"grad_norm": 1.421875,
"learning_rate": 2.076833670340533e-06,
"loss": 1.3777,
"step": 1018
},
{
"epoch": 0.8722448106141665,
"grad_norm": 1.359375,
"learning_rate": 2.0496183475011894e-06,
"loss": 1.4103,
"step": 1019
},
{
"epoch": 0.8731007917825808,
"grad_norm": 1.6953125,
"learning_rate": 2.0225748968516284e-06,
"loss": 1.2965,
"step": 1020
},
{
"epoch": 0.873956772950995,
"grad_norm": 1.4375,
"learning_rate": 1.995703520916456e-06,
"loss": 1.4232,
"step": 1021
},
{
"epoch": 0.8748127541194094,
"grad_norm": 1.390625,
"learning_rate": 1.9690044209316444e-06,
"loss": 1.4387,
"step": 1022
},
{
"epoch": 0.8756687352878236,
"grad_norm": 1.828125,
"learning_rate": 1.9424777968430146e-06,
"loss": 1.4927,
"step": 1023
},
{
"epoch": 0.876524716456238,
"grad_norm": 1.4921875,
"learning_rate": 1.916123847304721e-06,
"loss": 1.6027,
"step": 1024
},
{
"epoch": 0.8773806976246522,
"grad_norm": 1.6796875,
"learning_rate": 1.8899427696778105e-06,
"loss": 1.5225,
"step": 1025
},
{
"epoch": 0.8782366787930665,
"grad_norm": 1.5078125,
"learning_rate": 1.8639347600286877e-06,
"loss": 1.4555,
"step": 1026
},
{
"epoch": 0.8790926599614809,
"grad_norm": 1.6328125,
"learning_rate": 1.8381000131277e-06,
"loss": 1.4311,
"step": 1027
},
{
"epoch": 0.8799486411298951,
"grad_norm": 1.625,
"learning_rate": 1.8124387224476347e-06,
"loss": 1.5006,
"step": 1028
},
{
"epoch": 0.8808046222983095,
"grad_norm": 1.5703125,
"learning_rate": 1.7869510801623053e-06,
"loss": 1.4779,
"step": 1029
},
{
"epoch": 0.8816606034667237,
"grad_norm": 1.4140625,
"learning_rate": 1.761637277145095e-06,
"loss": 1.2851,
"step": 1030
},
{
"epoch": 0.882516584635138,
"grad_norm": 1.5703125,
"learning_rate": 1.7364975029675184e-06,
"loss": 1.4212,
"step": 1031
},
{
"epoch": 0.8833725658035523,
"grad_norm": 1.53125,
"learning_rate": 1.7115319458978236e-06,
"loss": 1.5496,
"step": 1032
},
{
"epoch": 0.8842285469719666,
"grad_norm": 1.5390625,
"learning_rate": 1.6867407928995577e-06,
"loss": 1.4217,
"step": 1033
},
{
"epoch": 0.885084528140381,
"grad_norm": 1.6015625,
"learning_rate": 1.6621242296301964e-06,
"loss": 1.5435,
"step": 1034
},
{
"epoch": 0.8859405093087952,
"grad_norm": 1.4921875,
"learning_rate": 1.6376824404397251e-06,
"loss": 1.3545,
"step": 1035
},
{
"epoch": 0.8867964904772095,
"grad_norm": 1.609375,
"learning_rate": 1.613415608369284e-06,
"loss": 1.4856,
"step": 1036
},
{
"epoch": 0.8876524716456238,
"grad_norm": 1.453125,
"learning_rate": 1.5893239151497652e-06,
"loss": 1.3376,
"step": 1037
},
{
"epoch": 0.8885084528140381,
"grad_norm": 1.4765625,
"learning_rate": 1.5654075412004893e-06,
"loss": 1.4068,
"step": 1038
},
{
"epoch": 0.8893644339824524,
"grad_norm": 1.6328125,
"learning_rate": 1.5416666656278222e-06,
"loss": 1.3882,
"step": 1039
},
{
"epoch": 0.8902204151508667,
"grad_norm": 1.4375,
"learning_rate": 1.5181014662238508e-06,
"loss": 1.2639,
"step": 1040
},
{
"epoch": 0.8910763963192809,
"grad_norm": 1.5078125,
"learning_rate": 1.4947121194650527e-06,
"loss": 1.388,
"step": 1041
},
{
"epoch": 0.8919323774876953,
"grad_norm": 1.5,
"learning_rate": 1.471498800510962e-06,
"loss": 1.5616,
"step": 1042
},
{
"epoch": 0.8927883586561096,
"grad_norm": 1.6953125,
"learning_rate": 1.448461683202873e-06,
"loss": 1.5843,
"step": 1043
},
{
"epoch": 0.8936443398245238,
"grad_norm": 1.6328125,
"learning_rate": 1.4256009400625214e-06,
"loss": 1.4752,
"step": 1044
},
{
"epoch": 0.8945003209929382,
"grad_norm": 1.453125,
"learning_rate": 1.4029167422908107e-06,
"loss": 1.4538,
"step": 1045
},
{
"epoch": 0.8953563021613524,
"grad_norm": 1.515625,
"learning_rate": 1.3804092597665186e-06,
"loss": 1.4397,
"step": 1046
},
{
"epoch": 0.8962122833297668,
"grad_norm": 1.5703125,
"learning_rate": 1.3580786610450202e-06,
"loss": 1.3437,
"step": 1047
},
{
"epoch": 0.897068264498181,
"grad_norm": 1.3828125,
"learning_rate": 1.335925113357042e-06,
"loss": 1.3665,
"step": 1048
},
{
"epoch": 0.8979242456665953,
"grad_norm": 3.375,
"learning_rate": 1.3139487826073937e-06,
"loss": 1.2993,
"step": 1049
},
{
"epoch": 0.8987802268350096,
"grad_norm": 1.71875,
"learning_rate": 1.2921498333737375e-06,
"loss": 1.4769,
"step": 1050
},
{
"epoch": 0.8996362080034239,
"grad_norm": 1.546875,
"learning_rate": 1.2705284289053403e-06,
"loss": 1.5111,
"step": 1051
},
{
"epoch": 0.9004921891718383,
"grad_norm": 1.71875,
"learning_rate": 1.2490847311218773e-06,
"loss": 1.619,
"step": 1052
},
{
"epoch": 0.9013481703402525,
"grad_norm": 1.6328125,
"learning_rate": 1.2278189006121904e-06,
"loss": 1.6656,
"step": 1053
},
{
"epoch": 0.9013481703402525,
"eval_loss": 1.516126275062561,
"eval_runtime": 21.3286,
"eval_samples_per_second": 18.285,
"eval_steps_per_second": 18.285,
"step": 1053
},
{
"epoch": 0.9022041515086668,
"grad_norm": 1.578125,
"learning_rate": 1.2067310966330959e-06,
"loss": 1.0402,
"step": 1054
},
{
"epoch": 0.9030601326770811,
"grad_norm": 1.4609375,
"learning_rate": 1.185821477108212e-06,
"loss": 1.231,
"step": 1055
},
{
"epoch": 0.9039161138454954,
"grad_norm": 2.0,
"learning_rate": 1.1650901986267365e-06,
"loss": 1.2964,
"step": 1056
},
{
"epoch": 0.9047720950139096,
"grad_norm": 1.8359375,
"learning_rate": 1.144537416442315e-06,
"loss": 1.5158,
"step": 1057
},
{
"epoch": 0.905628076182324,
"grad_norm": 1.6015625,
"learning_rate": 1.1241632844718465e-06,
"loss": 1.5144,
"step": 1058
},
{
"epoch": 0.9064840573507383,
"grad_norm": 1.4375,
"learning_rate": 1.1039679552943493e-06,
"loss": 1.4618,
"step": 1059
},
{
"epoch": 0.9073400385191526,
"grad_norm": 1.4765625,
"learning_rate": 1.0839515801498084e-06,
"loss": 1.3926,
"step": 1060
},
{
"epoch": 0.9081960196875669,
"grad_norm": 1.609375,
"learning_rate": 1.0641143089380523e-06,
"loss": 1.454,
"step": 1061
},
{
"epoch": 0.9090520008559811,
"grad_norm": 1.46875,
"learning_rate": 1.0444562902176296e-06,
"loss": 1.349,
"step": 1062
},
{
"epoch": 0.9099079820243955,
"grad_norm": 1.5234375,
"learning_rate": 1.0249776712046744e-06,
"loss": 1.33,
"step": 1063
},
{
"epoch": 0.9107639631928097,
"grad_norm": 1.453125,
"learning_rate": 1.0056785977718447e-06,
"loss": 1.3807,
"step": 1064
},
{
"epoch": 0.9116199443612241,
"grad_norm": 1.65625,
"learning_rate": 9.865592144471886e-07,
"loss": 1.539,
"step": 1065
},
{
"epoch": 0.9124759255296383,
"grad_norm": 1.46875,
"learning_rate": 9.67619664413086e-07,
"loss": 1.4141,
"step": 1066
},
{
"epoch": 0.9133319066980526,
"grad_norm": 1.484375,
"learning_rate": 9.488600895051714e-07,
"loss": 1.4709,
"step": 1067
},
{
"epoch": 0.914187887866467,
"grad_norm": 1.4921875,
"learning_rate": 9.302806302112693e-07,
"loss": 1.383,
"step": 1068
},
{
"epoch": 0.9150438690348812,
"grad_norm": 1.59375,
"learning_rate": 9.118814256703523e-07,
"loss": 1.3668,
"step": 1069
},
{
"epoch": 0.9158998502032956,
"grad_norm": 1.4765625,
"learning_rate": 8.936626136714754e-07,
"loss": 1.4376,
"step": 1070
},
{
"epoch": 0.9167558313717098,
"grad_norm": 1.6171875,
"learning_rate": 8.756243306527689e-07,
"loss": 1.3291,
"step": 1071
},
{
"epoch": 0.9176118125401241,
"grad_norm": 1.4140625,
"learning_rate": 8.577667117004085e-07,
"loss": 1.447,
"step": 1072
},
{
"epoch": 0.9184677937085384,
"grad_norm": 1.671875,
"learning_rate": 8.400898905475934e-07,
"loss": 1.4864,
"step": 1073
},
{
"epoch": 0.9193237748769527,
"grad_norm": 1.484375,
"learning_rate": 8.225939995735593e-07,
"loss": 1.3948,
"step": 1074
},
{
"epoch": 0.920179756045367,
"grad_norm": 1.859375,
"learning_rate": 8.05279169802578e-07,
"loss": 1.5526,
"step": 1075
},
{
"epoch": 0.9210357372137813,
"grad_norm": 1.4296875,
"learning_rate": 7.881455309029894e-07,
"loss": 1.2545,
"step": 1076
},
{
"epoch": 0.9218917183821956,
"grad_norm": 1.5078125,
"learning_rate": 7.711932111862025e-07,
"loss": 1.4127,
"step": 1077
},
{
"epoch": 0.9227476995506099,
"grad_norm": 1.5546875,
"learning_rate": 7.544223376057702e-07,
"loss": 1.3862,
"step": 1078
},
{
"epoch": 0.9236036807190242,
"grad_norm": 1.5234375,
"learning_rate": 7.378330357564134e-07,
"loss": 1.3634,
"step": 1079
},
{
"epoch": 0.9244596618874384,
"grad_norm": 1.5703125,
"learning_rate": 7.214254298730793e-07,
"loss": 1.4651,
"step": 1080
},
{
"epoch": 0.9253156430558528,
"grad_norm": 1.7265625,
"learning_rate": 7.051996428300317e-07,
"loss": 1.5383,
"step": 1081
},
{
"epoch": 0.926171624224267,
"grad_norm": 1.484375,
"learning_rate": 6.891557961399175e-07,
"loss": 1.3888,
"step": 1082
},
{
"epoch": 0.9270276053926814,
"grad_norm": 1.5390625,
"learning_rate": 6.73294009952849e-07,
"loss": 1.3839,
"step": 1083
},
{
"epoch": 0.9278835865610957,
"grad_norm": 1.53125,
"learning_rate": 6.576144030555259e-07,
"loss": 1.3927,
"step": 1084
},
{
"epoch": 0.9287395677295099,
"grad_norm": 1.6484375,
"learning_rate": 6.421170928703174e-07,
"loss": 1.4722,
"step": 1085
},
{
"epoch": 0.9295955488979243,
"grad_norm": 1.546875,
"learning_rate": 6.268021954544096e-07,
"loss": 1.2901,
"step": 1086
},
{
"epoch": 0.9304515300663385,
"grad_norm": 1.546875,
"learning_rate": 6.116698254989256e-07,
"loss": 1.39,
"step": 1087
},
{
"epoch": 0.9313075112347529,
"grad_norm": 1.4921875,
"learning_rate": 5.967200963280545e-07,
"loss": 1.2328,
"step": 1088
},
{
"epoch": 0.9321634924031671,
"grad_norm": 1.5859375,
"learning_rate": 5.819531198982264e-07,
"loss": 1.3817,
"step": 1089
},
{
"epoch": 0.9330194735715814,
"grad_norm": 1.8046875,
"learning_rate": 5.673690067972553e-07,
"loss": 1.521,
"step": 1090
},
{
"epoch": 0.9338754547399957,
"grad_norm": 1.53125,
"learning_rate": 5.529678662435228e-07,
"loss": 1.5298,
"step": 1091
},
{
"epoch": 0.93473143590841,
"grad_norm": 1.8125,
"learning_rate": 5.387498060851454e-07,
"loss": 1.4792,
"step": 1092
},
{
"epoch": 0.9355874170768244,
"grad_norm": 1.4375,
"learning_rate": 5.247149327991835e-07,
"loss": 1.3686,
"step": 1093
},
{
"epoch": 0.9364433982452386,
"grad_norm": 1.4921875,
"learning_rate": 5.108633514908367e-07,
"loss": 1.4196,
"step": 1094
},
{
"epoch": 0.9372993794136529,
"grad_norm": 1.5625,
"learning_rate": 4.971951658926527e-07,
"loss": 1.3794,
"step": 1095
},
{
"epoch": 0.9381553605820672,
"grad_norm": 1.546875,
"learning_rate": 4.83710478363758e-07,
"loss": 1.5475,
"step": 1096
},
{
"epoch": 0.9390113417504815,
"grad_norm": 1.6015625,
"learning_rate": 4.704093898890871e-07,
"loss": 1.4144,
"step": 1097
},
{
"epoch": 0.9398673229188957,
"grad_norm": 1.5546875,
"learning_rate": 4.5729200007862683e-07,
"loss": 1.3816,
"step": 1098
},
{
"epoch": 0.9407233040873101,
"grad_norm": 1.53125,
"learning_rate": 4.4435840716667007e-07,
"loss": 1.4647,
"step": 1099
},
{
"epoch": 0.9415792852557243,
"grad_norm": 1.4765625,
"learning_rate": 4.316087080110748e-07,
"loss": 1.2495,
"step": 1100
},
{
"epoch": 0.9424352664241387,
"grad_norm": 1.5859375,
"learning_rate": 4.1904299809255867e-07,
"loss": 1.4592,
"step": 1101
},
{
"epoch": 0.943291247592553,
"grad_norm": 1.5390625,
"learning_rate": 4.0666137151395277e-07,
"loss": 1.4884,
"step": 1102
},
{
"epoch": 0.9441472287609672,
"grad_norm": 1.6015625,
"learning_rate": 3.944639209995299e-07,
"loss": 1.5319,
"step": 1103
},
{
"epoch": 0.9450032099293816,
"grad_norm": 1.5,
"learning_rate": 3.824507378942799e-07,
"loss": 1.2856,
"step": 1104
},
{
"epoch": 0.9458591910977958,
"grad_norm": 1.40625,
"learning_rate": 3.70621912163252e-07,
"loss": 1.3458,
"step": 1105
},
{
"epoch": 0.9467151722662102,
"grad_norm": 1.8671875,
"learning_rate": 3.589775323908612e-07,
"loss": 1.7292,
"step": 1106
},
{
"epoch": 0.9475711534346244,
"grad_norm": 1.421875,
"learning_rate": 3.475176857802298e-07,
"loss": 1.3163,
"step": 1107
},
{
"epoch": 0.9484271346030387,
"grad_norm": 1.6875,
"learning_rate": 3.3624245815254975e-07,
"loss": 1.5198,
"step": 1108
},
{
"epoch": 0.9492831157714531,
"grad_norm": 1.625,
"learning_rate": 3.2515193394641595e-07,
"loss": 1.5222,
"step": 1109
},
{
"epoch": 0.9501390969398673,
"grad_norm": 1.84375,
"learning_rate": 3.142461962172105e-07,
"loss": 1.3569,
"step": 1110
},
{
"epoch": 0.9509950781082817,
"grad_norm": 1.515625,
"learning_rate": 3.035253266364696e-07,
"loss": 1.4204,
"step": 1111
},
{
"epoch": 0.9518510592766959,
"grad_norm": 1.4765625,
"learning_rate": 2.9298940549128964e-07,
"loss": 1.3198,
"step": 1112
},
{
"epoch": 0.9527070404451102,
"grad_norm": 1.65625,
"learning_rate": 2.8263851168369714e-07,
"loss": 1.4136,
"step": 1113
},
{
"epoch": 0.9535630216135245,
"grad_norm": 1.5,
"learning_rate": 2.724727227300911e-07,
"loss": 1.4979,
"step": 1114
},
{
"epoch": 0.9544190027819388,
"grad_norm": 1.46875,
"learning_rate": 2.624921147606374e-07,
"loss": 1.3033,
"step": 1115
},
{
"epoch": 0.955274983950353,
"grad_norm": 1.4921875,
"learning_rate": 2.526967625187088e-07,
"loss": 1.429,
"step": 1116
},
{
"epoch": 0.9561309651187674,
"grad_norm": 1.59375,
"learning_rate": 2.4308673936032646e-07,
"loss": 1.4569,
"step": 1117
},
{
"epoch": 0.9569869462871817,
"grad_norm": 1.3984375,
"learning_rate": 2.3366211725360798e-07,
"loss": 1.3534,
"step": 1118
},
{
"epoch": 0.957842927455596,
"grad_norm": 1.5078125,
"learning_rate": 2.244229667782205e-07,
"loss": 1.5615,
"step": 1119
},
{
"epoch": 0.9586989086240103,
"grad_norm": 1.40625,
"learning_rate": 2.1536935712486994e-07,
"loss": 1.4168,
"step": 1120
},
{
"epoch": 0.9595548897924245,
"grad_norm": 1.484375,
"learning_rate": 2.0650135609477094e-07,
"loss": 1.3854,
"step": 1121
},
{
"epoch": 0.9604108709608389,
"grad_norm": 1.4921875,
"learning_rate": 1.9781903009913338e-07,
"loss": 1.3355,
"step": 1122
},
{
"epoch": 0.9612668521292531,
"grad_norm": 1.9375,
"learning_rate": 1.893224441586877e-07,
"loss": 1.5445,
"step": 1123
},
{
"epoch": 0.9621228332976675,
"grad_norm": 1.7109375,
"learning_rate": 1.8101166190316875e-07,
"loss": 1.4701,
"step": 1124
},
{
"epoch": 0.9629788144660818,
"grad_norm": 1.5625,
"learning_rate": 1.7288674557086048e-07,
"loss": 1.4356,
"step": 1125
},
{
"epoch": 0.963834795634496,
"grad_norm": 1.75,
"learning_rate": 1.6494775600812417e-07,
"loss": 1.4501,
"step": 1126
},
{
"epoch": 0.9646907768029104,
"grad_norm": 1.6484375,
"learning_rate": 1.571947526689349e-07,
"loss": 1.4544,
"step": 1127
},
{
"epoch": 0.9655467579713246,
"grad_norm": 1.5625,
"learning_rate": 1.4962779361445412e-07,
"loss": 1.5713,
"step": 1128
},
{
"epoch": 0.966402739139739,
"grad_norm": 1.6796875,
"learning_rate": 1.4224693551256885e-07,
"loss": 1.7056,
"step": 1129
},
{
"epoch": 0.9672587203081532,
"grad_norm": 1.390625,
"learning_rate": 1.3505223363749487e-07,
"loss": 1.2895,
"step": 1130
},
{
"epoch": 0.9681147014765675,
"grad_norm": 1.4921875,
"learning_rate": 1.2804374186934643e-07,
"loss": 1.3881,
"step": 1131
},
{
"epoch": 0.9689706826449818,
"grad_norm": 1.53125,
"learning_rate": 1.2122151269373383e-07,
"loss": 1.2761,
"step": 1132
},
{
"epoch": 0.9698266638133961,
"grad_norm": 1.5,
"learning_rate": 1.1458559720137762e-07,
"loss": 1.3987,
"step": 1133
},
{
"epoch": 0.9706826449818104,
"grad_norm": 1.53125,
"learning_rate": 1.0813604508771169e-07,
"loss": 1.3975,
"step": 1134
},
{
"epoch": 0.9715386261502247,
"grad_norm": 1.859375,
"learning_rate": 1.018729046525363e-07,
"loss": 1.3861,
"step": 1135
},
{
"epoch": 0.972394607318639,
"grad_norm": 1.4453125,
"learning_rate": 9.579622279962397e-08,
"loss": 1.3842,
"step": 1136
},
{
"epoch": 0.9732505884870533,
"grad_norm": 1.546875,
"learning_rate": 8.990604503639477e-08,
"loss": 1.4654,
"step": 1137
},
{
"epoch": 0.9741065696554676,
"grad_norm": 1.578125,
"learning_rate": 8.420241547356933e-08,
"loss": 1.4066,
"step": 1138
},
{
"epoch": 0.9749625508238818,
"grad_norm": 1.3515625,
"learning_rate": 7.868537682482469e-08,
"loss": 1.3077,
"step": 1139
},
{
"epoch": 0.9758185319922962,
"grad_norm": 1.5,
"learning_rate": 7.335497040648898e-08,
"loss": 1.4708,
"step": 1140
},
{
"epoch": 0.9766745131607104,
"grad_norm": 1.515625,
"learning_rate": 6.821123613723057e-08,
"loss": 1.6011,
"step": 1141
},
{
"epoch": 0.9775304943291248,
"grad_norm": 1.4453125,
"learning_rate": 6.325421253775277e-08,
"loss": 1.2807,
"step": 1142
},
{
"epoch": 0.9783864754975391,
"grad_norm": 1.484375,
"learning_rate": 5.848393673051067e-08,
"loss": 1.3443,
"step": 1143
},
{
"epoch": 0.9792424566659533,
"grad_norm": 1.4140625,
"learning_rate": 5.390044443943365e-08,
"loss": 1.5044,
"step": 1144
},
{
"epoch": 0.9800984378343677,
"grad_norm": 1.8125,
"learning_rate": 4.9503769989647786e-08,
"loss": 1.3441,
"step": 1145
},
{
"epoch": 0.9809544190027819,
"grad_norm": 1.546875,
"learning_rate": 4.529394630723438e-08,
"loss": 1.3954,
"step": 1146
},
{
"epoch": 0.9818104001711963,
"grad_norm": 1.59375,
"learning_rate": 4.1271004918971847e-08,
"loss": 1.3292,
"step": 1147
},
{
"epoch": 0.9826663813396105,
"grad_norm": 1.421875,
"learning_rate": 3.7434975952102546e-08,
"loss": 1.2322,
"step": 1148
},
{
"epoch": 0.9835223625080248,
"grad_norm": 1.5,
"learning_rate": 3.378588813411354e-08,
"loss": 1.3188,
"step": 1149
},
{
"epoch": 0.9843783436764392,
"grad_norm": 1.421875,
"learning_rate": 3.032376879250898e-08,
"loss": 1.2855,
"step": 1150
},
{
"epoch": 0.9852343248448534,
"grad_norm": 1.71875,
"learning_rate": 2.7048643854615806e-08,
"loss": 1.6131,
"step": 1151
},
{
"epoch": 0.9860903060132677,
"grad_norm": 1.3359375,
"learning_rate": 2.3960537847383946e-08,
"loss": 1.2274,
"step": 1152
},
{
"epoch": 0.986946287181682,
"grad_norm": 1.7421875,
"learning_rate": 2.1059473897208637e-08,
"loss": 1.4714,
"step": 1153
},
{
"epoch": 0.9878022683500963,
"grad_norm": 1.46875,
"learning_rate": 1.834547372975004e-08,
"loss": 1.4097,
"step": 1154
},
{
"epoch": 0.9886582495185106,
"grad_norm": 1.5859375,
"learning_rate": 1.581855766977225e-08,
"loss": 1.292,
"step": 1155
},
{
"epoch": 0.9895142306869249,
"grad_norm": 1.5,
"learning_rate": 1.3478744640998963e-08,
"loss": 1.1981,
"step": 1156
},
{
"epoch": 0.9903702118553391,
"grad_norm": 1.6171875,
"learning_rate": 1.1326052165960831e-08,
"loss": 1.5196,
"step": 1157
},
{
"epoch": 0.9912261930237535,
"grad_norm": 1.5625,
"learning_rate": 9.360496365870553e-09,
"loss": 1.4829,
"step": 1158
},
{
"epoch": 0.9920821741921678,
"grad_norm": 1.6875,
"learning_rate": 7.582091960497973e-09,
"loss": 1.4483,
"step": 1159
},
{
"epoch": 0.992938155360582,
"grad_norm": 1.4296875,
"learning_rate": 5.990852268064618e-09,
"loss": 1.5446,
"step": 1160
},
{
"epoch": 0.9937941365289964,
"grad_norm": 1.421875,
"learning_rate": 4.586789205140995e-09,
"loss": 1.39,
"step": 1161
},
{
"epoch": 0.9946501176974106,
"grad_norm": 1.4296875,
"learning_rate": 3.3699132865605553e-09,
"loss": 1.2557,
"step": 1162
},
{
"epoch": 0.995506098865825,
"grad_norm": 1.453125,
"learning_rate": 2.3402336253364187e-09,
"loss": 1.4511,
"step": 1163
},
{
"epoch": 0.9963620800342392,
"grad_norm": 1.6328125,
"learning_rate": 1.4977579325919923e-09,
"loss": 1.3592,
"step": 1164
},
{
"epoch": 0.9972180612026535,
"grad_norm": 1.6953125,
"learning_rate": 8.424925175137821e-10,
"loss": 1.4615,
"step": 1165
},
{
"epoch": 0.9980740423710678,
"grad_norm": 1.6015625,
"learning_rate": 3.744422872875575e-10,
"loss": 1.5993,
"step": 1166
},
{
"epoch": 0.9989300235394821,
"grad_norm": 1.4453125,
"learning_rate": 9.361074708169604e-11,
"loss": 1.3935,
"step": 1167
},
{
"epoch": 0.9997860047078965,
"grad_norm": 1.6875,
"learning_rate": 0.0,
"loss": 1.271,
"step": 1168
}
],
"logging_steps": 1,
"max_steps": 1168,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.472919195037204e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}