{
"_comment_removed_scrape_header": "Qwen2.5-1.5B-Open-R1-Distill trainer_state.json, uploaded by Jforeverss, commit 7f06539 (verified), 'Model save'",
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2930,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017064846416382253,
"grad_norm": 2.823172429776875,
"learning_rate": 1.360544217687075e-06,
"loss": 0.8715,
"num_tokens": 949756.0,
"step": 5
},
{
"epoch": 0.0034129692832764505,
"grad_norm": 1.9356174655366971,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.8609,
"num_tokens": 1934239.0,
"step": 10
},
{
"epoch": 0.005119453924914676,
"grad_norm": 1.460519264059952,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.794,
"num_tokens": 2810536.0,
"step": 15
},
{
"epoch": 0.006825938566552901,
"grad_norm": 0.904578522693587,
"learning_rate": 6.462585034013606e-06,
"loss": 0.7436,
"num_tokens": 3759778.0,
"step": 20
},
{
"epoch": 0.008532423208191127,
"grad_norm": 0.7983479886595664,
"learning_rate": 8.163265306122448e-06,
"loss": 0.7124,
"num_tokens": 4719221.0,
"step": 25
},
{
"epoch": 0.010238907849829351,
"grad_norm": 0.7054491846301127,
"learning_rate": 9.863945578231292e-06,
"loss": 0.7244,
"num_tokens": 5645472.0,
"step": 30
},
{
"epoch": 0.011945392491467578,
"grad_norm": 0.5846860059632117,
"learning_rate": 1.1564625850340138e-05,
"loss": 0.6856,
"num_tokens": 6675650.0,
"step": 35
},
{
"epoch": 0.013651877133105802,
"grad_norm": 0.5252998046067419,
"learning_rate": 1.3265306122448982e-05,
"loss": 0.6694,
"num_tokens": 7670069.0,
"step": 40
},
{
"epoch": 0.015358361774744027,
"grad_norm": 0.5533849135779766,
"learning_rate": 1.4965986394557824e-05,
"loss": 0.6436,
"num_tokens": 8562223.0,
"step": 45
},
{
"epoch": 0.017064846416382253,
"grad_norm": 0.5631922874607858,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6546,
"num_tokens": 9514590.0,
"step": 50
},
{
"epoch": 0.01877133105802048,
"grad_norm": 0.6078968314204046,
"learning_rate": 1.836734693877551e-05,
"loss": 0.6175,
"num_tokens": 10423653.0,
"step": 55
},
{
"epoch": 0.020477815699658702,
"grad_norm": 0.5693094869748001,
"learning_rate": 2.0068027210884355e-05,
"loss": 0.6177,
"num_tokens": 11370767.0,
"step": 60
},
{
"epoch": 0.02218430034129693,
"grad_norm": 0.6043968806478303,
"learning_rate": 2.17687074829932e-05,
"loss": 0.6241,
"num_tokens": 12205003.0,
"step": 65
},
{
"epoch": 0.023890784982935155,
"grad_norm": 0.6098681736085941,
"learning_rate": 2.3469387755102043e-05,
"loss": 0.6197,
"num_tokens": 13221634.0,
"step": 70
},
{
"epoch": 0.025597269624573378,
"grad_norm": 0.5652622276816748,
"learning_rate": 2.5170068027210887e-05,
"loss": 0.6032,
"num_tokens": 14132790.0,
"step": 75
},
{
"epoch": 0.027303754266211604,
"grad_norm": 0.5925207051821937,
"learning_rate": 2.687074829931973e-05,
"loss": 0.6095,
"num_tokens": 15120586.0,
"step": 80
},
{
"epoch": 0.02901023890784983,
"grad_norm": 0.6004251741348174,
"learning_rate": 2.857142857142857e-05,
"loss": 0.627,
"num_tokens": 16123604.0,
"step": 85
},
{
"epoch": 0.030716723549488054,
"grad_norm": 0.6116000919042185,
"learning_rate": 3.0272108843537418e-05,
"loss": 0.6003,
"num_tokens": 17045024.0,
"step": 90
},
{
"epoch": 0.032423208191126277,
"grad_norm": 0.6033555839102962,
"learning_rate": 3.1972789115646265e-05,
"loss": 0.6099,
"num_tokens": 17979516.0,
"step": 95
},
{
"epoch": 0.034129692832764506,
"grad_norm": 0.6174109618196879,
"learning_rate": 3.36734693877551e-05,
"loss": 0.5928,
"num_tokens": 18909989.0,
"step": 100
},
{
"epoch": 0.03583617747440273,
"grad_norm": 0.6562250628577161,
"learning_rate": 3.5374149659863946e-05,
"loss": 0.6091,
"num_tokens": 19893037.0,
"step": 105
},
{
"epoch": 0.03754266211604096,
"grad_norm": 0.6359940327448688,
"learning_rate": 3.707482993197279e-05,
"loss": 0.5757,
"num_tokens": 20794568.0,
"step": 110
},
{
"epoch": 0.03924914675767918,
"grad_norm": 0.591968085837116,
"learning_rate": 3.8775510204081634e-05,
"loss": 0.601,
"num_tokens": 21708539.0,
"step": 115
},
{
"epoch": 0.040955631399317405,
"grad_norm": 0.6284093561745501,
"learning_rate": 4.047619047619048e-05,
"loss": 0.6047,
"num_tokens": 22645536.0,
"step": 120
},
{
"epoch": 0.042662116040955635,
"grad_norm": 0.622896210775772,
"learning_rate": 4.217687074829932e-05,
"loss": 0.5941,
"num_tokens": 23599448.0,
"step": 125
},
{
"epoch": 0.04436860068259386,
"grad_norm": 0.5576038080368834,
"learning_rate": 4.387755102040816e-05,
"loss": 0.6178,
"num_tokens": 24609009.0,
"step": 130
},
{
"epoch": 0.04607508532423208,
"grad_norm": 0.5507377217724624,
"learning_rate": 4.557823129251701e-05,
"loss": 0.6048,
"num_tokens": 25618632.0,
"step": 135
},
{
"epoch": 0.04778156996587031,
"grad_norm": 0.6383078614695583,
"learning_rate": 4.7278911564625856e-05,
"loss": 0.5975,
"num_tokens": 26641035.0,
"step": 140
},
{
"epoch": 0.04948805460750853,
"grad_norm": 0.6846737722897228,
"learning_rate": 4.89795918367347e-05,
"loss": 0.5744,
"num_tokens": 27517779.0,
"step": 145
},
{
"epoch": 0.051194539249146756,
"grad_norm": 0.7663095587321149,
"learning_rate": 4.999994265630655e-05,
"loss": 0.5675,
"num_tokens": 28445983.0,
"step": 150
},
{
"epoch": 0.052901023890784986,
"grad_norm": 0.6816635800011499,
"learning_rate": 4.999929754311198e-05,
"loss": 0.5903,
"num_tokens": 29489865.0,
"step": 155
},
{
"epoch": 0.05460750853242321,
"grad_norm": 0.6680964778036098,
"learning_rate": 4.999793565772626e-05,
"loss": 0.5989,
"num_tokens": 30423007.0,
"step": 160
},
{
"epoch": 0.05631399317406143,
"grad_norm": 0.5649434202104989,
"learning_rate": 4.999585704353568e-05,
"loss": 0.5801,
"num_tokens": 31372257.0,
"step": 165
},
{
"epoch": 0.05802047781569966,
"grad_norm": 0.5122745336387098,
"learning_rate": 4.999306176675979e-05,
"loss": 0.5998,
"num_tokens": 32356676.0,
"step": 170
},
{
"epoch": 0.059726962457337884,
"grad_norm": 0.5332799116717496,
"learning_rate": 4.998954991644921e-05,
"loss": 0.5904,
"num_tokens": 33261796.0,
"step": 175
},
{
"epoch": 0.06143344709897611,
"grad_norm": 0.5068363705703531,
"learning_rate": 4.9985321604482835e-05,
"loss": 0.59,
"num_tokens": 34237001.0,
"step": 180
},
{
"epoch": 0.06313993174061433,
"grad_norm": 0.6492409848966394,
"learning_rate": 4.9980376965564286e-05,
"loss": 0.5955,
"num_tokens": 35167253.0,
"step": 185
},
{
"epoch": 0.06484641638225255,
"grad_norm": 0.5820124301992429,
"learning_rate": 4.997471615721756e-05,
"loss": 0.5767,
"num_tokens": 36074352.0,
"step": 190
},
{
"epoch": 0.06655290102389079,
"grad_norm": 0.5394095878459838,
"learning_rate": 4.996833935978207e-05,
"loss": 0.624,
"num_tokens": 37055456.0,
"step": 195
},
{
"epoch": 0.06825938566552901,
"grad_norm": 0.5362539000244216,
"learning_rate": 4.996124677640687e-05,
"loss": 0.5722,
"num_tokens": 37967720.0,
"step": 200
},
{
"epoch": 0.06996587030716724,
"grad_norm": 0.49110498514505857,
"learning_rate": 4.99534386330442e-05,
"loss": 0.5993,
"num_tokens": 38924279.0,
"step": 205
},
{
"epoch": 0.07167235494880546,
"grad_norm": 0.546699809294802,
"learning_rate": 4.994491517844227e-05,
"loss": 0.5938,
"num_tokens": 39853351.0,
"step": 210
},
{
"epoch": 0.07337883959044368,
"grad_norm": 0.5776918031520784,
"learning_rate": 4.993567668413733e-05,
"loss": 0.5809,
"num_tokens": 40811069.0,
"step": 215
},
{
"epoch": 0.07508532423208192,
"grad_norm": 0.4951200074682594,
"learning_rate": 4.992572344444507e-05,
"loss": 0.6027,
"num_tokens": 41833783.0,
"step": 220
},
{
"epoch": 0.07679180887372014,
"grad_norm": 0.5901419308378143,
"learning_rate": 4.991505577645118e-05,
"loss": 0.5747,
"num_tokens": 42744744.0,
"step": 225
},
{
"epoch": 0.07849829351535836,
"grad_norm": 0.5304430248970768,
"learning_rate": 4.9903674020001284e-05,
"loss": 0.6,
"num_tokens": 43682290.0,
"step": 230
},
{
"epoch": 0.08020477815699659,
"grad_norm": 0.5211597930937201,
"learning_rate": 4.989157853769009e-05,
"loss": 0.5805,
"num_tokens": 44609387.0,
"step": 235
},
{
"epoch": 0.08191126279863481,
"grad_norm": 0.5133063136328732,
"learning_rate": 4.987876971484988e-05,
"loss": 0.5787,
"num_tokens": 45550959.0,
"step": 240
},
{
"epoch": 0.08361774744027303,
"grad_norm": 0.6095708293579054,
"learning_rate": 4.9865247959538194e-05,
"loss": 0.5976,
"num_tokens": 46433321.0,
"step": 245
},
{
"epoch": 0.08532423208191127,
"grad_norm": 0.4814034781564727,
"learning_rate": 4.985101370252483e-05,
"loss": 0.5872,
"num_tokens": 47474526.0,
"step": 250
},
{
"epoch": 0.08703071672354949,
"grad_norm": 0.5162969780823659,
"learning_rate": 4.983606739727816e-05,
"loss": 0.5863,
"num_tokens": 48425336.0,
"step": 255
},
{
"epoch": 0.08873720136518772,
"grad_norm": 0.5073448552205259,
"learning_rate": 4.982040951995066e-05,
"loss": 0.5821,
"num_tokens": 49377672.0,
"step": 260
},
{
"epoch": 0.09044368600682594,
"grad_norm": 0.6390759679559125,
"learning_rate": 4.980404056936371e-05,
"loss": 0.5822,
"num_tokens": 50297482.0,
"step": 265
},
{
"epoch": 0.09215017064846416,
"grad_norm": 0.505488369723625,
"learning_rate": 4.978696106699175e-05,
"loss": 0.5777,
"num_tokens": 51201531.0,
"step": 270
},
{
"epoch": 0.09385665529010238,
"grad_norm": 0.4834540354696749,
"learning_rate": 4.976917155694565e-05,
"loss": 0.5706,
"num_tokens": 52135060.0,
"step": 275
},
{
"epoch": 0.09556313993174062,
"grad_norm": 0.5091081042854496,
"learning_rate": 4.9750672605955385e-05,
"loss": 0.5887,
"num_tokens": 53019691.0,
"step": 280
},
{
"epoch": 0.09726962457337884,
"grad_norm": 0.5272029644773706,
"learning_rate": 4.9731464803351944e-05,
"loss": 0.5768,
"num_tokens": 54021567.0,
"step": 285
},
{
"epoch": 0.09897610921501707,
"grad_norm": 0.5618712865310092,
"learning_rate": 4.971154876104862e-05,
"loss": 0.5707,
"num_tokens": 54928756.0,
"step": 290
},
{
"epoch": 0.10068259385665529,
"grad_norm": 0.5079662485961427,
"learning_rate": 4.969092511352143e-05,
"loss": 0.5764,
"num_tokens": 55901721.0,
"step": 295
},
{
"epoch": 0.10238907849829351,
"grad_norm": 0.5590533288223599,
"learning_rate": 4.9669594517789004e-05,
"loss": 0.6059,
"num_tokens": 56919336.0,
"step": 300
},
{
"epoch": 0.10409556313993173,
"grad_norm": 0.5124289657226824,
"learning_rate": 4.9647557653391544e-05,
"loss": 0.5486,
"num_tokens": 57859693.0,
"step": 305
},
{
"epoch": 0.10580204778156997,
"grad_norm": 0.5236808101484978,
"learning_rate": 4.9624815222369283e-05,
"loss": 0.5744,
"num_tokens": 58769487.0,
"step": 310
},
{
"epoch": 0.1075085324232082,
"grad_norm": 0.41840286703818447,
"learning_rate": 4.9601367949240034e-05,
"loss": 0.5571,
"num_tokens": 59686216.0,
"step": 315
},
{
"epoch": 0.10921501706484642,
"grad_norm": 0.49183814869408765,
"learning_rate": 4.957721658097616e-05,
"loss": 0.5778,
"num_tokens": 60660649.0,
"step": 320
},
{
"epoch": 0.11092150170648464,
"grad_norm": 0.5261379848235058,
"learning_rate": 4.955236188698076e-05,
"loss": 0.5581,
"num_tokens": 61640951.0,
"step": 325
},
{
"epoch": 0.11262798634812286,
"grad_norm": 0.5047161464074051,
"learning_rate": 4.9526804659063135e-05,
"loss": 0.5453,
"num_tokens": 62673288.0,
"step": 330
},
{
"epoch": 0.11433447098976109,
"grad_norm": 0.5370013069114835,
"learning_rate": 4.950054571141362e-05,
"loss": 0.5704,
"num_tokens": 63654789.0,
"step": 335
},
{
"epoch": 0.11604095563139932,
"grad_norm": 0.5065504794071035,
"learning_rate": 4.94735858805776e-05,
"loss": 0.5564,
"num_tokens": 64575366.0,
"step": 340
},
{
"epoch": 0.11774744027303755,
"grad_norm": 0.5367370357250334,
"learning_rate": 4.9445926025428856e-05,
"loss": 0.5682,
"num_tokens": 65572577.0,
"step": 345
},
{
"epoch": 0.11945392491467577,
"grad_norm": 0.6256624388445353,
"learning_rate": 4.9417567027142245e-05,
"loss": 0.5691,
"num_tokens": 66496209.0,
"step": 350
},
{
"epoch": 0.12116040955631399,
"grad_norm": 0.5364526247895124,
"learning_rate": 4.938850978916557e-05,
"loss": 0.5963,
"num_tokens": 67477381.0,
"step": 355
},
{
"epoch": 0.12286689419795221,
"grad_norm": 0.5760517333994615,
"learning_rate": 4.935875523719086e-05,
"loss": 0.5676,
"num_tokens": 68438217.0,
"step": 360
},
{
"epoch": 0.12457337883959044,
"grad_norm": 0.4912183552600293,
"learning_rate": 4.932830431912484e-05,
"loss": 0.5689,
"num_tokens": 69455969.0,
"step": 365
},
{
"epoch": 0.12627986348122866,
"grad_norm": 0.4760184495750453,
"learning_rate": 4.929715800505873e-05,
"loss": 0.5763,
"num_tokens": 70344364.0,
"step": 370
},
{
"epoch": 0.12798634812286688,
"grad_norm": 0.45385512480590845,
"learning_rate": 4.926531728723738e-05,
"loss": 0.5871,
"num_tokens": 71311780.0,
"step": 375
},
{
"epoch": 0.1296928327645051,
"grad_norm": 0.4698082651802184,
"learning_rate": 4.923278318002761e-05,
"loss": 0.5545,
"num_tokens": 72264786.0,
"step": 380
},
{
"epoch": 0.13139931740614336,
"grad_norm": 0.45414350096683564,
"learning_rate": 4.919955671988592e-05,
"loss": 0.5368,
"num_tokens": 73254319.0,
"step": 385
},
{
"epoch": 0.13310580204778158,
"grad_norm": 0.4488305223658382,
"learning_rate": 4.916563896532549e-05,
"loss": 0.5538,
"num_tokens": 74233642.0,
"step": 390
},
{
"epoch": 0.1348122866894198,
"grad_norm": 0.4525201301734097,
"learning_rate": 4.91310309968824e-05,
"loss": 0.5689,
"num_tokens": 75216559.0,
"step": 395
},
{
"epoch": 0.13651877133105803,
"grad_norm": 0.5580205197947308,
"learning_rate": 4.90957339170813e-05,
"loss": 0.5684,
"num_tokens": 76173511.0,
"step": 400
},
{
"epoch": 0.13822525597269625,
"grad_norm": 0.43833715671179474,
"learning_rate": 4.905974885040015e-05,
"loss": 0.5537,
"num_tokens": 77137128.0,
"step": 405
},
{
"epoch": 0.13993174061433447,
"grad_norm": 0.43081913118053056,
"learning_rate": 4.902307694323456e-05,
"loss": 0.5595,
"num_tokens": 78183541.0,
"step": 410
},
{
"epoch": 0.1416382252559727,
"grad_norm": 0.46606853464523745,
"learning_rate": 4.8985719363861135e-05,
"loss": 0.572,
"num_tokens": 79163656.0,
"step": 415
},
{
"epoch": 0.14334470989761092,
"grad_norm": 0.5472361464183433,
"learning_rate": 4.8947677302400326e-05,
"loss": 0.5522,
"num_tokens": 80166162.0,
"step": 420
},
{
"epoch": 0.14505119453924914,
"grad_norm": 0.4537573554760025,
"learning_rate": 4.890895197077848e-05,
"loss": 0.5507,
"num_tokens": 81121834.0,
"step": 425
},
{
"epoch": 0.14675767918088736,
"grad_norm": 0.5290898874704273,
"learning_rate": 4.886954460268927e-05,
"loss": 0.5702,
"num_tokens": 81987283.0,
"step": 430
},
{
"epoch": 0.14846416382252559,
"grad_norm": 0.45456782141151,
"learning_rate": 4.882945645355435e-05,
"loss": 0.5756,
"num_tokens": 82994121.0,
"step": 435
},
{
"epoch": 0.15017064846416384,
"grad_norm": 0.4809659734303757,
"learning_rate": 4.878868880048341e-05,
"loss": 0.5614,
"num_tokens": 83915537.0,
"step": 440
},
{
"epoch": 0.15187713310580206,
"grad_norm": 0.52139675211743,
"learning_rate": 4.874724294223343e-05,
"loss": 0.5444,
"num_tokens": 84755157.0,
"step": 445
},
{
"epoch": 0.15358361774744028,
"grad_norm": 0.4544341064553721,
"learning_rate": 4.870512019916734e-05,
"loss": 0.5433,
"num_tokens": 85690047.0,
"step": 450
},
{
"epoch": 0.1552901023890785,
"grad_norm": 0.4558960340950479,
"learning_rate": 4.866232191321199e-05,
"loss": 0.5933,
"num_tokens": 86736902.0,
"step": 455
},
{
"epoch": 0.15699658703071673,
"grad_norm": 0.47990907617604506,
"learning_rate": 4.8618849447815305e-05,
"loss": 0.5745,
"num_tokens": 87705484.0,
"step": 460
},
{
"epoch": 0.15870307167235495,
"grad_norm": 0.4471349487997525,
"learning_rate": 4.8574704187902955e-05,
"loss": 0.5493,
"num_tokens": 88652585.0,
"step": 465
},
{
"epoch": 0.16040955631399317,
"grad_norm": 0.5050821167458766,
"learning_rate": 4.8529887539834144e-05,
"loss": 0.5559,
"num_tokens": 89594618.0,
"step": 470
},
{
"epoch": 0.1621160409556314,
"grad_norm": 0.433008399283799,
"learning_rate": 4.84844009313569e-05,
"loss": 0.5465,
"num_tokens": 90594832.0,
"step": 475
},
{
"epoch": 0.16382252559726962,
"grad_norm": 0.42424498611581374,
"learning_rate": 4.843824581156249e-05,
"loss": 0.5555,
"num_tokens": 91557529.0,
"step": 480
},
{
"epoch": 0.16552901023890784,
"grad_norm": 0.4536724303374971,
"learning_rate": 4.839142365083932e-05,
"loss": 0.5586,
"num_tokens": 92520759.0,
"step": 485
},
{
"epoch": 0.16723549488054607,
"grad_norm": 0.4135691134510148,
"learning_rate": 4.8343935940826104e-05,
"loss": 0.5463,
"num_tokens": 93508244.0,
"step": 490
},
{
"epoch": 0.1689419795221843,
"grad_norm": 0.49925946717984443,
"learning_rate": 4.829578419436427e-05,
"loss": 0.5758,
"num_tokens": 94489856.0,
"step": 495
},
{
"epoch": 0.17064846416382254,
"grad_norm": 0.4795792461133882,
"learning_rate": 4.824696994544985e-05,
"loss": 0.5581,
"num_tokens": 95453893.0,
"step": 500
},
{
"epoch": 0.17235494880546076,
"grad_norm": 0.46669919948991945,
"learning_rate": 4.819749474918455e-05,
"loss": 0.556,
"num_tokens": 96333287.0,
"step": 505
},
{
"epoch": 0.17406143344709898,
"grad_norm": 0.4707277196611808,
"learning_rate": 4.814736018172624e-05,
"loss": 0.5583,
"num_tokens": 97319183.0,
"step": 510
},
{
"epoch": 0.1757679180887372,
"grad_norm": 0.44856398131453706,
"learning_rate": 4.809656784023872e-05,
"loss": 0.5643,
"num_tokens": 98374455.0,
"step": 515
},
{
"epoch": 0.17747440273037543,
"grad_norm": 0.41999315891743993,
"learning_rate": 4.8045119342840885e-05,
"loss": 0.5368,
"num_tokens": 99400041.0,
"step": 520
},
{
"epoch": 0.17918088737201365,
"grad_norm": 0.4314801891904282,
"learning_rate": 4.799301632855508e-05,
"loss": 0.5682,
"num_tokens": 100419726.0,
"step": 525
},
{
"epoch": 0.18088737201365188,
"grad_norm": 0.4532725946713624,
"learning_rate": 4.794026045725501e-05,
"loss": 0.5413,
"num_tokens": 101373875.0,
"step": 530
},
{
"epoch": 0.1825938566552901,
"grad_norm": 0.45204637278231036,
"learning_rate": 4.788685340961276e-05,
"loss": 0.5561,
"num_tokens": 102331658.0,
"step": 535
},
{
"epoch": 0.18430034129692832,
"grad_norm": 0.4711186304154966,
"learning_rate": 4.7832796887045276e-05,
"loss": 0.5421,
"num_tokens": 103240516.0,
"step": 540
},
{
"epoch": 0.18600682593856654,
"grad_norm": 0.4626002965808097,
"learning_rate": 4.7778092611660225e-05,
"loss": 0.5696,
"num_tokens": 104162605.0,
"step": 545
},
{
"epoch": 0.18771331058020477,
"grad_norm": 0.423630300008881,
"learning_rate": 4.772274232620104e-05,
"loss": 0.5532,
"num_tokens": 105061908.0,
"step": 550
},
{
"epoch": 0.189419795221843,
"grad_norm": 0.5303058647014298,
"learning_rate": 4.766674779399145e-05,
"loss": 0.5634,
"num_tokens": 105919969.0,
"step": 555
},
{
"epoch": 0.19112627986348124,
"grad_norm": 0.4528277982211153,
"learning_rate": 4.76101107988793e-05,
"loss": 0.5775,
"num_tokens": 106919294.0,
"step": 560
},
{
"epoch": 0.19283276450511946,
"grad_norm": 0.375604144309103,
"learning_rate": 4.7552833145179746e-05,
"loss": 0.5127,
"num_tokens": 107846976.0,
"step": 565
},
{
"epoch": 0.1945392491467577,
"grad_norm": 0.44488024598088494,
"learning_rate": 4.749491665761772e-05,
"loss": 0.5388,
"num_tokens": 108819219.0,
"step": 570
},
{
"epoch": 0.1962457337883959,
"grad_norm": 0.4154713260123601,
"learning_rate": 4.7436363181269825e-05,
"loss": 0.5469,
"num_tokens": 109845258.0,
"step": 575
},
{
"epoch": 0.19795221843003413,
"grad_norm": 0.39816353681189776,
"learning_rate": 4.737717458150558e-05,
"loss": 0.5519,
"num_tokens": 110858993.0,
"step": 580
},
{
"epoch": 0.19965870307167236,
"grad_norm": 0.44831838753632824,
"learning_rate": 4.7317352743927954e-05,
"loss": 0.5578,
"num_tokens": 111788546.0,
"step": 585
},
{
"epoch": 0.20136518771331058,
"grad_norm": 0.4186460516616093,
"learning_rate": 4.7256899574313304e-05,
"loss": 0.5472,
"num_tokens": 112732095.0,
"step": 590
},
{
"epoch": 0.2030716723549488,
"grad_norm": 0.44237132476023605,
"learning_rate": 4.71958169985507e-05,
"loss": 0.5493,
"num_tokens": 113649022.0,
"step": 595
},
{
"epoch": 0.20477815699658702,
"grad_norm": 0.43701302963747873,
"learning_rate": 4.7134106962580516e-05,
"loss": 0.5569,
"num_tokens": 114540376.0,
"step": 600
},
{
"epoch": 0.20648464163822525,
"grad_norm": 0.4128633428095234,
"learning_rate": 4.707177143233247e-05,
"loss": 0.5513,
"num_tokens": 115480997.0,
"step": 605
},
{
"epoch": 0.20819112627986347,
"grad_norm": 0.3755079914854989,
"learning_rate": 4.7008812393662996e-05,
"loss": 0.5255,
"num_tokens": 116464215.0,
"step": 610
},
{
"epoch": 0.2098976109215017,
"grad_norm": 0.4092404736922847,
"learning_rate": 4.694523185229196e-05,
"loss": 0.5398,
"num_tokens": 117413382.0,
"step": 615
},
{
"epoch": 0.21160409556313994,
"grad_norm": 0.4130984750062132,
"learning_rate": 4.688103183373877e-05,
"loss": 0.5355,
"num_tokens": 118465258.0,
"step": 620
},
{
"epoch": 0.21331058020477817,
"grad_norm": 0.4350871959699708,
"learning_rate": 4.6816214383257864e-05,
"loss": 0.5507,
"num_tokens": 119368272.0,
"step": 625
},
{
"epoch": 0.2150170648464164,
"grad_norm": 0.42712415977098717,
"learning_rate": 4.6750781565773524e-05,
"loss": 0.5376,
"num_tokens": 120323497.0,
"step": 630
},
{
"epoch": 0.2167235494880546,
"grad_norm": 0.4018174070587826,
"learning_rate": 4.6684735465814114e-05,
"loss": 0.5623,
"num_tokens": 121336091.0,
"step": 635
},
{
"epoch": 0.21843003412969283,
"grad_norm": 0.40967629179586657,
"learning_rate": 4.661807818744568e-05,
"loss": 0.5345,
"num_tokens": 122331818.0,
"step": 640
},
{
"epoch": 0.22013651877133106,
"grad_norm": 0.4516013972993547,
"learning_rate": 4.6550811854204896e-05,
"loss": 0.545,
"num_tokens": 123276577.0,
"step": 645
},
{
"epoch": 0.22184300341296928,
"grad_norm": 0.4167635858598471,
"learning_rate": 4.6482938609031406e-05,
"loss": 0.5574,
"num_tokens": 124260967.0,
"step": 650
},
{
"epoch": 0.2235494880546075,
"grad_norm": 0.4027183249033178,
"learning_rate": 4.6414460614199614e-05,
"loss": 0.558,
"num_tokens": 125178584.0,
"step": 655
},
{
"epoch": 0.22525597269624573,
"grad_norm": 0.3943619941232963,
"learning_rate": 4.6345380051249726e-05,
"loss": 0.5359,
"num_tokens": 126115279.0,
"step": 660
},
{
"epoch": 0.22696245733788395,
"grad_norm": 0.41605588299949164,
"learning_rate": 4.627569912091829e-05,
"loss": 0.5308,
"num_tokens": 127123510.0,
"step": 665
},
{
"epoch": 0.22866894197952217,
"grad_norm": 0.4232494441201061,
"learning_rate": 4.620542004306808e-05,
"loss": 0.5291,
"num_tokens": 128096244.0,
"step": 670
},
{
"epoch": 0.23037542662116042,
"grad_norm": 0.36709719294748555,
"learning_rate": 4.613454505661738e-05,
"loss": 0.545,
"num_tokens": 129070712.0,
"step": 675
},
{
"epoch": 0.23208191126279865,
"grad_norm": 0.4284861781533593,
"learning_rate": 4.606307641946867e-05,
"loss": 0.5639,
"num_tokens": 129992439.0,
"step": 680
},
{
"epoch": 0.23378839590443687,
"grad_norm": 0.4278163699139823,
"learning_rate": 4.599101640843664e-05,
"loss": 0.539,
"num_tokens": 130917322.0,
"step": 685
},
{
"epoch": 0.2354948805460751,
"grad_norm": 0.3880253078166962,
"learning_rate": 4.591836731917573e-05,
"loss": 0.5683,
"num_tokens": 131869001.0,
"step": 690
},
{
"epoch": 0.23720136518771331,
"grad_norm": 0.48519543443409835,
"learning_rate": 4.584513146610694e-05,
"loss": 0.5578,
"num_tokens": 132871820.0,
"step": 695
},
{
"epoch": 0.23890784982935154,
"grad_norm": 0.4331997241836761,
"learning_rate": 4.577131118234413e-05,
"loss": 0.5642,
"num_tokens": 133787994.0,
"step": 700
},
{
"epoch": 0.24061433447098976,
"grad_norm": 0.4124588160404072,
"learning_rate": 4.569690881961967e-05,
"loss": 0.531,
"num_tokens": 134665258.0,
"step": 705
},
{
"epoch": 0.24232081911262798,
"grad_norm": 0.40688160744028257,
"learning_rate": 4.562192674820957e-05,
"loss": 0.536,
"num_tokens": 135563718.0,
"step": 710
},
{
"epoch": 0.2440273037542662,
"grad_norm": 0.3855522562994955,
"learning_rate": 4.554636735685786e-05,
"loss": 0.5366,
"num_tokens": 136530274.0,
"step": 715
},
{
"epoch": 0.24573378839590443,
"grad_norm": 0.4249521897707511,
"learning_rate": 4.547023305270064e-05,
"loss": 0.5475,
"num_tokens": 137544925.0,
"step": 720
},
{
"epoch": 0.24744027303754265,
"grad_norm": 0.6254856649079702,
"learning_rate": 4.539352626118926e-05,
"loss": 0.5417,
"num_tokens": 138475799.0,
"step": 725
},
{
"epoch": 0.24914675767918087,
"grad_norm": 0.3911156875271076,
"learning_rate": 4.5316249426013126e-05,
"loss": 0.5201,
"num_tokens": 139435802.0,
"step": 730
},
{
"epoch": 0.2508532423208191,
"grad_norm": 0.4688833261931172,
"learning_rate": 4.523840500902183e-05,
"loss": 0.5373,
"num_tokens": 140314284.0,
"step": 735
},
{
"epoch": 0.2525597269624573,
"grad_norm": 0.4261015924614589,
"learning_rate": 4.515999549014673e-05,
"loss": 0.5329,
"num_tokens": 141219364.0,
"step": 740
},
{
"epoch": 0.25426621160409557,
"grad_norm": 0.46451445077951103,
"learning_rate": 4.5081023367321916e-05,
"loss": 0.5369,
"num_tokens": 142303539.0,
"step": 745
},
{
"epoch": 0.25597269624573377,
"grad_norm": 0.4899621819611558,
"learning_rate": 4.500149115640468e-05,
"loss": 0.5736,
"num_tokens": 143301347.0,
"step": 750
},
{
"epoch": 0.257679180887372,
"grad_norm": 0.40185507186448355,
"learning_rate": 4.492140139109533e-05,
"loss": 0.529,
"num_tokens": 144231893.0,
"step": 755
},
{
"epoch": 0.2593856655290102,
"grad_norm": 0.3751786581425354,
"learning_rate": 4.484075662285647e-05,
"loss": 0.5366,
"num_tokens": 145160611.0,
"step": 760
},
{
"epoch": 0.26109215017064846,
"grad_norm": 0.383945095710832,
"learning_rate": 4.475955942083176e-05,
"loss": 0.5286,
"num_tokens": 146121565.0,
"step": 765
},
{
"epoch": 0.2627986348122867,
"grad_norm": 0.39293769351911984,
"learning_rate": 4.4677812371764e-05,
"loss": 0.5177,
"num_tokens": 147031619.0,
"step": 770
},
{
"epoch": 0.2645051194539249,
"grad_norm": 0.36541237345446015,
"learning_rate": 4.45955180799128e-05,
"loss": 0.5289,
"num_tokens": 147981269.0,
"step": 775
},
{
"epoch": 0.26621160409556316,
"grad_norm": 0.4692171769339349,
"learning_rate": 4.4512679166971553e-05,
"loss": 0.5489,
"num_tokens": 148964661.0,
"step": 780
},
{
"epoch": 0.26791808873720135,
"grad_norm": 0.3687554090744247,
"learning_rate": 4.442929827198395e-05,
"loss": 0.5471,
"num_tokens": 150008239.0,
"step": 785
},
{
"epoch": 0.2696245733788396,
"grad_norm": 0.45244715310494926,
"learning_rate": 4.43453780512599e-05,
"loss": 0.5466,
"num_tokens": 150937307.0,
"step": 790
},
{
"epoch": 0.2713310580204778,
"grad_norm": 0.4452320820857673,
"learning_rate": 4.4260921178290866e-05,
"loss": 0.5407,
"num_tokens": 151860116.0,
"step": 795
},
{
"epoch": 0.27303754266211605,
"grad_norm": 0.42732274617096166,
"learning_rate": 4.417593034366478e-05,
"loss": 0.5311,
"num_tokens": 152834849.0,
"step": 800
},
{
"epoch": 0.27474402730375425,
"grad_norm": 0.4052649603454169,
"learning_rate": 4.409040825498024e-05,
"loss": 0.5115,
"num_tokens": 153761800.0,
"step": 805
},
{
"epoch": 0.2764505119453925,
"grad_norm": 0.4397536584781063,
"learning_rate": 4.40043576367603e-05,
"loss": 0.5268,
"num_tokens": 154739335.0,
"step": 810
},
{
"epoch": 0.2781569965870307,
"grad_norm": 0.42381590061263635,
"learning_rate": 4.3917781230365677e-05,
"loss": 0.5554,
"num_tokens": 155726110.0,
"step": 815
},
{
"epoch": 0.27986348122866894,
"grad_norm": 0.4189254439755074,
"learning_rate": 4.383068179390739e-05,
"loss": 0.5435,
"num_tokens": 156709373.0,
"step": 820
},
{
"epoch": 0.2815699658703072,
"grad_norm": 0.4194447248910214,
"learning_rate": 4.3743062102158896e-05,
"loss": 0.5318,
"num_tokens": 157605031.0,
"step": 825
},
{
"epoch": 0.2832764505119454,
"grad_norm": 0.3731419235770229,
"learning_rate": 4.3654924946467724e-05,
"loss": 0.517,
"num_tokens": 158541316.0,
"step": 830
},
{
"epoch": 0.28498293515358364,
"grad_norm": 0.3749285811277496,
"learning_rate": 4.3566273134666525e-05,
"loss": 0.5494,
"num_tokens": 159525622.0,
"step": 835
},
{
"epoch": 0.28668941979522183,
"grad_norm": 0.4309015308982524,
"learning_rate": 4.3477109490983626e-05,
"loss": 0.5424,
"num_tokens": 160459756.0,
"step": 840
},
{
"epoch": 0.2883959044368601,
"grad_norm": 0.3986942602299617,
"learning_rate": 4.338743685595304e-05,
"loss": 0.5228,
"num_tokens": 161382919.0,
"step": 845
},
{
"epoch": 0.2901023890784983,
"grad_norm": 0.4258708102475343,
"learning_rate": 4.329725808632403e-05,
"loss": 0.5365,
"num_tokens": 162358277.0,
"step": 850
},
{
"epoch": 0.29180887372013653,
"grad_norm": 0.38918974623711616,
"learning_rate": 4.320657605497001e-05,
"loss": 0.5522,
"num_tokens": 163332894.0,
"step": 855
},
{
"epoch": 0.2935153583617747,
"grad_norm": 0.41923964286389137,
"learning_rate": 4.3115393650797095e-05,
"loss": 0.5384,
"num_tokens": 164401378.0,
"step": 860
},
{
"epoch": 0.295221843003413,
"grad_norm": 0.4156858487356743,
"learning_rate": 4.3023713778652074e-05,
"loss": 0.5049,
"num_tokens": 165316411.0,
"step": 865
},
{
"epoch": 0.29692832764505117,
"grad_norm": 0.39774241581331693,
"learning_rate": 4.2931539359229804e-05,
"loss": 0.5192,
"num_tokens": 166276916.0,
"step": 870
},
{
"epoch": 0.2986348122866894,
"grad_norm": 0.35406058199730683,
"learning_rate": 4.283887332898019e-05,
"loss": 0.5127,
"num_tokens": 167298025.0,
"step": 875
},
{
"epoch": 0.3003412969283277,
"grad_norm": 0.3843676536025804,
"learning_rate": 4.2745718640014696e-05,
"loss": 0.5318,
"num_tokens": 168250987.0,
"step": 880
},
{
"epoch": 0.30204778156996587,
"grad_norm": 0.38653674163211976,
"learning_rate": 4.265207826001219e-05,
"loss": 0.5336,
"num_tokens": 169245557.0,
"step": 885
},
{
"epoch": 0.3037542662116041,
"grad_norm": 0.4322065456917861,
"learning_rate": 4.255795517212451e-05,
"loss": 0.5489,
"num_tokens": 170217424.0,
"step": 890
},
{
"epoch": 0.3054607508532423,
"grad_norm": 0.41164873108292127,
"learning_rate": 4.246335237488136e-05,
"loss": 0.5171,
"num_tokens": 171143325.0,
"step": 895
},
{
"epoch": 0.30716723549488056,
"grad_norm": 0.4104853255150601,
"learning_rate": 4.236827288209478e-05,
"loss": 0.5223,
"num_tokens": 172160313.0,
"step": 900
},
{
"epoch": 0.30887372013651876,
"grad_norm": 0.43043638343636487,
"learning_rate": 4.2272719722763197e-05,
"loss": 0.5246,
"num_tokens": 173195128.0,
"step": 905
},
{
"epoch": 0.310580204778157,
"grad_norm": 0.41502896344603557,
"learning_rate": 4.217669594097485e-05,
"loss": 0.5379,
"num_tokens": 174112017.0,
"step": 910
},
{
"epoch": 0.3122866894197952,
"grad_norm": 0.40961938646342305,
"learning_rate": 4.208020459581087e-05,
"loss": 0.5343,
"num_tokens": 175151908.0,
"step": 915
},
{
"epoch": 0.31399317406143346,
"grad_norm": 0.4009346397822526,
"learning_rate": 4.19832487612478e-05,
"loss": 0.5057,
"num_tokens": 176115268.0,
"step": 920
},
{
"epoch": 0.31569965870307165,
"grad_norm": 0.38967085585332806,
"learning_rate": 4.1885831526059674e-05,
"loss": 0.5108,
"num_tokens": 177084976.0,
"step": 925
},
{
"epoch": 0.3174061433447099,
"grad_norm": 0.3386319476796638,
"learning_rate": 4.178795599371961e-05,
"loss": 0.4975,
"num_tokens": 178049137.0,
"step": 930
},
{
"epoch": 0.3191126279863481,
"grad_norm": 0.430097609031975,
"learning_rate": 4.168962528230096e-05,
"loss": 0.5321,
"num_tokens": 178990489.0,
"step": 935
},
{
"epoch": 0.32081911262798635,
"grad_norm": 0.3918509038411313,
"learning_rate": 4.1590842524377914e-05,
"loss": 0.5297,
"num_tokens": 179947208.0,
"step": 940
},
{
"epoch": 0.3225255972696246,
"grad_norm": 0.4113406051465179,
"learning_rate": 4.149161086692581e-05,
"loss": 0.5375,
"num_tokens": 180895300.0,
"step": 945
},
{
"epoch": 0.3242320819112628,
"grad_norm": 0.3812963085578539,
"learning_rate": 4.139193347122077e-05,
"loss": 0.5323,
"num_tokens": 181891310.0,
"step": 950
},
{
"epoch": 0.32593856655290104,
"grad_norm": 0.427163248722511,
"learning_rate": 4.1291813512739074e-05,
"loss": 0.53,
"num_tokens": 182829455.0,
"step": 955
},
{
"epoch": 0.32764505119453924,
"grad_norm": 0.3684699313638392,
"learning_rate": 4.1191254181055936e-05,
"loss": 0.52,
"num_tokens": 183776326.0,
"step": 960
},
{
"epoch": 0.3293515358361775,
"grad_norm": 0.36070078090014446,
"learning_rate": 4.1090258679743934e-05,
"loss": 0.5176,
"num_tokens": 184739434.0,
"step": 965
},
{
"epoch": 0.3310580204778157,
"grad_norm": 0.45870874889230046,
"learning_rate": 4.098883022627094e-05,
"loss": 0.5657,
"num_tokens": 185721070.0,
"step": 970
},
{
"epoch": 0.33276450511945393,
"grad_norm": 0.4188344945747834,
"learning_rate": 4.0886972051897594e-05,
"loss": 0.533,
"num_tokens": 186739113.0,
"step": 975
},
{
"epoch": 0.33447098976109213,
"grad_norm": 0.45500040021056365,
"learning_rate": 4.078468740157439e-05,
"loss": 0.537,
"num_tokens": 187730193.0,
"step": 980
},
{
"epoch": 0.3361774744027304,
"grad_norm": 0.40761293819970623,
"learning_rate": 4.068197953383832e-05,
"loss": 0.5221,
"num_tokens": 188652348.0,
"step": 985
},
{
"epoch": 0.3378839590443686,
"grad_norm": 0.4263098333005298,
"learning_rate": 4.0578851720709e-05,
"loss": 0.519,
"num_tokens": 189611145.0,
"step": 990
},
{
"epoch": 0.3395904436860068,
"grad_norm": 0.35780922785817854,
"learning_rate": 4.047530724758451e-05,
"loss": 0.5263,
"num_tokens": 190593764.0,
"step": 995
},
{
"epoch": 0.3412969283276451,
"grad_norm": 0.3827403926446559,
"learning_rate": 4.037134941313668e-05,
"loss": 0.5182,
"num_tokens": 191543591.0,
"step": 1000
},
{
"epoch": 0.3430034129692833,
"grad_norm": 0.391417552124784,
"learning_rate": 4.026698152920599e-05,
"loss": 0.514,
"num_tokens": 192536034.0,
"step": 1005
},
{
"epoch": 0.3447098976109215,
"grad_norm": 0.3750506973668557,
"learning_rate": 4.016220692069612e-05,
"loss": 0.5227,
"num_tokens": 193451364.0,
"step": 1010
},
{
"epoch": 0.3464163822525597,
"grad_norm": 0.40025338962482404,
"learning_rate": 4.005702892546798e-05,
"loss": 0.534,
"num_tokens": 194391841.0,
"step": 1015
},
{
"epoch": 0.34812286689419797,
"grad_norm": 0.36297817083661593,
"learning_rate": 3.9951450894233365e-05,
"loss": 0.5183,
"num_tokens": 195399830.0,
"step": 1020
},
{
"epoch": 0.34982935153583616,
"grad_norm": 0.3598098656728241,
"learning_rate": 3.984547619044827e-05,
"loss": 0.5115,
"num_tokens": 196363387.0,
"step": 1025
},
{
"epoch": 0.3515358361774744,
"grad_norm": 0.38480064839017475,
"learning_rate": 3.973910819020567e-05,
"loss": 0.5009,
"num_tokens": 197289380.0,
"step": 1030
},
{
"epoch": 0.3532423208191126,
"grad_norm": 0.37900568253469097,
"learning_rate": 3.963235028212802e-05,
"loss": 0.5334,
"num_tokens": 198263603.0,
"step": 1035
},
{
"epoch": 0.35494880546075086,
"grad_norm": 0.35505267262656565,
"learning_rate": 3.9525205867259246e-05,
"loss": 0.4977,
"num_tokens": 199248080.0,
"step": 1040
},
{
"epoch": 0.35665529010238906,
"grad_norm": 0.4178322580187183,
"learning_rate": 3.941767835895647e-05,
"loss": 0.5247,
"num_tokens": 200182740.0,
"step": 1045
},
{
"epoch": 0.3583617747440273,
"grad_norm": 0.3886991806809196,
"learning_rate": 3.9309771182781194e-05,
"loss": 0.5592,
"num_tokens": 201135153.0,
"step": 1050
},
{
"epoch": 0.36006825938566556,
"grad_norm": 0.3729591088065611,
"learning_rate": 3.9201487776390215e-05,
"loss": 0.5174,
"num_tokens": 202034499.0,
"step": 1055
},
{
"epoch": 0.36177474402730375,
"grad_norm": 0.43050604928861597,
"learning_rate": 3.90928315894261e-05,
"loss": 0.5203,
"num_tokens": 202936389.0,
"step": 1060
},
{
"epoch": 0.363481228668942,
"grad_norm": 0.3903868868255724,
"learning_rate": 3.898380608340728e-05,
"loss": 0.5121,
"num_tokens": 203808949.0,
"step": 1065
},
{
"epoch": 0.3651877133105802,
"grad_norm": 0.43315506398310644,
"learning_rate": 3.887441473161779e-05,
"loss": 0.5268,
"num_tokens": 204803047.0,
"step": 1070
},
{
"epoch": 0.36689419795221845,
"grad_norm": 0.3826063463200933,
"learning_rate": 3.87646610189966e-05,
"loss": 0.526,
"num_tokens": 205764413.0,
"step": 1075
},
{
"epoch": 0.36860068259385664,
"grad_norm": 0.3804292887775751,
"learning_rate": 3.8654548442026615e-05,
"loss": 0.5121,
"num_tokens": 206795183.0,
"step": 1080
},
{
"epoch": 0.3703071672354949,
"grad_norm": 0.386486637072244,
"learning_rate": 3.854408050862326e-05,
"loss": 0.5197,
"num_tokens": 207776278.0,
"step": 1085
},
{
"epoch": 0.3720136518771331,
"grad_norm": 0.3679656863557538,
"learning_rate": 3.843326073802275e-05,
"loss": 0.5109,
"num_tokens": 208806680.0,
"step": 1090
},
{
"epoch": 0.37372013651877134,
"grad_norm": 0.36341929778692034,
"learning_rate": 3.832209266066996e-05,
"loss": 0.5117,
"num_tokens": 209705965.0,
"step": 1095
},
{
"epoch": 0.37542662116040953,
"grad_norm": 0.36095775324919993,
"learning_rate": 3.821057981810597e-05,
"loss": 0.5173,
"num_tokens": 210620229.0,
"step": 1100
},
{
"epoch": 0.3771331058020478,
"grad_norm": 0.37920120200226964,
"learning_rate": 3.809872576285522e-05,
"loss": 0.5278,
"num_tokens": 211518941.0,
"step": 1105
},
{
"epoch": 0.378839590443686,
"grad_norm": 0.37125338141855646,
"learning_rate": 3.798653405831236e-05,
"loss": 0.5213,
"num_tokens": 212481056.0,
"step": 1110
},
{
"epoch": 0.38054607508532423,
"grad_norm": 0.39363430670991295,
"learning_rate": 3.78740082786287e-05,
"loss": 0.5081,
"num_tokens": 213401212.0,
"step": 1115
},
{
"epoch": 0.3822525597269625,
"grad_norm": 0.41301840155723674,
"learning_rate": 3.7761152008598356e-05,
"loss": 0.5262,
"num_tokens": 214296967.0,
"step": 1120
},
{
"epoch": 0.3839590443686007,
"grad_norm": 0.4004388299714662,
"learning_rate": 3.764796884354408e-05,
"loss": 0.5295,
"num_tokens": 215306580.0,
"step": 1125
},
{
"epoch": 0.3856655290102389,
"grad_norm": 0.35857577455632367,
"learning_rate": 3.7534462389202655e-05,
"loss": 0.5328,
"num_tokens": 216266686.0,
"step": 1130
},
{
"epoch": 0.3873720136518771,
"grad_norm": 0.357927002022941,
"learning_rate": 3.742063626161011e-05,
"loss": 0.5307,
"num_tokens": 217244190.0,
"step": 1135
},
{
"epoch": 0.3890784982935154,
"grad_norm": 0.4316739204850572,
"learning_rate": 3.7306494086986424e-05,
"loss": 0.5115,
"num_tokens": 218179883.0,
"step": 1140
},
{
"epoch": 0.39078498293515357,
"grad_norm": 0.3236368199361241,
"learning_rate": 3.7192039501620114e-05,
"loss": 0.5265,
"num_tokens": 219217201.0,
"step": 1145
},
{
"epoch": 0.3924914675767918,
"grad_norm": 0.38665727290508983,
"learning_rate": 3.7077276151752274e-05,
"loss": 0.5137,
"num_tokens": 220144855.0,
"step": 1150
},
{
"epoch": 0.39419795221843,
"grad_norm": 0.37991237354351537,
"learning_rate": 3.696220769346052e-05,
"loss": 0.515,
"num_tokens": 221131861.0,
"step": 1155
},
{
"epoch": 0.39590443686006827,
"grad_norm": 0.4047905746984203,
"learning_rate": 3.6846837792542446e-05,
"loss": 0.5289,
"num_tokens": 222093783.0,
"step": 1160
},
{
"epoch": 0.39761092150170646,
"grad_norm": 0.4226911202426522,
"learning_rate": 3.673117012439889e-05,
"loss": 0.5267,
"num_tokens": 223054352.0,
"step": 1165
},
{
"epoch": 0.3993174061433447,
"grad_norm": 0.37522461139025454,
"learning_rate": 3.6615208373916775e-05,
"loss": 0.4879,
"num_tokens": 223929151.0,
"step": 1170
},
{
"epoch": 0.40102389078498296,
"grad_norm": 0.40207189529772014,
"learning_rate": 3.6498956235351815e-05,
"loss": 0.5245,
"num_tokens": 224865728.0,
"step": 1175
},
{
"epoch": 0.40273037542662116,
"grad_norm": 0.3930165821540444,
"learning_rate": 3.6382417412210744e-05,
"loss": 0.5087,
"num_tokens": 225865685.0,
"step": 1180
},
{
"epoch": 0.4044368600682594,
"grad_norm": 0.3578549306424923,
"learning_rate": 3.6265595617133366e-05,
"loss": 0.4939,
"num_tokens": 226749326.0,
"step": 1185
},
{
"epoch": 0.4061433447098976,
"grad_norm": 0.36807883381787004,
"learning_rate": 3.6148494571774275e-05,
"loss": 0.5286,
"num_tokens": 227786006.0,
"step": 1190
},
{
"epoch": 0.40784982935153585,
"grad_norm": 0.41611237665417367,
"learning_rate": 3.603111800668428e-05,
"loss": 0.5099,
"num_tokens": 228763631.0,
"step": 1195
},
{
"epoch": 0.40955631399317405,
"grad_norm": 0.3799547117691007,
"learning_rate": 3.591346966119159e-05,
"loss": 0.5094,
"num_tokens": 229748231.0,
"step": 1200
},
{
"epoch": 0.4112627986348123,
"grad_norm": 0.3478677676406051,
"learning_rate": 3.579555328328265e-05,
"loss": 0.5117,
"num_tokens": 230738165.0,
"step": 1205
},
{
"epoch": 0.4129692832764505,
"grad_norm": 0.3296304695850409,
"learning_rate": 3.5677372629482775e-05,
"loss": 0.521,
"num_tokens": 231716185.0,
"step": 1210
},
{
"epoch": 0.41467576791808874,
"grad_norm": 0.382979600202463,
"learning_rate": 3.555893146473644e-05,
"loss": 0.5262,
"num_tokens": 232698142.0,
"step": 1215
},
{
"epoch": 0.41638225255972694,
"grad_norm": 0.35116928864638025,
"learning_rate": 3.5440233562287376e-05,
"loss": 0.5417,
"num_tokens": 233655900.0,
"step": 1220
},
{
"epoch": 0.4180887372013652,
"grad_norm": 0.3728136743436132,
"learning_rate": 3.532128270355832e-05,
"loss": 0.516,
"num_tokens": 234596302.0,
"step": 1225
},
{
"epoch": 0.4197952218430034,
"grad_norm": 0.4116335593681845,
"learning_rate": 3.520208267803059e-05,
"loss": 0.5242,
"num_tokens": 235502719.0,
"step": 1230
},
{
"epoch": 0.42150170648464164,
"grad_norm": 0.38580680323921696,
"learning_rate": 3.508263728312336e-05,
"loss": 0.5278,
"num_tokens": 236475023.0,
"step": 1235
},
{
"epoch": 0.4232081911262799,
"grad_norm": 0.38551884251450047,
"learning_rate": 3.496295032407263e-05,
"loss": 0.5229,
"num_tokens": 237433481.0,
"step": 1240
},
{
"epoch": 0.4249146757679181,
"grad_norm": 0.38557583134333984,
"learning_rate": 3.484302561381007e-05,
"loss": 0.5029,
"num_tokens": 238378423.0,
"step": 1245
},
{
"epoch": 0.42662116040955633,
"grad_norm": 0.40297119331954867,
"learning_rate": 3.47228669728415e-05,
"loss": 0.5288,
"num_tokens": 239310469.0,
"step": 1250
},
{
"epoch": 0.4283276450511945,
"grad_norm": 0.36323193941329485,
"learning_rate": 3.4602478229125197e-05,
"loss": 0.5178,
"num_tokens": 240265629.0,
"step": 1255
},
{
"epoch": 0.4300341296928328,
"grad_norm": 0.4010782882289214,
"learning_rate": 3.4481863217949964e-05,
"loss": 0.5211,
"num_tokens": 241153898.0,
"step": 1260
},
{
"epoch": 0.431740614334471,
"grad_norm": 0.3725440207487266,
"learning_rate": 3.43610257818129e-05,
"loss": 0.5339,
"num_tokens": 242074086.0,
"step": 1265
},
{
"epoch": 0.4334470989761092,
"grad_norm": 0.3573796647038546,
"learning_rate": 3.4239969770297033e-05,
"loss": 0.5275,
"num_tokens": 243032696.0,
"step": 1270
},
{
"epoch": 0.4351535836177474,
"grad_norm": 0.3500180473141626,
"learning_rate": 3.411869903994867e-05,
"loss": 0.5237,
"num_tokens": 244052484.0,
"step": 1275
},
{
"epoch": 0.43686006825938567,
"grad_norm": 0.3630134268573173,
"learning_rate": 3.399721745415451e-05,
"loss": 0.4863,
"num_tokens": 245008254.0,
"step": 1280
},
{
"epoch": 0.43856655290102387,
"grad_norm": 0.3276111054395765,
"learning_rate": 3.38755288830186e-05,
"loss": 0.5239,
"num_tokens": 246076299.0,
"step": 1285
},
{
"epoch": 0.4402730375426621,
"grad_norm": 0.37922721950544974,
"learning_rate": 3.375363720323904e-05,
"loss": 0.5558,
"num_tokens": 247016964.0,
"step": 1290
},
{
"epoch": 0.44197952218430037,
"grad_norm": 0.4053020528274456,
"learning_rate": 3.363154629798444e-05,
"loss": 0.4991,
"num_tokens": 247913243.0,
"step": 1295
},
{
"epoch": 0.44368600682593856,
"grad_norm": 0.39968767139036077,
"learning_rate": 3.350926005677027e-05,
"loss": 0.5163,
"num_tokens": 248791992.0,
"step": 1300
},
{
"epoch": 0.4453924914675768,
"grad_norm": 0.4196675646397248,
"learning_rate": 3.338678237533491e-05,
"loss": 0.5155,
"num_tokens": 249736240.0,
"step": 1305
},
{
"epoch": 0.447098976109215,
"grad_norm": 0.36733233226120704,
"learning_rate": 3.326411715551559e-05,
"loss": 0.5187,
"num_tokens": 250713070.0,
"step": 1310
},
{
"epoch": 0.44880546075085326,
"grad_norm": 0.3526872487151841,
"learning_rate": 3.314126830512397e-05,
"loss": 0.5183,
"num_tokens": 251635307.0,
"step": 1315
},
{
"epoch": 0.45051194539249145,
"grad_norm": 0.3666740114223966,
"learning_rate": 3.3018239737821806e-05,
"loss": 0.4913,
"num_tokens": 252648795.0,
"step": 1320
},
{
"epoch": 0.4522184300341297,
"grad_norm": 0.3865152760583026,
"learning_rate": 3.289503537299616e-05,
"loss": 0.5326,
"num_tokens": 253618343.0,
"step": 1325
},
{
"epoch": 0.4539249146757679,
"grad_norm": 0.3882700849297493,
"learning_rate": 3.2771659135634564e-05,
"loss": 0.5033,
"num_tokens": 254539106.0,
"step": 1330
},
{
"epoch": 0.45563139931740615,
"grad_norm": 0.37283505962982216,
"learning_rate": 3.2648114956200005e-05,
"loss": 0.5134,
"num_tokens": 255475551.0,
"step": 1335
},
{
"epoch": 0.45733788395904434,
"grad_norm": 0.3736180838966003,
"learning_rate": 3.2524406770505675e-05,
"loss": 0.5212,
"num_tokens": 256460069.0,
"step": 1340
},
{
"epoch": 0.4590443686006826,
"grad_norm": 0.3711824529096275,
"learning_rate": 3.240053851958961e-05,
"loss": 0.4986,
"num_tokens": 257384246.0,
"step": 1345
},
{
"epoch": 0.46075085324232085,
"grad_norm": 0.35194816416518127,
"learning_rate": 3.227651414958912e-05,
"loss": 0.4996,
"num_tokens": 258439462.0,
"step": 1350
},
{
"epoch": 0.46245733788395904,
"grad_norm": 0.3636023146472485,
"learning_rate": 3.2152337611615096e-05,
"loss": 0.5128,
"num_tokens": 259419905.0,
"step": 1355
},
{
"epoch": 0.4641638225255973,
"grad_norm": 0.3499395456709178,
"learning_rate": 3.202801286162611e-05,
"loss": 0.529,
"num_tokens": 260499223.0,
"step": 1360
},
{
"epoch": 0.4658703071672355,
"grad_norm": 0.344200324854834,
"learning_rate": 3.1903543860302445e-05,
"loss": 0.4954,
"num_tokens": 261442637.0,
"step": 1365
},
{
"epoch": 0.46757679180887374,
"grad_norm": 0.34635826868295416,
"learning_rate": 3.1778934572919805e-05,
"loss": 0.5053,
"num_tokens": 262428104.0,
"step": 1370
},
{
"epoch": 0.46928327645051193,
"grad_norm": 0.3434947744560547,
"learning_rate": 3.165418896922313e-05,
"loss": 0.4892,
"num_tokens": 263310660.0,
"step": 1375
},
{
"epoch": 0.4709897610921502,
"grad_norm": 0.36603548416607035,
"learning_rate": 3.152931102330002e-05,
"loss": 0.5193,
"num_tokens": 264331327.0,
"step": 1380
},
{
"epoch": 0.4726962457337884,
"grad_norm": 0.36277672500545255,
"learning_rate": 3.140430471345419e-05,
"loss": 0.5103,
"num_tokens": 265270147.0,
"step": 1385
},
{
"epoch": 0.47440273037542663,
"grad_norm": 0.36093915798723425,
"learning_rate": 3.127917402207871e-05,
"loss": 0.5125,
"num_tokens": 266242185.0,
"step": 1390
},
{
"epoch": 0.4761092150170648,
"grad_norm": 0.3391219737377559,
"learning_rate": 3.115392293552915e-05,
"loss": 0.5119,
"num_tokens": 267191130.0,
"step": 1395
},
{
"epoch": 0.4778156996587031,
"grad_norm": 0.35339290653989003,
"learning_rate": 3.1028555443996544e-05,
"loss": 0.5099,
"num_tokens": 268142845.0,
"step": 1400
},
{
"epoch": 0.47952218430034127,
"grad_norm": 0.3520073614880941,
"learning_rate": 3.090307554138033e-05,
"loss": 0.527,
"num_tokens": 269116555.0,
"step": 1405
},
{
"epoch": 0.4812286689419795,
"grad_norm": 0.32746517176816037,
"learning_rate": 3.0777487225161096e-05,
"loss": 0.5171,
"num_tokens": 270078357.0,
"step": 1410
},
{
"epoch": 0.48293515358361777,
"grad_norm": 0.3801899716265933,
"learning_rate": 3.065179449627316e-05,
"loss": 0.5179,
"num_tokens": 271065401.0,
"step": 1415
},
{
"epoch": 0.48464163822525597,
"grad_norm": 0.34972336470548876,
"learning_rate": 3.0526001358977254e-05,
"loss": 0.5192,
"num_tokens": 272018748.0,
"step": 1420
},
{
"epoch": 0.4863481228668942,
"grad_norm": 0.4088433608953109,
"learning_rate": 3.0400111820732802e-05,
"loss": 0.5202,
"num_tokens": 273051158.0,
"step": 1425
},
{
"epoch": 0.4880546075085324,
"grad_norm": 0.4045530258228776,
"learning_rate": 3.0274129892070368e-05,
"loss": 0.5363,
"num_tokens": 274027158.0,
"step": 1430
},
{
"epoch": 0.48976109215017066,
"grad_norm": 0.37760264633069307,
"learning_rate": 3.014805958646383e-05,
"loss": 0.5071,
"num_tokens": 274976608.0,
"step": 1435
},
{
"epoch": 0.49146757679180886,
"grad_norm": 0.41544044167849326,
"learning_rate": 3.002190492020255e-05,
"loss": 0.5336,
"num_tokens": 275897357.0,
"step": 1440
},
{
"epoch": 0.4931740614334471,
"grad_norm": 0.3621173202789576,
"learning_rate": 2.9895669912263393e-05,
"loss": 0.4884,
"num_tokens": 276767022.0,
"step": 1445
},
{
"epoch": 0.4948805460750853,
"grad_norm": 0.35971250539401595,
"learning_rate": 2.9769358584182732e-05,
"loss": 0.4929,
"num_tokens": 277733458.0,
"step": 1450
},
{
"epoch": 0.49658703071672355,
"grad_norm": 0.3333198556836057,
"learning_rate": 2.9642974959928293e-05,
"loss": 0.5181,
"num_tokens": 278655070.0,
"step": 1455
},
{
"epoch": 0.49829351535836175,
"grad_norm": 0.40269115529983035,
"learning_rate": 2.9516523065771e-05,
"loss": 0.5092,
"num_tokens": 279550428.0,
"step": 1460
},
{
"epoch": 0.5,
"grad_norm": 0.3375843000024675,
"learning_rate": 2.9390006930156683e-05,
"loss": 0.5035,
"num_tokens": 280592599.0,
"step": 1465
},
{
"epoch": 0.5017064846416383,
"grad_norm": 0.33837995836306645,
"learning_rate": 2.9263430583577715e-05,
"loss": 0.4936,
"num_tokens": 281502549.0,
"step": 1470
},
{
"epoch": 0.5034129692832765,
"grad_norm": 0.3491138042125671,
"learning_rate": 2.9136798058444704e-05,
"loss": 0.5186,
"num_tokens": 282554594.0,
"step": 1475
},
{
"epoch": 0.5051194539249146,
"grad_norm": 0.36149705403685856,
"learning_rate": 2.9010113388957906e-05,
"loss": 0.4996,
"num_tokens": 283508120.0,
"step": 1480
},
{
"epoch": 0.5068259385665529,
"grad_norm": 0.3682570368717468,
"learning_rate": 2.8883380610978804e-05,
"loss": 0.4868,
"num_tokens": 284430674.0,
"step": 1485
},
{
"epoch": 0.5085324232081911,
"grad_norm": 0.3450199220270282,
"learning_rate": 2.875660376190149e-05,
"loss": 0.5225,
"num_tokens": 285480194.0,
"step": 1490
},
{
"epoch": 0.5102389078498294,
"grad_norm": 0.35852992172619397,
"learning_rate": 2.8629786880524057e-05,
"loss": 0.5044,
"num_tokens": 286426656.0,
"step": 1495
},
{
"epoch": 0.5119453924914675,
"grad_norm": 0.35758605357343837,
"learning_rate": 2.8502934006919908e-05,
"loss": 0.531,
"num_tokens": 287419124.0,
"step": 1500
},
{
"epoch": 0.5136518771331058,
"grad_norm": 0.3679994961058525,
"learning_rate": 2.83760491823091e-05,
"loss": 0.4891,
"num_tokens": 288343301.0,
"step": 1505
},
{
"epoch": 0.515358361774744,
"grad_norm": 0.386078898523489,
"learning_rate": 2.824913644892955e-05,
"loss": 0.4912,
"num_tokens": 289306762.0,
"step": 1510
},
{
"epoch": 0.5170648464163823,
"grad_norm": 0.33205328887110974,
"learning_rate": 2.8122199849908286e-05,
"loss": 0.5047,
"num_tokens": 290236538.0,
"step": 1515
},
{
"epoch": 0.5187713310580204,
"grad_norm": 0.36598920383011924,
"learning_rate": 2.7995243429132644e-05,
"loss": 0.5082,
"num_tokens": 291105578.0,
"step": 1520
},
{
"epoch": 0.5204778156996587,
"grad_norm": 0.3577740364047028,
"learning_rate": 2.7868271231121406e-05,
"loss": 0.5271,
"num_tokens": 292089939.0,
"step": 1525
},
{
"epoch": 0.5221843003412969,
"grad_norm": 0.3395160116353141,
"learning_rate": 2.7741287300896013e-05,
"loss": 0.4958,
"num_tokens": 293082816.0,
"step": 1530
},
{
"epoch": 0.5238907849829352,
"grad_norm": 0.3750202797810289,
"learning_rate": 2.7614295683851637e-05,
"loss": 0.5043,
"num_tokens": 293957075.0,
"step": 1535
},
{
"epoch": 0.5255972696245734,
"grad_norm": 0.36593612437784134,
"learning_rate": 2.7487300425628347e-05,
"loss": 0.4999,
"num_tokens": 294930434.0,
"step": 1540
},
{
"epoch": 0.5273037542662116,
"grad_norm": 0.36077082410017175,
"learning_rate": 2.7360305571982213e-05,
"loss": 0.517,
"num_tokens": 295898443.0,
"step": 1545
},
{
"epoch": 0.5290102389078498,
"grad_norm": 0.34084324496634494,
"learning_rate": 2.723331516865641e-05,
"loss": 0.5042,
"num_tokens": 296842807.0,
"step": 1550
},
{
"epoch": 0.5307167235494881,
"grad_norm": 0.3473414935833505,
"learning_rate": 2.7106333261252342e-05,
"loss": 0.5141,
"num_tokens": 297874811.0,
"step": 1555
},
{
"epoch": 0.5324232081911263,
"grad_norm": 0.37129914119401464,
"learning_rate": 2.697936389510073e-05,
"loss": 0.5019,
"num_tokens": 298726998.0,
"step": 1560
},
{
"epoch": 0.5341296928327645,
"grad_norm": 0.36028226696611454,
"learning_rate": 2.685241111513281e-05,
"loss": 0.5116,
"num_tokens": 299723782.0,
"step": 1565
},
{
"epoch": 0.5358361774744027,
"grad_norm": 0.3655240725721465,
"learning_rate": 2.6725478965751378e-05,
"loss": 0.4864,
"num_tokens": 300660125.0,
"step": 1570
},
{
"epoch": 0.537542662116041,
"grad_norm": 0.3527263064768574,
"learning_rate": 2.6598571490702013e-05,
"loss": 0.4997,
"num_tokens": 301489572.0,
"step": 1575
},
{
"epoch": 0.5392491467576792,
"grad_norm": 0.3514385599593041,
"learning_rate": 2.6471692732944227e-05,
"loss": 0.4773,
"num_tokens": 302437719.0,
"step": 1580
},
{
"epoch": 0.5409556313993175,
"grad_norm": 0.5417409716600186,
"learning_rate": 2.634484673452265e-05,
"loss": 0.5256,
"num_tokens": 303463770.0,
"step": 1585
},
{
"epoch": 0.5426621160409556,
"grad_norm": 0.3711273299009024,
"learning_rate": 2.6218037536438315e-05,
"loss": 0.5067,
"num_tokens": 304343518.0,
"step": 1590
},
{
"epoch": 0.5443686006825939,
"grad_norm": 0.38356094617087266,
"learning_rate": 2.6091269178519885e-05,
"loss": 0.5195,
"num_tokens": 305270656.0,
"step": 1595
},
{
"epoch": 0.5460750853242321,
"grad_norm": 0.36905989432450675,
"learning_rate": 2.5964545699294906e-05,
"loss": 0.5049,
"num_tokens": 306180961.0,
"step": 1600
},
{
"epoch": 0.5477815699658704,
"grad_norm": 0.3692380599855385,
"learning_rate": 2.583787113586126e-05,
"loss": 0.5315,
"num_tokens": 307152419.0,
"step": 1605
},
{
"epoch": 0.5494880546075085,
"grad_norm": 0.3565584564996635,
"learning_rate": 2.571124952375845e-05,
"loss": 0.5028,
"num_tokens": 308076053.0,
"step": 1610
},
{
"epoch": 0.5511945392491467,
"grad_norm": 0.6182642964929036,
"learning_rate": 2.55846848968391e-05,
"loss": 0.5168,
"num_tokens": 309029777.0,
"step": 1615
},
{
"epoch": 0.552901023890785,
"grad_norm": 0.3804959240952762,
"learning_rate": 2.545818128714043e-05,
"loss": 0.4985,
"num_tokens": 310003006.0,
"step": 1620
},
{
"epoch": 0.5546075085324232,
"grad_norm": 0.3585357836902996,
"learning_rate": 2.533174272475579e-05,
"loss": 0.4889,
"num_tokens": 310946881.0,
"step": 1625
},
{
"epoch": 0.5563139931740614,
"grad_norm": 0.36197226313264375,
"learning_rate": 2.52053732377063e-05,
"loss": 0.5011,
"num_tokens": 311908102.0,
"step": 1630
},
{
"epoch": 0.5580204778156996,
"grad_norm": 0.40032263570721643,
"learning_rate": 2.5079076851812476e-05,
"loss": 0.5089,
"num_tokens": 312808887.0,
"step": 1635
},
{
"epoch": 0.5597269624573379,
"grad_norm": 0.3451971824355068,
"learning_rate": 2.4952857590566043e-05,
"loss": 0.493,
"num_tokens": 313777123.0,
"step": 1640
},
{
"epoch": 0.5614334470989761,
"grad_norm": 0.3900504406034111,
"learning_rate": 2.4826719475001714e-05,
"loss": 0.5094,
"num_tokens": 314739056.0,
"step": 1645
},
{
"epoch": 0.5631399317406144,
"grad_norm": 0.3493313787920549,
"learning_rate": 2.4700666523569106e-05,
"loss": 0.4898,
"num_tokens": 315742426.0,
"step": 1650
},
{
"epoch": 0.5648464163822525,
"grad_norm": 0.3438326732935695,
"learning_rate": 2.4574702752004703e-05,
"loss": 0.5175,
"num_tokens": 316737000.0,
"step": 1655
},
{
"epoch": 0.5665529010238908,
"grad_norm": 0.35276865905225485,
"learning_rate": 2.444883217320395e-05,
"loss": 0.495,
"num_tokens": 317577413.0,
"step": 1660
},
{
"epoch": 0.568259385665529,
"grad_norm": 0.3407109448961259,
"learning_rate": 2.4323058797093395e-05,
"loss": 0.504,
"num_tokens": 318577669.0,
"step": 1665
},
{
"epoch": 0.5699658703071673,
"grad_norm": 0.4001170454014252,
"learning_rate": 2.4197386630502965e-05,
"loss": 0.4969,
"num_tokens": 319557900.0,
"step": 1670
},
{
"epoch": 0.5716723549488054,
"grad_norm": 0.34709303207454156,
"learning_rate": 2.407181967703826e-05,
"loss": 0.5009,
"num_tokens": 320511805.0,
"step": 1675
},
{
"epoch": 0.5733788395904437,
"grad_norm": 0.371204319077067,
"learning_rate": 2.3946361936953092e-05,
"loss": 0.5075,
"num_tokens": 321462994.0,
"step": 1680
},
{
"epoch": 0.5750853242320819,
"grad_norm": 0.35657413743817584,
"learning_rate": 2.382101740702199e-05,
"loss": 0.4846,
"num_tokens": 322380429.0,
"step": 1685
},
{
"epoch": 0.5767918088737202,
"grad_norm": 0.3373065447243538,
"learning_rate": 2.369579008041286e-05,
"loss": 0.5064,
"num_tokens": 323355363.0,
"step": 1690
},
{
"epoch": 0.5784982935153583,
"grad_norm": 0.34242628423967963,
"learning_rate": 2.3570683946559835e-05,
"loss": 0.5057,
"num_tokens": 324276849.0,
"step": 1695
},
{
"epoch": 0.5802047781569966,
"grad_norm": 0.3424829522431512,
"learning_rate": 2.3445702991036138e-05,
"loss": 0.4915,
"num_tokens": 325155802.0,
"step": 1700
},
{
"epoch": 0.5819112627986348,
"grad_norm": 0.38418791267218005,
"learning_rate": 2.332085119542711e-05,
"loss": 0.4747,
"num_tokens": 325996402.0,
"step": 1705
},
{
"epoch": 0.5836177474402731,
"grad_norm": 0.3720571745285186,
"learning_rate": 2.319613253720338e-05,
"loss": 0.5314,
"num_tokens": 326956942.0,
"step": 1710
},
{
"epoch": 0.5853242320819113,
"grad_norm": 0.35719133127267255,
"learning_rate": 2.3071550989594133e-05,
"loss": 0.5122,
"num_tokens": 327985119.0,
"step": 1715
},
{
"epoch": 0.5870307167235495,
"grad_norm": 0.38622831687102893,
"learning_rate": 2.2947110521460567e-05,
"loss": 0.4888,
"num_tokens": 328885222.0,
"step": 1720
},
{
"epoch": 0.5887372013651877,
"grad_norm": 0.35596529449616776,
"learning_rate": 2.2822815097169447e-05,
"loss": 0.5065,
"num_tokens": 329923181.0,
"step": 1725
},
{
"epoch": 0.590443686006826,
"grad_norm": 0.36830928128106777,
"learning_rate": 2.269866867646675e-05,
"loss": 0.4908,
"num_tokens": 330878184.0,
"step": 1730
},
{
"epoch": 0.5921501706484642,
"grad_norm": 0.333509463984278,
"learning_rate": 2.2574675214351622e-05,
"loss": 0.4683,
"num_tokens": 331849770.0,
"step": 1735
},
{
"epoch": 0.5938566552901023,
"grad_norm": 0.3657723804929488,
"learning_rate": 2.245083866095029e-05,
"loss": 0.498,
"num_tokens": 332821824.0,
"step": 1740
},
{
"epoch": 0.5955631399317406,
"grad_norm": 0.3709027281629384,
"learning_rate": 2.2327162961390254e-05,
"loss": 0.5101,
"num_tokens": 333794769.0,
"step": 1745
},
{
"epoch": 0.5972696245733788,
"grad_norm": 0.34103457669200804,
"learning_rate": 2.2203652055674633e-05,
"loss": 0.4935,
"num_tokens": 334798404.0,
"step": 1750
},
{
"epoch": 0.5989761092150171,
"grad_norm": 0.3545726343474071,
"learning_rate": 2.20803098785566e-05,
"loss": 0.4833,
"num_tokens": 335687213.0,
"step": 1755
},
{
"epoch": 0.6006825938566553,
"grad_norm": 0.3340674324445749,
"learning_rate": 2.1957140359414063e-05,
"loss": 0.4651,
"num_tokens": 336651049.0,
"step": 1760
},
{
"epoch": 0.6023890784982935,
"grad_norm": 0.36127627803351964,
"learning_rate": 2.1834147422124463e-05,
"loss": 0.4772,
"num_tokens": 337519072.0,
"step": 1765
},
{
"epoch": 0.6040955631399317,
"grad_norm": 0.3835855921099779,
"learning_rate": 2.1711334984939767e-05,
"loss": 0.5155,
"num_tokens": 338438000.0,
"step": 1770
},
{
"epoch": 0.60580204778157,
"grad_norm": 0.30765262583419745,
"learning_rate": 2.1588706960361682e-05,
"loss": 0.5165,
"num_tokens": 339506233.0,
"step": 1775
},
{
"epoch": 0.6075085324232082,
"grad_norm": 0.3248110534125549,
"learning_rate": 2.146626725501697e-05,
"loss": 0.4952,
"num_tokens": 340414967.0,
"step": 1780
},
{
"epoch": 0.6092150170648464,
"grad_norm": 0.3774926928186119,
"learning_rate": 2.134401976953299e-05,
"loss": 0.5206,
"num_tokens": 341348316.0,
"step": 1785
},
{
"epoch": 0.6109215017064846,
"grad_norm": 0.3532666917926633,
"learning_rate": 2.1221968398413477e-05,
"loss": 0.4882,
"num_tokens": 342244656.0,
"step": 1790
},
{
"epoch": 0.6126279863481229,
"grad_norm": 0.3502010878134099,
"learning_rate": 2.1100117029914434e-05,
"loss": 0.4849,
"num_tokens": 343244894.0,
"step": 1795
},
{
"epoch": 0.6143344709897611,
"grad_norm": 0.34456710514750377,
"learning_rate": 2.0978469545920254e-05,
"loss": 0.5066,
"num_tokens": 344295726.0,
"step": 1800
},
{
"epoch": 0.6160409556313993,
"grad_norm": 0.32734359098421567,
"learning_rate": 2.0857029821820113e-05,
"loss": 0.5014,
"num_tokens": 345312852.0,
"step": 1805
},
{
"epoch": 0.6177474402730375,
"grad_norm": 0.37196057474243177,
"learning_rate": 2.0735801726384436e-05,
"loss": 0.5103,
"num_tokens": 346263433.0,
"step": 1810
},
{
"epoch": 0.6194539249146758,
"grad_norm": 0.32459376752536473,
"learning_rate": 2.0614789121641688e-05,
"loss": 0.5038,
"num_tokens": 347219412.0,
"step": 1815
},
{
"epoch": 0.621160409556314,
"grad_norm": 0.36986659746774475,
"learning_rate": 2.0493995862755333e-05,
"loss": 0.4975,
"num_tokens": 348137882.0,
"step": 1820
},
{
"epoch": 0.6228668941979523,
"grad_norm": 0.4026711619598764,
"learning_rate": 2.0373425797901024e-05,
"loss": 0.5169,
"num_tokens": 349064203.0,
"step": 1825
},
{
"epoch": 0.6245733788395904,
"grad_norm": 0.3497696696697358,
"learning_rate": 2.0253082768143976e-05,
"loss": 0.4985,
"num_tokens": 349987787.0,
"step": 1830
},
{
"epoch": 0.6262798634812287,
"grad_norm": 0.35282576206861677,
"learning_rate": 2.0132970607316677e-05,
"loss": 0.4961,
"num_tokens": 350963679.0,
"step": 1835
},
{
"epoch": 0.6279863481228669,
"grad_norm": 0.33944555479530397,
"learning_rate": 2.0013093141896634e-05,
"loss": 0.4743,
"num_tokens": 351875623.0,
"step": 1840
},
{
"epoch": 0.6296928327645052,
"grad_norm": 0.3720204410754147,
"learning_rate": 1.989345419088458e-05,
"loss": 0.4853,
"num_tokens": 352834128.0,
"step": 1845
},
{
"epoch": 0.6313993174061433,
"grad_norm": 0.3288718937132713,
"learning_rate": 1.9774057565682768e-05,
"loss": 0.4954,
"num_tokens": 353796065.0,
"step": 1850
},
{
"epoch": 0.6331058020477816,
"grad_norm": 0.3227249405857713,
"learning_rate": 1.965490706997351e-05,
"loss": 0.4869,
"num_tokens": 354752780.0,
"step": 1855
},
{
"epoch": 0.6348122866894198,
"grad_norm": 0.3312191918205175,
"learning_rate": 1.9536006499598085e-05,
"loss": 0.4953,
"num_tokens": 355697743.0,
"step": 1860
},
{
"epoch": 0.636518771331058,
"grad_norm": 0.3273924968177549,
"learning_rate": 1.941735964243574e-05,
"loss": 0.4905,
"num_tokens": 356707970.0,
"step": 1865
},
{
"epoch": 0.6382252559726962,
"grad_norm": 0.3733694588578265,
"learning_rate": 1.9298970278283046e-05,
"loss": 0.5312,
"num_tokens": 357607500.0,
"step": 1870
},
{
"epoch": 0.6399317406143344,
"grad_norm": 0.3377666062483733,
"learning_rate": 1.918084217873349e-05,
"loss": 0.5072,
"num_tokens": 358549752.0,
"step": 1875
},
{
"epoch": 0.6416382252559727,
"grad_norm": 0.3795132924337565,
"learning_rate": 1.90629791070573e-05,
"loss": 0.5073,
"num_tokens": 359496768.0,
"step": 1880
},
{
"epoch": 0.643344709897611,
"grad_norm": 0.3393948612318191,
"learning_rate": 1.8945384818081574e-05,
"loss": 0.4666,
"num_tokens": 360449477.0,
"step": 1885
},
{
"epoch": 0.6450511945392492,
"grad_norm": 0.3362849089493312,
"learning_rate": 1.882806305807067e-05,
"loss": 0.4991,
"num_tokens": 361389017.0,
"step": 1890
},
{
"epoch": 0.6467576791808873,
"grad_norm": 0.34043285210954316,
"learning_rate": 1.871101756460682e-05,
"loss": 0.4755,
"num_tokens": 362299106.0,
"step": 1895
},
{
"epoch": 0.6484641638225256,
"grad_norm": 0.3496235222336366,
"learning_rate": 1.8594252066471108e-05,
"loss": 0.4994,
"num_tokens": 363249804.0,
"step": 1900
},
{
"epoch": 0.6501706484641638,
"grad_norm": 0.35219312570924277,
"learning_rate": 1.847777028352463e-05,
"loss": 0.505,
"num_tokens": 364170107.0,
"step": 1905
},
{
"epoch": 0.6518771331058021,
"grad_norm": 0.3675703568778994,
"learning_rate": 1.8361575926590034e-05,
"loss": 0.4798,
"num_tokens": 365084839.0,
"step": 1910
},
{
"epoch": 0.6535836177474402,
"grad_norm": 0.33143812395942357,
"learning_rate": 1.8245672697333288e-05,
"loss": 0.4933,
"num_tokens": 365997466.0,
"step": 1915
},
{
"epoch": 0.6552901023890785,
"grad_norm": 0.32956885993414253,
"learning_rate": 1.8130064288145737e-05,
"loss": 0.4724,
"num_tokens": 366923656.0,
"step": 1920
},
{
"epoch": 0.6569965870307167,
"grad_norm": 0.3282570628204338,
"learning_rate": 1.801475438202648e-05,
"loss": 0.5023,
"num_tokens": 367958683.0,
"step": 1925
},
{
"epoch": 0.658703071672355,
"grad_norm": 0.33433791259781837,
"learning_rate": 1.789974665246507e-05,
"loss": 0.5161,
"num_tokens": 368897813.0,
"step": 1930
},
{
"epoch": 0.6604095563139932,
"grad_norm": 0.33188996124132675,
"learning_rate": 1.7785044763324415e-05,
"loss": 0.4924,
"num_tokens": 369808844.0,
"step": 1935
},
{
"epoch": 0.6621160409556314,
"grad_norm": 0.3305327165261152,
"learning_rate": 1.7670652368724144e-05,
"loss": 0.4928,
"num_tokens": 370786942.0,
"step": 1940
},
{
"epoch": 0.6638225255972696,
"grad_norm": 0.3292219960944428,
"learning_rate": 1.7556573112924135e-05,
"loss": 0.4675,
"num_tokens": 371657863.0,
"step": 1945
},
{
"epoch": 0.6655290102389079,
"grad_norm": 0.31411324035322963,
"learning_rate": 1.7442810630208446e-05,
"loss": 0.4831,
"num_tokens": 372630696.0,
"step": 1950
},
{
"epoch": 0.6672354948805461,
"grad_norm": 0.3217654495266495,
"learning_rate": 1.7329368544769487e-05,
"loss": 0.5029,
"num_tokens": 373650740.0,
"step": 1955
},
{
"epoch": 0.6689419795221843,
"grad_norm": 0.30398529228133475,
"learning_rate": 1.721625047059265e-05,
"loss": 0.4927,
"num_tokens": 374628223.0,
"step": 1960
},
{
"epoch": 0.6706484641638225,
"grad_norm": 0.3690454794887621,
"learning_rate": 1.7103460011341084e-05,
"loss": 0.4882,
"num_tokens": 375573909.0,
"step": 1965
},
{
"epoch": 0.6723549488054608,
"grad_norm": 0.32789625269764505,
"learning_rate": 1.699100076024099e-05,
"loss": 0.4697,
"num_tokens": 376493989.0,
"step": 1970
},
{
"epoch": 0.674061433447099,
"grad_norm": 0.3425335012653658,
"learning_rate": 1.6878876299967018e-05,
"loss": 0.4706,
"num_tokens": 377479804.0,
"step": 1975
},
{
"epoch": 0.6757679180887372,
"grad_norm": 0.3506912584928773,
"learning_rate": 1.6767090202528268e-05,
"loss": 0.4884,
"num_tokens": 378392822.0,
"step": 1980
},
{
"epoch": 0.6774744027303754,
"grad_norm": 0.3366127448224504,
"learning_rate": 1.6655646029154402e-05,
"loss": 0.4757,
"num_tokens": 379328234.0,
"step": 1985
},
{
"epoch": 0.6791808873720137,
"grad_norm": 0.33730121762767445,
"learning_rate": 1.6544547330182234e-05,
"loss": 0.4683,
"num_tokens": 380308538.0,
"step": 1990
},
{
"epoch": 0.6808873720136519,
"grad_norm": 0.3150426444385526,
"learning_rate": 1.6433797644942633e-05,
"loss": 0.4975,
"num_tokens": 381210797.0,
"step": 1995
},
{
"epoch": 0.6825938566552902,
"grad_norm": 0.31875516644753304,
"learning_rate": 1.63234005016477e-05,
"loss": 0.4942,
"num_tokens": 382166430.0,
"step": 2000
},
{
"epoch": 0.6843003412969283,
"grad_norm": 0.32666915132106794,
"learning_rate": 1.6213359417278473e-05,
"loss": 0.5085,
"num_tokens": 383179056.0,
"step": 2005
},
{
"epoch": 0.6860068259385665,
"grad_norm": 0.32712173146500084,
"learning_rate": 1.6103677897472794e-05,
"loss": 0.5003,
"num_tokens": 384075218.0,
"step": 2010
},
{
"epoch": 0.6877133105802048,
"grad_norm": 0.3287315519317715,
"learning_rate": 1.599435943641368e-05,
"loss": 0.4702,
"num_tokens": 384999949.0,
"step": 2015
},
{
"epoch": 0.689419795221843,
"grad_norm": 0.38670376122956074,
"learning_rate": 1.5885407516717987e-05,
"loss": 0.4908,
"num_tokens": 385900887.0,
"step": 2020
},
{
"epoch": 0.6911262798634812,
"grad_norm": 0.35663261838972243,
"learning_rate": 1.577682560932547e-05,
"loss": 0.4978,
"num_tokens": 386870114.0,
"step": 2025
},
{
"epoch": 0.6928327645051194,
"grad_norm": 0.42642527854196,
"learning_rate": 1.566861717338819e-05,
"loss": 0.4906,
"num_tokens": 387782669.0,
"step": 2030
},
{
"epoch": 0.6945392491467577,
"grad_norm": 0.37343767857022897,
"learning_rate": 1.556078565616034e-05,
"loss": 0.4902,
"num_tokens": 388715961.0,
"step": 2035
},
{
"epoch": 0.6962457337883959,
"grad_norm": 0.3284331766719929,
"learning_rate": 1.5453334492888428e-05,
"loss": 0.4776,
"num_tokens": 389650899.0,
"step": 2040
},
{
"epoch": 0.6979522184300341,
"grad_norm": 0.3483829695120365,
"learning_rate": 1.5346267106701762e-05,
"loss": 0.4836,
"num_tokens": 390610942.0,
"step": 2045
},
{
"epoch": 0.6996587030716723,
"grad_norm": 0.32022610770787235,
"learning_rate": 1.5239586908503533e-05,
"loss": 0.5172,
"num_tokens": 391632321.0,
"step": 2050
},
{
"epoch": 0.7013651877133106,
"grad_norm": 0.3514714730074271,
"learning_rate": 1.513329729686203e-05,
"loss": 0.4854,
"num_tokens": 392626976.0,
"step": 2055
},
{
"epoch": 0.7030716723549488,
"grad_norm": 0.3413532016336787,
"learning_rate": 1.502740165790244e-05,
"loss": 0.4856,
"num_tokens": 393493604.0,
"step": 2060
},
{
"epoch": 0.7047781569965871,
"grad_norm": 0.3422857376051543,
"learning_rate": 1.4921903365198914e-05,
"loss": 0.5084,
"num_tokens": 394371570.0,
"step": 2065
},
{
"epoch": 0.7064846416382252,
"grad_norm": 0.3572958103887209,
"learning_rate": 1.481680577966717e-05,
"loss": 0.4963,
"num_tokens": 395329185.0,
"step": 2070
},
{
"epoch": 0.7081911262798635,
"grad_norm": 0.34758264074134787,
"learning_rate": 1.471211224945736e-05,
"loss": 0.4905,
"num_tokens": 396285000.0,
"step": 2075
},
{
"epoch": 0.7098976109215017,
"grad_norm": 0.3419972923917919,
"learning_rate": 1.4607826109847458e-05,
"loss": 0.5266,
"num_tokens": 397224172.0,
"step": 2080
},
{
"epoch": 0.71160409556314,
"grad_norm": 0.337893094968607,
"learning_rate": 1.4503950683136936e-05,
"loss": 0.4857,
"num_tokens": 398210109.0,
"step": 2085
},
{
"epoch": 0.7133105802047781,
"grad_norm": 0.33901468478209035,
"learning_rate": 1.4400489278540985e-05,
"loss": 0.4749,
"num_tokens": 399158135.0,
"step": 2090
},
{
"epoch": 0.7150170648464164,
"grad_norm": 0.3835956412834383,
"learning_rate": 1.429744519208508e-05,
"loss": 0.4936,
"num_tokens": 400075133.0,
"step": 2095
},
{
"epoch": 0.7167235494880546,
"grad_norm": 0.34331595173677726,
"learning_rate": 1.4194821706499955e-05,
"loss": 0.5031,
"num_tokens": 400990040.0,
"step": 2100
},
{
"epoch": 0.7184300341296929,
"grad_norm": 0.3171937953485873,
"learning_rate": 1.4092622091117041e-05,
"loss": 0.4815,
"num_tokens": 401912436.0,
"step": 2105
},
{
"epoch": 0.7201365187713311,
"grad_norm": 0.3410195330730544,
"learning_rate": 1.399084960176431e-05,
"loss": 0.4741,
"num_tokens": 402861165.0,
"step": 2110
},
{
"epoch": 0.7218430034129693,
"grad_norm": 0.3465187152384934,
"learning_rate": 1.3889507480662545e-05,
"loss": 0.4913,
"num_tokens": 403763990.0,
"step": 2115
},
{
"epoch": 0.7235494880546075,
"grad_norm": 0.3292708548169215,
"learning_rate": 1.3788598956322068e-05,
"loss": 0.4858,
"num_tokens": 404752387.0,
"step": 2120
},
{
"epoch": 0.7252559726962458,
"grad_norm": 0.36695144477657526,
"learning_rate": 1.3688127243439863e-05,
"loss": 0.4838,
"num_tokens": 405691554.0,
"step": 2125
},
{
"epoch": 0.726962457337884,
"grad_norm": 0.3372811093945545,
"learning_rate": 1.3588095542797186e-05,
"loss": 0.4947,
"num_tokens": 406680793.0,
"step": 2130
},
{
"epoch": 0.7286689419795221,
"grad_norm": 0.3443503709289555,
"learning_rate": 1.3488507041157584e-05,
"loss": 0.4921,
"num_tokens": 407683383.0,
"step": 2135
},
{
"epoch": 0.7303754266211604,
"grad_norm": 0.3480092507121765,
"learning_rate": 1.3389364911165375e-05,
"loss": 0.4846,
"num_tokens": 408650161.0,
"step": 2140
},
{
"epoch": 0.7320819112627986,
"grad_norm": 0.3353926863066775,
"learning_rate": 1.3290672311244584e-05,
"loss": 0.5006,
"num_tokens": 409549494.0,
"step": 2145
},
{
"epoch": 0.7337883959044369,
"grad_norm": 0.37867503643842687,
"learning_rate": 1.3192432385498305e-05,
"loss": 0.4921,
"num_tokens": 410510602.0,
"step": 2150
},
{
"epoch": 0.735494880546075,
"grad_norm": 0.3592680929736747,
"learning_rate": 1.3094648263608533e-05,
"loss": 0.4981,
"num_tokens": 411492905.0,
"step": 2155
},
{
"epoch": 0.7372013651877133,
"grad_norm": 0.3322392394703493,
"learning_rate": 1.299732306073652e-05,
"loss": 0.487,
"num_tokens": 412454003.0,
"step": 2160
},
{
"epoch": 0.7389078498293515,
"grad_norm": 0.3756417144546959,
"learning_rate": 1.2900459877423457e-05,
"loss": 0.5106,
"num_tokens": 413421190.0,
"step": 2165
},
{
"epoch": 0.7406143344709898,
"grad_norm": 0.3276238813240706,
"learning_rate": 1.2804061799491734e-05,
"loss": 0.4945,
"num_tokens": 414425737.0,
"step": 2170
},
{
"epoch": 0.742320819112628,
"grad_norm": 0.29735177345606206,
"learning_rate": 1.2708131897946621e-05,
"loss": 0.478,
"num_tokens": 415344538.0,
"step": 2175
},
{
"epoch": 0.7440273037542662,
"grad_norm": 0.3125870087788144,
"learning_rate": 1.261267322887845e-05,
"loss": 0.5041,
"num_tokens": 416440659.0,
"step": 2180
},
{
"epoch": 0.7457337883959044,
"grad_norm": 0.32461771570892695,
"learning_rate": 1.251768883336526e-05,
"loss": 0.4919,
"num_tokens": 417360385.0,
"step": 2185
},
{
"epoch": 0.7474402730375427,
"grad_norm": 0.3442194296895731,
"learning_rate": 1.2423181737375899e-05,
"loss": 0.4836,
"num_tokens": 418334906.0,
"step": 2190
},
{
"epoch": 0.7491467576791809,
"grad_norm": 0.3446911329824469,
"learning_rate": 1.2329154951673598e-05,
"loss": 0.4646,
"num_tokens": 419196059.0,
"step": 2195
},
{
"epoch": 0.7508532423208191,
"grad_norm": 0.33767235227444237,
"learning_rate": 1.2235611471720123e-05,
"loss": 0.4856,
"num_tokens": 420121223.0,
"step": 2200
},
{
"epoch": 0.7525597269624573,
"grad_norm": 0.31616744714729916,
"learning_rate": 1.2142554277580288e-05,
"loss": 0.4867,
"num_tokens": 421062594.0,
"step": 2205
},
{
"epoch": 0.7542662116040956,
"grad_norm": 0.3288211934138168,
"learning_rate": 1.2049986333827048e-05,
"loss": 0.4672,
"num_tokens": 421975487.0,
"step": 2210
},
{
"epoch": 0.7559726962457338,
"grad_norm": 0.3496778301287439,
"learning_rate": 1.1957910589447043e-05,
"loss": 0.4861,
"num_tokens": 422820853.0,
"step": 2215
},
{
"epoch": 0.757679180887372,
"grad_norm": 0.3475005774602,
"learning_rate": 1.1866329977746656e-05,
"loss": 0.4882,
"num_tokens": 423755589.0,
"step": 2220
},
{
"epoch": 0.7593856655290102,
"grad_norm": 0.35706393638603534,
"learning_rate": 1.177524741625856e-05,
"loss": 0.4887,
"num_tokens": 424688821.0,
"step": 2225
},
{
"epoch": 0.7610921501706485,
"grad_norm": 0.3461078636788691,
"learning_rate": 1.1684665806648772e-05,
"loss": 0.4684,
"num_tokens": 425585640.0,
"step": 2230
},
{
"epoch": 0.7627986348122867,
"grad_norm": 0.3356141138518873,
"learning_rate": 1.1594588034624228e-05,
"loss": 0.4813,
"num_tokens": 426547476.0,
"step": 2235
},
{
"epoch": 0.764505119453925,
"grad_norm": 0.3105226360165998,
"learning_rate": 1.1505016969840823e-05,
"loss": 0.4745,
"num_tokens": 427476418.0,
"step": 2240
},
{
"epoch": 0.7662116040955631,
"grad_norm": 0.32709156257769095,
"learning_rate": 1.1415955465812023e-05,
"loss": 0.4887,
"num_tokens": 428405822.0,
"step": 2245
},
{
"epoch": 0.7679180887372014,
"grad_norm": 0.30756496657170446,
"learning_rate": 1.1327406359817933e-05,
"loss": 0.4774,
"num_tokens": 429400796.0,
"step": 2250
},
{
"epoch": 0.7696245733788396,
"grad_norm": 0.3111754417183422,
"learning_rate": 1.1239372472814927e-05,
"loss": 0.4805,
"num_tokens": 430392694.0,
"step": 2255
},
{
"epoch": 0.7713310580204779,
"grad_norm": 0.34095902728307337,
"learning_rate": 1.1151856609345774e-05,
"loss": 0.4716,
"num_tokens": 431359520.0,
"step": 2260
},
{
"epoch": 0.773037542662116,
"grad_norm": 0.34936226176674845,
"learning_rate": 1.1064861557450256e-05,
"loss": 0.4894,
"num_tokens": 432294915.0,
"step": 2265
},
{
"epoch": 0.7747440273037542,
"grad_norm": 0.3155937127951743,
"learning_rate": 1.0978390088576437e-05,
"loss": 0.481,
"num_tokens": 433284774.0,
"step": 2270
},
{
"epoch": 0.7764505119453925,
"grad_norm": 0.3202827342005119,
"learning_rate": 1.0892444957492276e-05,
"loss": 0.4891,
"num_tokens": 434284592.0,
"step": 2275
},
{
"epoch": 0.7781569965870307,
"grad_norm": 0.3369162033659964,
"learning_rate": 1.0807028902197925e-05,
"loss": 0.4654,
"num_tokens": 435149765.0,
"step": 2280
},
{
"epoch": 0.7798634812286689,
"grad_norm": 0.3582446923094639,
"learning_rate": 1.0722144643838461e-05,
"loss": 0.4866,
"num_tokens": 436158148.0,
"step": 2285
},
{
"epoch": 0.7815699658703071,
"grad_norm": 0.3311676374288052,
"learning_rate": 1.063779488661724e-05,
"loss": 0.4776,
"num_tokens": 437135437.0,
"step": 2290
},
{
"epoch": 0.7832764505119454,
"grad_norm": 0.32325241567650026,
"learning_rate": 1.0553982317709741e-05,
"loss": 0.4654,
"num_tokens": 438061307.0,
"step": 2295
},
{
"epoch": 0.7849829351535836,
"grad_norm": 0.3043727173598455,
"learning_rate": 1.047070960717793e-05,
"loss": 0.4932,
"num_tokens": 439079263.0,
"step": 2300
},
{
"epoch": 0.7866894197952219,
"grad_norm": 0.3361341921228823,
"learning_rate": 1.0387979407885198e-05,
"loss": 0.506,
"num_tokens": 440154096.0,
"step": 2305
},
{
"epoch": 0.78839590443686,
"grad_norm": 0.31103435909845967,
"learning_rate": 1.03057943554119e-05,
"loss": 0.4848,
"num_tokens": 441195758.0,
"step": 2310
},
{
"epoch": 0.7901023890784983,
"grad_norm": 0.32529003714436683,
"learning_rate": 1.022415706797133e-05,
"loss": 0.4941,
"num_tokens": 442194379.0,
"step": 2315
},
{
"epoch": 0.7918088737201365,
"grad_norm": 0.34129914389197974,
"learning_rate": 1.0143070146326347e-05,
"loss": 0.4965,
"num_tokens": 443118717.0,
"step": 2320
},
{
"epoch": 0.7935153583617748,
"grad_norm": 0.34574447344569287,
"learning_rate": 1.0062536173706519e-05,
"loss": 0.4833,
"num_tokens": 444049001.0,
"step": 2325
},
{
"epoch": 0.7952218430034129,
"grad_norm": 0.39058472684776835,
"learning_rate": 9.982557715725807e-06,
"loss": 0.4855,
"num_tokens": 444948197.0,
"step": 2330
},
{
"epoch": 0.7969283276450512,
"grad_norm": 0.3223678497972808,
"learning_rate": 9.903137320300852e-06,
"loss": 0.4923,
"num_tokens": 445993006.0,
"step": 2335
},
{
"epoch": 0.7986348122866894,
"grad_norm": 0.3581110760327946,
"learning_rate": 9.824277517569791e-06,
"loss": 0.4714,
"num_tokens": 446925677.0,
"step": 2340
},
{
"epoch": 0.8003412969283277,
"grad_norm": 0.35940172044204516,
"learning_rate": 9.745980819811668e-06,
"loss": 0.4838,
"num_tokens": 447799196.0,
"step": 2345
},
{
"epoch": 0.8020477815699659,
"grad_norm": 0.34500268650073257,
"learning_rate": 9.66824972136638e-06,
"loss": 0.493,
"num_tokens": 448739177.0,
"step": 2350
},
{
"epoch": 0.8037542662116041,
"grad_norm": 0.32399229631376597,
"learning_rate": 9.59108669855523e-06,
"loss": 0.5037,
"num_tokens": 449751958.0,
"step": 2355
},
{
"epoch": 0.8054607508532423,
"grad_norm": 0.29766759242151386,
"learning_rate": 9.514494209602023e-06,
"loss": 0.5071,
"num_tokens": 450761568.0,
"step": 2360
},
{
"epoch": 0.8071672354948806,
"grad_norm": 0.3430178270787923,
"learning_rate": 9.438474694554775e-06,
"loss": 0.4935,
"num_tokens": 451740507.0,
"step": 2365
},
{
"epoch": 0.8088737201365188,
"grad_norm": 0.3471410052900583,
"learning_rate": 9.36303057520795e-06,
"loss": 0.4713,
"num_tokens": 452625293.0,
"step": 2370
},
{
"epoch": 0.810580204778157,
"grad_norm": 0.3493958783029142,
"learning_rate": 9.288164255025334e-06,
"loss": 0.4823,
"num_tokens": 453642012.0,
"step": 2375
},
{
"epoch": 0.8122866894197952,
"grad_norm": 0.33853142356519106,
"learning_rate": 9.21387811906344e-06,
"loss": 0.4823,
"num_tokens": 454558803.0,
"step": 2380
},
{
"epoch": 0.8139931740614335,
"grad_norm": 0.3227168370953717,
"learning_rate": 9.14017453389556e-06,
"loss": 0.476,
"num_tokens": 455523392.0,
"step": 2385
},
{
"epoch": 0.8156996587030717,
"grad_norm": 0.29404894296430983,
"learning_rate": 9.067055847536346e-06,
"loss": 0.4596,
"num_tokens": 456494011.0,
"step": 2390
},
{
"epoch": 0.8174061433447098,
"grad_norm": 0.3308718804211504,
"learning_rate": 8.994524389367001e-06,
"loss": 0.4891,
"num_tokens": 457401137.0,
"step": 2395
},
{
"epoch": 0.8191126279863481,
"grad_norm": 0.3300071384128006,
"learning_rate": 8.922582470061099e-06,
"loss": 0.4961,
"num_tokens": 458401399.0,
"step": 2400
},
{
"epoch": 0.8208191126279863,
"grad_norm": 0.44818729004437646,
"learning_rate": 8.851232381510961e-06,
"loss": 0.504,
"num_tokens": 459365515.0,
"step": 2405
},
{
"epoch": 0.8225255972696246,
"grad_norm": 0.3842623998501234,
"learning_rate": 8.780476396754633e-06,
"loss": 0.4931,
"num_tokens": 460303546.0,
"step": 2410
},
{
"epoch": 0.8242320819112628,
"grad_norm": 0.33418576004104283,
"learning_rate": 8.710316769903471e-06,
"loss": 0.4868,
"num_tokens": 461172152.0,
"step": 2415
},
{
"epoch": 0.825938566552901,
"grad_norm": 0.368646183750318,
"learning_rate": 8.640755736070346e-06,
"loss": 0.4579,
"num_tokens": 462132037.0,
"step": 2420
},
{
"epoch": 0.8276450511945392,
"grad_norm": 0.32508360537240183,
"learning_rate": 8.571795511298423e-06,
"loss": 0.4853,
"num_tokens": 463170048.0,
"step": 2425
},
{
"epoch": 0.8293515358361775,
"grad_norm": 0.3211106939852037,
"learning_rate": 8.50343829249059e-06,
"loss": 0.4593,
"num_tokens": 464065062.0,
"step": 2430
},
{
"epoch": 0.8310580204778157,
"grad_norm": 0.3053006101651597,
"learning_rate": 8.435686257339417e-06,
"loss": 0.4831,
"num_tokens": 465056306.0,
"step": 2435
},
{
"epoch": 0.8327645051194539,
"grad_norm": 0.31115117178982893,
"learning_rate": 8.368541564257842e-06,
"loss": 0.4907,
"num_tokens": 466050672.0,
"step": 2440
},
{
"epoch": 0.8344709897610921,
"grad_norm": 0.3397295723653259,
"learning_rate": 8.302006352310369e-06,
"loss": 0.4966,
"num_tokens": 467046976.0,
"step": 2445
},
{
"epoch": 0.8361774744027304,
"grad_norm": 0.3382052634070017,
"learning_rate": 8.236082741144938e-06,
"loss": 0.4638,
"num_tokens": 468039326.0,
"step": 2450
},
{
"epoch": 0.8378839590443686,
"grad_norm": 0.3514456973966274,
"learning_rate": 8.170772830925389e-06,
"loss": 0.4653,
"num_tokens": 468922373.0,
"step": 2455
},
{
"epoch": 0.8395904436860068,
"grad_norm": 0.30836591986791617,
"learning_rate": 8.106078702264573e-06,
"loss": 0.4829,
"num_tokens": 469868923.0,
"step": 2460
},
{
"epoch": 0.841296928327645,
"grad_norm": 0.33389876659214884,
"learning_rate": 8.042002416158047e-06,
"loss": 0.471,
"num_tokens": 470752870.0,
"step": 2465
},
{
"epoch": 0.8430034129692833,
"grad_norm": 0.34528058311355864,
"learning_rate": 7.978546013918428e-06,
"loss": 0.4806,
"num_tokens": 471694644.0,
"step": 2470
},
{
"epoch": 0.8447098976109215,
"grad_norm": 0.30449852181321135,
"learning_rate": 7.915711517110365e-06,
"loss": 0.4726,
"num_tokens": 472652423.0,
"step": 2475
},
{
"epoch": 0.8464163822525598,
"grad_norm": 0.3010535970843912,
"learning_rate": 7.853500927486129e-06,
"loss": 0.4734,
"num_tokens": 473648633.0,
"step": 2480
},
{
"epoch": 0.8481228668941979,
"grad_norm": 0.319770912075749,
"learning_rate": 7.791916226921844e-06,
"loss": 0.493,
"num_tokens": 474686021.0,
"step": 2485
},
{
"epoch": 0.8498293515358362,
"grad_norm": 0.3101950532253656,
"learning_rate": 7.730959377354354e-06,
"loss": 0.4811,
"num_tokens": 475597050.0,
"step": 2490
},
{
"epoch": 0.8515358361774744,
"grad_norm": 0.336474137392747,
"learning_rate": 7.670632320718714e-06,
"loss": 0.4985,
"num_tokens": 476480863.0,
"step": 2495
},
{
"epoch": 0.8532423208191127,
"grad_norm": 0.33239250078197224,
"learning_rate": 7.610936978886332e-06,
"loss": 0.4889,
"num_tokens": 477480036.0,
"step": 2500
},
{
"epoch": 0.8549488054607508,
"grad_norm": 0.31906385732968195,
"learning_rate": 7.551875253603726e-06,
"loss": 0.4913,
"num_tokens": 478441727.0,
"step": 2505
},
{
"epoch": 0.856655290102389,
"grad_norm": 0.2972232303527529,
"learning_rate": 7.493449026431963e-06,
"loss": 0.4846,
"num_tokens": 479450987.0,
"step": 2510
},
{
"epoch": 0.8583617747440273,
"grad_norm": 0.34482331352077483,
"learning_rate": 7.4356601586867094e-06,
"loss": 0.4872,
"num_tokens": 480480087.0,
"step": 2515
},
{
"epoch": 0.8600682593856656,
"grad_norm": 0.3286565488200801,
"learning_rate": 7.3785104913789284e-06,
"loss": 0.493,
"num_tokens": 481428209.0,
"step": 2520
},
{
"epoch": 0.8617747440273038,
"grad_norm": 0.3202404807748421,
"learning_rate": 7.322001845156215e-06,
"loss": 0.4634,
"num_tokens": 482333846.0,
"step": 2525
},
{
"epoch": 0.863481228668942,
"grad_norm": 0.3361830515481688,
"learning_rate": 7.2661360202448344e-06,
"loss": 0.4904,
"num_tokens": 483299671.0,
"step": 2530
},
{
"epoch": 0.8651877133105802,
"grad_norm": 0.3177624111695824,
"learning_rate": 7.2109147963923335e-06,
"loss": 0.4988,
"num_tokens": 484313085.0,
"step": 2535
},
{
"epoch": 0.8668941979522184,
"grad_norm": 0.32624228168571706,
"learning_rate": 7.156339932810871e-06,
"loss": 0.4968,
"num_tokens": 485251856.0,
"step": 2540
},
{
"epoch": 0.8686006825938567,
"grad_norm": 0.2897757010993598,
"learning_rate": 7.1024131681211455e-06,
"loss": 0.4779,
"num_tokens": 486246381.0,
"step": 2545
},
{
"epoch": 0.8703071672354948,
"grad_norm": 0.3120988592440139,
"learning_rate": 7.0491362202970295e-06,
"loss": 0.4712,
"num_tokens": 487198446.0,
"step": 2550
},
{
"epoch": 0.8720136518771331,
"grad_norm": 0.2815904159190546,
"learning_rate": 6.9965107866108274e-06,
"loss": 0.4722,
"num_tokens": 488156403.0,
"step": 2555
},
{
"epoch": 0.8737201365187713,
"grad_norm": 0.3273103504291178,
"learning_rate": 6.9445385435792095e-06,
"loss": 0.4695,
"num_tokens": 489140124.0,
"step": 2560
},
{
"epoch": 0.8754266211604096,
"grad_norm": 0.3289263665204276,
"learning_rate": 6.893221146909806e-06,
"loss": 0.4724,
"num_tokens": 490104565.0,
"step": 2565
},
{
"epoch": 0.8771331058020477,
"grad_norm": 0.3146346775894274,
"learning_rate": 6.84256023144845e-06,
"loss": 0.4762,
"num_tokens": 491054868.0,
"step": 2570
},
{
"epoch": 0.878839590443686,
"grad_norm": 0.29961414202911324,
"learning_rate": 6.792557411127099e-06,
"loss": 0.4704,
"num_tokens": 492078546.0,
"step": 2575
},
{
"epoch": 0.8805460750853242,
"grad_norm": 0.34594580055469915,
"learning_rate": 6.74321427891242e-06,
"loss": 0.4851,
"num_tokens": 492974236.0,
"step": 2580
},
{
"epoch": 0.8822525597269625,
"grad_norm": 0.3066722562434767,
"learning_rate": 6.694532406755053e-06,
"loss": 0.478,
"num_tokens": 494019470.0,
"step": 2585
},
{
"epoch": 0.8839590443686007,
"grad_norm": 0.2923055504113335,
"learning_rate": 6.646513345539509e-06,
"loss": 0.516,
"num_tokens": 495062198.0,
"step": 2590
},
{
"epoch": 0.8856655290102389,
"grad_norm": 0.2953358109985866,
"learning_rate": 6.59915862503478e-06,
"loss": 0.4668,
"num_tokens": 496039347.0,
"step": 2595
},
{
"epoch": 0.8873720136518771,
"grad_norm": 0.33489720113078275,
"learning_rate": 6.552469753845601e-06,
"loss": 0.4715,
"num_tokens": 496987511.0,
"step": 2600
},
{
"epoch": 0.8890784982935154,
"grad_norm": 0.35147052248086963,
"learning_rate": 6.506448219364389e-06,
"loss": 0.4952,
"num_tokens": 497953501.0,
"step": 2605
},
{
"epoch": 0.8907849829351536,
"grad_norm": 0.2976052092258336,
"learning_rate": 6.461095487723852e-06,
"loss": 0.4703,
"num_tokens": 498971917.0,
"step": 2610
},
{
"epoch": 0.8924914675767918,
"grad_norm": 0.3166781793747034,
"learning_rate": 6.416413003750289e-06,
"loss": 0.4765,
"num_tokens": 499959465.0,
"step": 2615
},
{
"epoch": 0.89419795221843,
"grad_norm": 0.3094928641877226,
"learning_rate": 6.3724021909175636e-06,
"loss": 0.4714,
"num_tokens": 500947010.0,
"step": 2620
},
{
"epoch": 0.8959044368600683,
"grad_norm": 0.3403135875166433,
"learning_rate": 6.3290644513017496e-06,
"loss": 0.4838,
"num_tokens": 501972930.0,
"step": 2625
},
{
"epoch": 0.8976109215017065,
"grad_norm": 0.35528577423877566,
"learning_rate": 6.286401165536466e-06,
"loss": 0.4974,
"num_tokens": 502958987.0,
"step": 2630
},
{
"epoch": 0.8993174061433447,
"grad_norm": 0.3318258094487176,
"learning_rate": 6.244413692768893e-06,
"loss": 0.4767,
"num_tokens": 503946765.0,
"step": 2635
},
{
"epoch": 0.9010238907849829,
"grad_norm": 0.30577807197719564,
"learning_rate": 6.2031033706164715e-06,
"loss": 0.471,
"num_tokens": 504893463.0,
"step": 2640
},
{
"epoch": 0.9027303754266212,
"grad_norm": 0.3384112726283077,
"learning_rate": 6.162471515124292e-06,
"loss": 0.481,
"num_tokens": 505899175.0,
"step": 2645
},
{
"epoch": 0.9044368600682594,
"grad_norm": 0.30699959739486854,
"learning_rate": 6.122519420723182e-06,
"loss": 0.4733,
"num_tokens": 506861395.0,
"step": 2650
},
{
"epoch": 0.9061433447098977,
"grad_norm": 0.3222784803973646,
"learning_rate": 6.083248360188437e-06,
"loss": 0.4825,
"num_tokens": 507817589.0,
"step": 2655
},
{
"epoch": 0.9078498293515358,
"grad_norm": 0.3403142746810214,
"learning_rate": 6.044659584599297e-06,
"loss": 0.4761,
"num_tokens": 508690960.0,
"step": 2660
},
{
"epoch": 0.909556313993174,
"grad_norm": 0.2953479643563658,
"learning_rate": 6.006754323299088e-06,
"loss": 0.4804,
"num_tokens": 509649233.0,
"step": 2665
},
{
"epoch": 0.9112627986348123,
"grad_norm": 0.30487020552378524,
"learning_rate": 5.969533783856054e-06,
"loss": 0.4777,
"num_tokens": 510627944.0,
"step": 2670
},
{
"epoch": 0.9129692832764505,
"grad_norm": 0.30704549371766876,
"learning_rate": 5.932999152024885e-06,
"loss": 0.4822,
"num_tokens": 511591407.0,
"step": 2675
},
{
"epoch": 0.9146757679180887,
"grad_norm": 0.33249148261118033,
"learning_rate": 5.897151591708947e-06,
"loss": 0.5016,
"num_tokens": 512558436.0,
"step": 2680
},
{
"epoch": 0.9163822525597269,
"grad_norm": 0.3229296450202934,
"learning_rate": 5.861992244923199e-06,
"loss": 0.4735,
"num_tokens": 513474763.0,
"step": 2685
},
{
"epoch": 0.9180887372013652,
"grad_norm": 0.31994250178502454,
"learning_rate": 5.827522231757808e-06,
"loss": 0.4609,
"num_tokens": 514407245.0,
"step": 2690
},
{
"epoch": 0.9197952218430034,
"grad_norm": 0.30727719477408283,
"learning_rate": 5.793742650342482e-06,
"loss": 0.4611,
"num_tokens": 515337057.0,
"step": 2695
},
{
"epoch": 0.9215017064846417,
"grad_norm": 0.3422479555475027,
"learning_rate": 5.760654576811455e-06,
"loss": 0.5085,
"num_tokens": 516301089.0,
"step": 2700
},
{
"epoch": 0.9232081911262798,
"grad_norm": 0.32317795820730566,
"learning_rate": 5.728259065269248e-06,
"loss": 0.4808,
"num_tokens": 517258131.0,
"step": 2705
},
{
"epoch": 0.9249146757679181,
"grad_norm": 0.31035185685822675,
"learning_rate": 5.696557147757041e-06,
"loss": 0.4989,
"num_tokens": 518223298.0,
"step": 2710
},
{
"epoch": 0.9266211604095563,
"grad_norm": 0.32127700006025317,
"learning_rate": 5.66554983421983e-06,
"loss": 0.4721,
"num_tokens": 519130985.0,
"step": 2715
},
{
"epoch": 0.9283276450511946,
"grad_norm": 0.3248191734813711,
"learning_rate": 5.635238112474237e-06,
"loss": 0.4878,
"num_tokens": 520051962.0,
"step": 2720
},
{
"epoch": 0.9300341296928327,
"grad_norm": 0.32016262129429485,
"learning_rate": 5.605622948177032e-06,
"loss": 0.4612,
"num_tokens": 520934447.0,
"step": 2725
},
{
"epoch": 0.931740614334471,
"grad_norm": 0.3075215750119489,
"learning_rate": 5.576705284794404e-06,
"loss": 0.4717,
"num_tokens": 521910187.0,
"step": 2730
},
{
"epoch": 0.9334470989761092,
"grad_norm": 0.2966943509184171,
"learning_rate": 5.548486043571861e-06,
"loss": 0.4615,
"num_tokens": 522876883.0,
"step": 2735
},
{
"epoch": 0.9351535836177475,
"grad_norm": 0.3823699167375232,
"learning_rate": 5.52096612350491e-06,
"loss": 0.4899,
"num_tokens": 523880084.0,
"step": 2740
},
{
"epoch": 0.9368600682593856,
"grad_norm": 0.3445674466297448,
"learning_rate": 5.494146401310404e-06,
"loss": 0.4792,
"num_tokens": 524788350.0,
"step": 2745
},
{
"epoch": 0.9385665529010239,
"grad_norm": 0.29579943008870063,
"learning_rate": 5.468027731398621e-06,
"loss": 0.4863,
"num_tokens": 525832920.0,
"step": 2750
},
{
"epoch": 0.9402730375426621,
"grad_norm": 0.30558147446901934,
"learning_rate": 5.442610945846045e-06,
"loss": 0.4943,
"num_tokens": 526845340.0,
"step": 2755
},
{
"epoch": 0.9419795221843004,
"grad_norm": 0.32100249042860457,
"learning_rate": 5.41789685436884e-06,
"loss": 0.4788,
"num_tokens": 527753736.0,
"step": 2760
},
{
"epoch": 0.9436860068259386,
"grad_norm": 0.32004490139915764,
"learning_rate": 5.393886244297079e-06,
"loss": 0.4817,
"num_tokens": 528798665.0,
"step": 2765
},
{
"epoch": 0.9453924914675768,
"grad_norm": 0.3325347960685324,
"learning_rate": 5.370579880549647e-06,
"loss": 0.4878,
"num_tokens": 529711197.0,
"step": 2770
},
{
"epoch": 0.947098976109215,
"grad_norm": 0.3303511181497015,
"learning_rate": 5.347978505609877e-06,
"loss": 0.4693,
"num_tokens": 530632318.0,
"step": 2775
},
{
"epoch": 0.9488054607508533,
"grad_norm": 0.3044920349269044,
"learning_rate": 5.326082839501891e-06,
"loss": 0.4881,
"num_tokens": 531553862.0,
"step": 2780
},
{
"epoch": 0.9505119453924915,
"grad_norm": 0.30326530639070487,
"learning_rate": 5.304893579767674e-06,
"loss": 0.4935,
"num_tokens": 532532990.0,
"step": 2785
},
{
"epoch": 0.9522184300341296,
"grad_norm": 0.31251416953382594,
"learning_rate": 5.284411401444836e-06,
"loss": 0.4933,
"num_tokens": 533448215.0,
"step": 2790
},
{
"epoch": 0.9539249146757679,
"grad_norm": 0.32753454088259387,
"learning_rate": 5.264636957045122e-06,
"loss": 0.4824,
"num_tokens": 534401740.0,
"step": 2795
},
{
"epoch": 0.9556313993174061,
"grad_norm": 0.30791599389681806,
"learning_rate": 5.245570876533615e-06,
"loss": 0.4685,
"num_tokens": 535341346.0,
"step": 2800
},
{
"epoch": 0.9573378839590444,
"grad_norm": 0.313479264368028,
"learning_rate": 5.227213767308668e-06,
"loss": 0.4575,
"num_tokens": 536296941.0,
"step": 2805
},
{
"epoch": 0.9590443686006825,
"grad_norm": 0.29032270836531787,
"learning_rate": 5.209566214182558e-06,
"loss": 0.4742,
"num_tokens": 537336227.0,
"step": 2810
},
{
"epoch": 0.9607508532423208,
"grad_norm": 0.30878749649756715,
"learning_rate": 5.1926287793628515e-06,
"loss": 0.4843,
"num_tokens": 538331668.0,
"step": 2815
},
{
"epoch": 0.962457337883959,
"grad_norm": 0.3144044223958235,
"learning_rate": 5.176402002434495e-06,
"loss": 0.4596,
"num_tokens": 539226192.0,
"step": 2820
},
{
"epoch": 0.9641638225255973,
"grad_norm": 0.33929109387452877,
"learning_rate": 5.1608864003426255e-06,
"loss": 0.4783,
"num_tokens": 540188216.0,
"step": 2825
},
{
"epoch": 0.9658703071672355,
"grad_norm": 0.3147575010798021,
"learning_rate": 5.146082467376103e-06,
"loss": 0.4742,
"num_tokens": 541043880.0,
"step": 2830
},
{
"epoch": 0.9675767918088737,
"grad_norm": 0.29692914861077196,
"learning_rate": 5.131990675151757e-06,
"loss": 0.4915,
"num_tokens": 541979505.0,
"step": 2835
},
{
"epoch": 0.9692832764505119,
"grad_norm": 0.3233760089537912,
"learning_rate": 5.1186114725993754e-06,
"loss": 0.4741,
"num_tokens": 542913487.0,
"step": 2840
},
{
"epoch": 0.9709897610921502,
"grad_norm": 0.3243453736091082,
"learning_rate": 5.105945285947394e-06,
"loss": 0.4856,
"num_tokens": 543888926.0,
"step": 2845
},
{
"epoch": 0.9726962457337884,
"grad_norm": 0.34332556463465,
"learning_rate": 5.09399251870931e-06,
"loss": 0.5042,
"num_tokens": 544851416.0,
"step": 2850
},
{
"epoch": 0.9744027303754266,
"grad_norm": 0.35805837088581816,
"learning_rate": 5.082753551670843e-06,
"loss": 0.4832,
"num_tokens": 545757943.0,
"step": 2855
},
{
"epoch": 0.9761092150170648,
"grad_norm": 0.3330936930767059,
"learning_rate": 5.072228742877796e-06,
"loss": 0.4861,
"num_tokens": 546668069.0,
"step": 2860
},
{
"epoch": 0.9778156996587031,
"grad_norm": 0.2996095611574381,
"learning_rate": 5.062418427624646e-06,
"loss": 0.4706,
"num_tokens": 547620964.0,
"step": 2865
},
{
"epoch": 0.9795221843003413,
"grad_norm": 0.3334623473830669,
"learning_rate": 5.053322918443873e-06,
"loss": 0.4815,
"num_tokens": 548580998.0,
"step": 2870
},
{
"epoch": 0.9812286689419796,
"grad_norm": 0.3134805266587632,
"learning_rate": 5.0449425050959876e-06,
"loss": 0.49,
"num_tokens": 549511852.0,
"step": 2875
},
{
"epoch": 0.9829351535836177,
"grad_norm": 0.3309875762519481,
"learning_rate": 5.0372774545603155e-06,
"loss": 0.4617,
"num_tokens": 550468465.0,
"step": 2880
},
{
"epoch": 0.984641638225256,
"grad_norm": 0.3215047597552604,
"learning_rate": 5.0303280110264825e-06,
"loss": 0.4681,
"num_tokens": 551376796.0,
"step": 2885
},
{
"epoch": 0.9863481228668942,
"grad_norm": 0.45537977132918783,
"learning_rate": 5.02409439588664e-06,
"loss": 0.4914,
"num_tokens": 552397091.0,
"step": 2890
},
{
"epoch": 0.9880546075085325,
"grad_norm": 0.30892367299813056,
"learning_rate": 5.018576807728409e-06,
"loss": 0.4632,
"num_tokens": 553378344.0,
"step": 2895
},
{
"epoch": 0.9897610921501706,
"grad_norm": 0.32443914024425874,
"learning_rate": 5.013775422328553e-06,
"loss": 0.466,
"num_tokens": 554315309.0,
"step": 2900
},
{
"epoch": 0.9914675767918089,
"grad_norm": 0.3310428027199226,
"learning_rate": 5.0096903926473885e-06,
"loss": 0.4724,
"num_tokens": 555217376.0,
"step": 2905
},
{
"epoch": 0.9931740614334471,
"grad_norm": 0.3211087070464618,
"learning_rate": 5.00632184882389e-06,
"loss": 0.4991,
"num_tokens": 556147373.0,
"step": 2910
},
{
"epoch": 0.9948805460750854,
"grad_norm": 0.3344876290335765,
"learning_rate": 5.00366989817157e-06,
"loss": 0.4827,
"num_tokens": 557076223.0,
"step": 2915
},
{
"epoch": 0.9965870307167235,
"grad_norm": 0.32216989394760803,
"learning_rate": 5.0017346251750415e-06,
"loss": 0.4646,
"num_tokens": 558093284.0,
"step": 2920
},
{
"epoch": 0.9982935153583617,
"grad_norm": 0.3263881006745613,
"learning_rate": 5.000516091487337e-06,
"loss": 0.4751,
"num_tokens": 558997916.0,
"step": 2925
},
{
"epoch": 1.0,
"grad_norm": 0.3026241946213063,
"learning_rate": 5.00001433592793e-06,
"loss": 0.4825,
"num_tokens": 560009809.0,
"step": 2930
},
{
"epoch": 1.0,
"step": 2930,
"total_flos": 1146719603261440.0,
"train_loss": 0.5200651722149637,
"train_runtime": 20826.6168,
"train_samples_per_second": 4.501,
"train_steps_per_second": 0.141
}
],
"logging_steps": 5,
"max_steps": 2930,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1146719603261440.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}