|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 2930, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0017064846416382253, |
|
"grad_norm": 2.823172429776875, |
|
"learning_rate": 1.360544217687075e-06, |
|
"loss": 0.8715, |
|
"num_tokens": 949756.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0034129692832764505, |
|
"grad_norm": 1.9356174655366971, |
|
"learning_rate": 3.0612244897959185e-06, |
|
"loss": 0.8609, |
|
"num_tokens": 1934239.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005119453924914676, |
|
"grad_norm": 1.460519264059952, |
|
"learning_rate": 4.7619047619047615e-06, |
|
"loss": 0.794, |
|
"num_tokens": 2810536.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.006825938566552901, |
|
"grad_norm": 0.904578522693587, |
|
"learning_rate": 6.462585034013606e-06, |
|
"loss": 0.7436, |
|
"num_tokens": 3759778.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008532423208191127, |
|
"grad_norm": 0.7983479886595664, |
|
"learning_rate": 8.163265306122448e-06, |
|
"loss": 0.7124, |
|
"num_tokens": 4719221.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.010238907849829351, |
|
"grad_norm": 0.7054491846301127, |
|
"learning_rate": 9.863945578231292e-06, |
|
"loss": 0.7244, |
|
"num_tokens": 5645472.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.011945392491467578, |
|
"grad_norm": 0.5846860059632117, |
|
"learning_rate": 1.1564625850340138e-05, |
|
"loss": 0.6856, |
|
"num_tokens": 6675650.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.013651877133105802, |
|
"grad_norm": 0.5252998046067419, |
|
"learning_rate": 1.3265306122448982e-05, |
|
"loss": 0.6694, |
|
"num_tokens": 7670069.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.015358361774744027, |
|
"grad_norm": 0.5533849135779766, |
|
"learning_rate": 1.4965986394557824e-05, |
|
"loss": 0.6436, |
|
"num_tokens": 8562223.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.017064846416382253, |
|
"grad_norm": 0.5631922874607858, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.6546, |
|
"num_tokens": 9514590.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01877133105802048, |
|
"grad_norm": 0.6078968314204046, |
|
"learning_rate": 1.836734693877551e-05, |
|
"loss": 0.6175, |
|
"num_tokens": 10423653.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.020477815699658702, |
|
"grad_norm": 0.5693094869748001, |
|
"learning_rate": 2.0068027210884355e-05, |
|
"loss": 0.6177, |
|
"num_tokens": 11370767.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02218430034129693, |
|
"grad_norm": 0.6043968806478303, |
|
"learning_rate": 2.17687074829932e-05, |
|
"loss": 0.6241, |
|
"num_tokens": 12205003.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.023890784982935155, |
|
"grad_norm": 0.6098681736085941, |
|
"learning_rate": 2.3469387755102043e-05, |
|
"loss": 0.6197, |
|
"num_tokens": 13221634.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.025597269624573378, |
|
"grad_norm": 0.5652622276816748, |
|
"learning_rate": 2.5170068027210887e-05, |
|
"loss": 0.6032, |
|
"num_tokens": 14132790.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.027303754266211604, |
|
"grad_norm": 0.5925207051821937, |
|
"learning_rate": 2.687074829931973e-05, |
|
"loss": 0.6095, |
|
"num_tokens": 15120586.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02901023890784983, |
|
"grad_norm": 0.6004251741348174, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.627, |
|
"num_tokens": 16123604.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.030716723549488054, |
|
"grad_norm": 0.6116000919042185, |
|
"learning_rate": 3.0272108843537418e-05, |
|
"loss": 0.6003, |
|
"num_tokens": 17045024.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.032423208191126277, |
|
"grad_norm": 0.6033555839102962, |
|
"learning_rate": 3.1972789115646265e-05, |
|
"loss": 0.6099, |
|
"num_tokens": 17979516.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.034129692832764506, |
|
"grad_norm": 0.6174109618196879, |
|
"learning_rate": 3.36734693877551e-05, |
|
"loss": 0.5928, |
|
"num_tokens": 18909989.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03583617747440273, |
|
"grad_norm": 0.6562250628577161, |
|
"learning_rate": 3.5374149659863946e-05, |
|
"loss": 0.6091, |
|
"num_tokens": 19893037.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03754266211604096, |
|
"grad_norm": 0.6359940327448688, |
|
"learning_rate": 3.707482993197279e-05, |
|
"loss": 0.5757, |
|
"num_tokens": 20794568.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03924914675767918, |
|
"grad_norm": 0.591968085837116, |
|
"learning_rate": 3.8775510204081634e-05, |
|
"loss": 0.601, |
|
"num_tokens": 21708539.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.040955631399317405, |
|
"grad_norm": 0.6284093561745501, |
|
"learning_rate": 4.047619047619048e-05, |
|
"loss": 0.6047, |
|
"num_tokens": 22645536.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.042662116040955635, |
|
"grad_norm": 0.622896210775772, |
|
"learning_rate": 4.217687074829932e-05, |
|
"loss": 0.5941, |
|
"num_tokens": 23599448.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.04436860068259386, |
|
"grad_norm": 0.5576038080368834, |
|
"learning_rate": 4.387755102040816e-05, |
|
"loss": 0.6178, |
|
"num_tokens": 24609009.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04607508532423208, |
|
"grad_norm": 0.5507377217724624, |
|
"learning_rate": 4.557823129251701e-05, |
|
"loss": 0.6048, |
|
"num_tokens": 25618632.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.04778156996587031, |
|
"grad_norm": 0.6383078614695583, |
|
"learning_rate": 4.7278911564625856e-05, |
|
"loss": 0.5975, |
|
"num_tokens": 26641035.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04948805460750853, |
|
"grad_norm": 0.6846737722897228, |
|
"learning_rate": 4.89795918367347e-05, |
|
"loss": 0.5744, |
|
"num_tokens": 27517779.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.051194539249146756, |
|
"grad_norm": 0.7663095587321149, |
|
"learning_rate": 4.999994265630655e-05, |
|
"loss": 0.5675, |
|
"num_tokens": 28445983.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.052901023890784986, |
|
"grad_norm": 0.6816635800011499, |
|
"learning_rate": 4.999929754311198e-05, |
|
"loss": 0.5903, |
|
"num_tokens": 29489865.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.05460750853242321, |
|
"grad_norm": 0.6680964778036098, |
|
"learning_rate": 4.999793565772626e-05, |
|
"loss": 0.5989, |
|
"num_tokens": 30423007.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.05631399317406143, |
|
"grad_norm": 0.5649434202104989, |
|
"learning_rate": 4.999585704353568e-05, |
|
"loss": 0.5801, |
|
"num_tokens": 31372257.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.05802047781569966, |
|
"grad_norm": 0.5122745336387098, |
|
"learning_rate": 4.999306176675979e-05, |
|
"loss": 0.5998, |
|
"num_tokens": 32356676.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.059726962457337884, |
|
"grad_norm": 0.5332799116717496, |
|
"learning_rate": 4.998954991644921e-05, |
|
"loss": 0.5904, |
|
"num_tokens": 33261796.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.06143344709897611, |
|
"grad_norm": 0.5068363705703531, |
|
"learning_rate": 4.9985321604482835e-05, |
|
"loss": 0.59, |
|
"num_tokens": 34237001.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06313993174061433, |
|
"grad_norm": 0.6492409848966394, |
|
"learning_rate": 4.9980376965564286e-05, |
|
"loss": 0.5955, |
|
"num_tokens": 35167253.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.06484641638225255, |
|
"grad_norm": 0.5820124301992429, |
|
"learning_rate": 4.997471615721756e-05, |
|
"loss": 0.5767, |
|
"num_tokens": 36074352.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.06655290102389079, |
|
"grad_norm": 0.5394095878459838, |
|
"learning_rate": 4.996833935978207e-05, |
|
"loss": 0.624, |
|
"num_tokens": 37055456.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.06825938566552901, |
|
"grad_norm": 0.5362539000244216, |
|
"learning_rate": 4.996124677640687e-05, |
|
"loss": 0.5722, |
|
"num_tokens": 37967720.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06996587030716724, |
|
"grad_norm": 0.49110498514505857, |
|
"learning_rate": 4.99534386330442e-05, |
|
"loss": 0.5993, |
|
"num_tokens": 38924279.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.07167235494880546, |
|
"grad_norm": 0.546699809294802, |
|
"learning_rate": 4.994491517844227e-05, |
|
"loss": 0.5938, |
|
"num_tokens": 39853351.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.07337883959044368, |
|
"grad_norm": 0.5776918031520784, |
|
"learning_rate": 4.993567668413733e-05, |
|
"loss": 0.5809, |
|
"num_tokens": 40811069.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.07508532423208192, |
|
"grad_norm": 0.4951200074682594, |
|
"learning_rate": 4.992572344444507e-05, |
|
"loss": 0.6027, |
|
"num_tokens": 41833783.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.07679180887372014, |
|
"grad_norm": 0.5901419308378143, |
|
"learning_rate": 4.991505577645118e-05, |
|
"loss": 0.5747, |
|
"num_tokens": 42744744.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.07849829351535836, |
|
"grad_norm": 0.5304430248970768, |
|
"learning_rate": 4.9903674020001284e-05, |
|
"loss": 0.6, |
|
"num_tokens": 43682290.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08020477815699659, |
|
"grad_norm": 0.5211597930937201, |
|
"learning_rate": 4.989157853769009e-05, |
|
"loss": 0.5805, |
|
"num_tokens": 44609387.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.08191126279863481, |
|
"grad_norm": 0.5133063136328732, |
|
"learning_rate": 4.987876971484988e-05, |
|
"loss": 0.5787, |
|
"num_tokens": 45550959.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08361774744027303, |
|
"grad_norm": 0.6095708293579054, |
|
"learning_rate": 4.9865247959538194e-05, |
|
"loss": 0.5976, |
|
"num_tokens": 46433321.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.08532423208191127, |
|
"grad_norm": 0.4814034781564727, |
|
"learning_rate": 4.985101370252483e-05, |
|
"loss": 0.5872, |
|
"num_tokens": 47474526.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08703071672354949, |
|
"grad_norm": 0.5162969780823659, |
|
"learning_rate": 4.983606739727816e-05, |
|
"loss": 0.5863, |
|
"num_tokens": 48425336.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.08873720136518772, |
|
"grad_norm": 0.5073448552205259, |
|
"learning_rate": 4.982040951995066e-05, |
|
"loss": 0.5821, |
|
"num_tokens": 49377672.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09044368600682594, |
|
"grad_norm": 0.6390759679559125, |
|
"learning_rate": 4.980404056936371e-05, |
|
"loss": 0.5822, |
|
"num_tokens": 50297482.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.09215017064846416, |
|
"grad_norm": 0.505488369723625, |
|
"learning_rate": 4.978696106699175e-05, |
|
"loss": 0.5777, |
|
"num_tokens": 51201531.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.09385665529010238, |
|
"grad_norm": 0.4834540354696749, |
|
"learning_rate": 4.976917155694565e-05, |
|
"loss": 0.5706, |
|
"num_tokens": 52135060.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.09556313993174062, |
|
"grad_norm": 0.5091081042854496, |
|
"learning_rate": 4.9750672605955385e-05, |
|
"loss": 0.5887, |
|
"num_tokens": 53019691.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.09726962457337884, |
|
"grad_norm": 0.5272029644773706, |
|
"learning_rate": 4.9731464803351944e-05, |
|
"loss": 0.5768, |
|
"num_tokens": 54021567.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.09897610921501707, |
|
"grad_norm": 0.5618712865310092, |
|
"learning_rate": 4.971154876104862e-05, |
|
"loss": 0.5707, |
|
"num_tokens": 54928756.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.10068259385665529, |
|
"grad_norm": 0.5079662485961427, |
|
"learning_rate": 4.969092511352143e-05, |
|
"loss": 0.5764, |
|
"num_tokens": 55901721.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.10238907849829351, |
|
"grad_norm": 0.5590533288223599, |
|
"learning_rate": 4.9669594517789004e-05, |
|
"loss": 0.6059, |
|
"num_tokens": 56919336.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10409556313993173, |
|
"grad_norm": 0.5124289657226824, |
|
"learning_rate": 4.9647557653391544e-05, |
|
"loss": 0.5486, |
|
"num_tokens": 57859693.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.10580204778156997, |
|
"grad_norm": 0.5236808101484978, |
|
"learning_rate": 4.9624815222369283e-05, |
|
"loss": 0.5744, |
|
"num_tokens": 58769487.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1075085324232082, |
|
"grad_norm": 0.41840286703818447, |
|
"learning_rate": 4.9601367949240034e-05, |
|
"loss": 0.5571, |
|
"num_tokens": 59686216.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.10921501706484642, |
|
"grad_norm": 0.49183814869408765, |
|
"learning_rate": 4.957721658097616e-05, |
|
"loss": 0.5778, |
|
"num_tokens": 60660649.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.11092150170648464, |
|
"grad_norm": 0.5261379848235058, |
|
"learning_rate": 4.955236188698076e-05, |
|
"loss": 0.5581, |
|
"num_tokens": 61640951.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.11262798634812286, |
|
"grad_norm": 0.5047161464074051, |
|
"learning_rate": 4.9526804659063135e-05, |
|
"loss": 0.5453, |
|
"num_tokens": 62673288.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.11433447098976109, |
|
"grad_norm": 0.5370013069114835, |
|
"learning_rate": 4.950054571141362e-05, |
|
"loss": 0.5704, |
|
"num_tokens": 63654789.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.11604095563139932, |
|
"grad_norm": 0.5065504794071035, |
|
"learning_rate": 4.94735858805776e-05, |
|
"loss": 0.5564, |
|
"num_tokens": 64575366.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.11774744027303755, |
|
"grad_norm": 0.5367370357250334, |
|
"learning_rate": 4.9445926025428856e-05, |
|
"loss": 0.5682, |
|
"num_tokens": 65572577.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.11945392491467577, |
|
"grad_norm": 0.6256624388445353, |
|
"learning_rate": 4.9417567027142245e-05, |
|
"loss": 0.5691, |
|
"num_tokens": 66496209.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.12116040955631399, |
|
"grad_norm": 0.5364526247895124, |
|
"learning_rate": 4.938850978916557e-05, |
|
"loss": 0.5963, |
|
"num_tokens": 67477381.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.12286689419795221, |
|
"grad_norm": 0.5760517333994615, |
|
"learning_rate": 4.935875523719086e-05, |
|
"loss": 0.5676, |
|
"num_tokens": 68438217.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.12457337883959044, |
|
"grad_norm": 0.4912183552600293, |
|
"learning_rate": 4.932830431912484e-05, |
|
"loss": 0.5689, |
|
"num_tokens": 69455969.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.12627986348122866, |
|
"grad_norm": 0.4760184495750453, |
|
"learning_rate": 4.929715800505873e-05, |
|
"loss": 0.5763, |
|
"num_tokens": 70344364.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.12798634812286688, |
|
"grad_norm": 0.45385512480590845, |
|
"learning_rate": 4.926531728723738e-05, |
|
"loss": 0.5871, |
|
"num_tokens": 71311780.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1296928327645051, |
|
"grad_norm": 0.4698082651802184, |
|
"learning_rate": 4.923278318002761e-05, |
|
"loss": 0.5545, |
|
"num_tokens": 72264786.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.13139931740614336, |
|
"grad_norm": 0.45414350096683564, |
|
"learning_rate": 4.919955671988592e-05, |
|
"loss": 0.5368, |
|
"num_tokens": 73254319.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.13310580204778158, |
|
"grad_norm": 0.4488305223658382, |
|
"learning_rate": 4.916563896532549e-05, |
|
"loss": 0.5538, |
|
"num_tokens": 74233642.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1348122866894198, |
|
"grad_norm": 0.4525201301734097, |
|
"learning_rate": 4.91310309968824e-05, |
|
"loss": 0.5689, |
|
"num_tokens": 75216559.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.13651877133105803, |
|
"grad_norm": 0.5580205197947308, |
|
"learning_rate": 4.90957339170813e-05, |
|
"loss": 0.5684, |
|
"num_tokens": 76173511.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.13822525597269625, |
|
"grad_norm": 0.43833715671179474, |
|
"learning_rate": 4.905974885040015e-05, |
|
"loss": 0.5537, |
|
"num_tokens": 77137128.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.13993174061433447, |
|
"grad_norm": 0.43081913118053056, |
|
"learning_rate": 4.902307694323456e-05, |
|
"loss": 0.5595, |
|
"num_tokens": 78183541.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.1416382252559727, |
|
"grad_norm": 0.46606853464523745, |
|
"learning_rate": 4.8985719363861135e-05, |
|
"loss": 0.572, |
|
"num_tokens": 79163656.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.14334470989761092, |
|
"grad_norm": 0.5472361464183433, |
|
"learning_rate": 4.8947677302400326e-05, |
|
"loss": 0.5522, |
|
"num_tokens": 80166162.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.14505119453924914, |
|
"grad_norm": 0.4537573554760025, |
|
"learning_rate": 4.890895197077848e-05, |
|
"loss": 0.5507, |
|
"num_tokens": 81121834.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.14675767918088736, |
|
"grad_norm": 0.5290898874704273, |
|
"learning_rate": 4.886954460268927e-05, |
|
"loss": 0.5702, |
|
"num_tokens": 81987283.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.14846416382252559, |
|
"grad_norm": 0.45456782141151, |
|
"learning_rate": 4.882945645355435e-05, |
|
"loss": 0.5756, |
|
"num_tokens": 82994121.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.15017064846416384, |
|
"grad_norm": 0.4809659734303757, |
|
"learning_rate": 4.878868880048341e-05, |
|
"loss": 0.5614, |
|
"num_tokens": 83915537.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.15187713310580206, |
|
"grad_norm": 0.52139675211743, |
|
"learning_rate": 4.874724294223343e-05, |
|
"loss": 0.5444, |
|
"num_tokens": 84755157.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.15358361774744028, |
|
"grad_norm": 0.4544341064553721, |
|
"learning_rate": 4.870512019916734e-05, |
|
"loss": 0.5433, |
|
"num_tokens": 85690047.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1552901023890785, |
|
"grad_norm": 0.4558960340950479, |
|
"learning_rate": 4.866232191321199e-05, |
|
"loss": 0.5933, |
|
"num_tokens": 86736902.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.15699658703071673, |
|
"grad_norm": 0.47990907617604506, |
|
"learning_rate": 4.8618849447815305e-05, |
|
"loss": 0.5745, |
|
"num_tokens": 87705484.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.15870307167235495, |
|
"grad_norm": 0.4471349487997525, |
|
"learning_rate": 4.8574704187902955e-05, |
|
"loss": 0.5493, |
|
"num_tokens": 88652585.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.16040955631399317, |
|
"grad_norm": 0.5050821167458766, |
|
"learning_rate": 4.8529887539834144e-05, |
|
"loss": 0.5559, |
|
"num_tokens": 89594618.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1621160409556314, |
|
"grad_norm": 0.433008399283799, |
|
"learning_rate": 4.84844009313569e-05, |
|
"loss": 0.5465, |
|
"num_tokens": 90594832.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.16382252559726962, |
|
"grad_norm": 0.42424498611581374, |
|
"learning_rate": 4.843824581156249e-05, |
|
"loss": 0.5555, |
|
"num_tokens": 91557529.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.16552901023890784, |
|
"grad_norm": 0.4536724303374971, |
|
"learning_rate": 4.839142365083932e-05, |
|
"loss": 0.5586, |
|
"num_tokens": 92520759.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.16723549488054607, |
|
"grad_norm": 0.4135691134510148, |
|
"learning_rate": 4.8343935940826104e-05, |
|
"loss": 0.5463, |
|
"num_tokens": 93508244.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1689419795221843, |
|
"grad_norm": 0.49925946717984443, |
|
"learning_rate": 4.829578419436427e-05, |
|
"loss": 0.5758, |
|
"num_tokens": 94489856.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.17064846416382254, |
|
"grad_norm": 0.4795792461133882, |
|
"learning_rate": 4.824696994544985e-05, |
|
"loss": 0.5581, |
|
"num_tokens": 95453893.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.17235494880546076, |
|
"grad_norm": 0.46669919948991945, |
|
"learning_rate": 4.819749474918455e-05, |
|
"loss": 0.556, |
|
"num_tokens": 96333287.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.17406143344709898, |
|
"grad_norm": 0.4707277196611808, |
|
"learning_rate": 4.814736018172624e-05, |
|
"loss": 0.5583, |
|
"num_tokens": 97319183.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1757679180887372, |
|
"grad_norm": 0.44856398131453706, |
|
"learning_rate": 4.809656784023872e-05, |
|
"loss": 0.5643, |
|
"num_tokens": 98374455.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.17747440273037543, |
|
"grad_norm": 0.41999315891743993, |
|
"learning_rate": 4.8045119342840885e-05, |
|
"loss": 0.5368, |
|
"num_tokens": 99400041.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.17918088737201365, |
|
"grad_norm": 0.4314801891904282, |
|
"learning_rate": 4.799301632855508e-05, |
|
"loss": 0.5682, |
|
"num_tokens": 100419726.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.18088737201365188, |
|
"grad_norm": 0.4532725946713624, |
|
"learning_rate": 4.794026045725501e-05, |
|
"loss": 0.5413, |
|
"num_tokens": 101373875.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1825938566552901, |
|
"grad_norm": 0.45204637278231036, |
|
"learning_rate": 4.788685340961276e-05, |
|
"loss": 0.5561, |
|
"num_tokens": 102331658.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.18430034129692832, |
|
"grad_norm": 0.4711186304154966, |
|
"learning_rate": 4.7832796887045276e-05, |
|
"loss": 0.5421, |
|
"num_tokens": 103240516.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.18600682593856654, |
|
"grad_norm": 0.4626002965808097, |
|
"learning_rate": 4.7778092611660225e-05, |
|
"loss": 0.5696, |
|
"num_tokens": 104162605.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.18771331058020477, |
|
"grad_norm": 0.423630300008881, |
|
"learning_rate": 4.772274232620104e-05, |
|
"loss": 0.5532, |
|
"num_tokens": 105061908.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.189419795221843, |
|
"grad_norm": 0.5303058647014298, |
|
"learning_rate": 4.766674779399145e-05, |
|
"loss": 0.5634, |
|
"num_tokens": 105919969.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.19112627986348124, |
|
"grad_norm": 0.4528277982211153, |
|
"learning_rate": 4.76101107988793e-05, |
|
"loss": 0.5775, |
|
"num_tokens": 106919294.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.19283276450511946, |
|
"grad_norm": 0.375604144309103, |
|
"learning_rate": 4.7552833145179746e-05, |
|
"loss": 0.5127, |
|
"num_tokens": 107846976.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.1945392491467577, |
|
"grad_norm": 0.44488024598088494, |
|
"learning_rate": 4.749491665761772e-05, |
|
"loss": 0.5388, |
|
"num_tokens": 108819219.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.1962457337883959, |
|
"grad_norm": 0.4154713260123601, |
|
"learning_rate": 4.7436363181269825e-05, |
|
"loss": 0.5469, |
|
"num_tokens": 109845258.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.19795221843003413, |
|
"grad_norm": 0.39816353681189776, |
|
"learning_rate": 4.737717458150558e-05, |
|
"loss": 0.5519, |
|
"num_tokens": 110858993.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.19965870307167236, |
|
"grad_norm": 0.44831838753632824, |
|
"learning_rate": 4.7317352743927954e-05, |
|
"loss": 0.5578, |
|
"num_tokens": 111788546.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.20136518771331058, |
|
"grad_norm": 0.4186460516616093, |
|
"learning_rate": 4.7256899574313304e-05, |
|
"loss": 0.5472, |
|
"num_tokens": 112732095.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.2030716723549488, |
|
"grad_norm": 0.44237132476023605, |
|
"learning_rate": 4.71958169985507e-05, |
|
"loss": 0.5493, |
|
"num_tokens": 113649022.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.20477815699658702, |
|
"grad_norm": 0.43701302963747873, |
|
"learning_rate": 4.7134106962580516e-05, |
|
"loss": 0.5569, |
|
"num_tokens": 114540376.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.20648464163822525, |
|
"grad_norm": 0.4128633428095234, |
|
"learning_rate": 4.707177143233247e-05, |
|
"loss": 0.5513, |
|
"num_tokens": 115480997.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.20819112627986347, |
|
"grad_norm": 0.3755079914854989, |
|
"learning_rate": 4.7008812393662996e-05, |
|
"loss": 0.5255, |
|
"num_tokens": 116464215.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.2098976109215017, |
|
"grad_norm": 0.4092404736922847, |
|
"learning_rate": 4.694523185229196e-05, |
|
"loss": 0.5398, |
|
"num_tokens": 117413382.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.21160409556313994, |
|
"grad_norm": 0.4130984750062132, |
|
"learning_rate": 4.688103183373877e-05, |
|
"loss": 0.5355, |
|
"num_tokens": 118465258.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.21331058020477817, |
|
"grad_norm": 0.4350871959699708, |
|
"learning_rate": 4.6816214383257864e-05, |
|
"loss": 0.5507, |
|
"num_tokens": 119368272.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.2150170648464164, |
|
"grad_norm": 0.42712415977098717, |
|
"learning_rate": 4.6750781565773524e-05, |
|
"loss": 0.5376, |
|
"num_tokens": 120323497.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.2167235494880546, |
|
"grad_norm": 0.4018174070587826, |
|
"learning_rate": 4.6684735465814114e-05, |
|
"loss": 0.5623, |
|
"num_tokens": 121336091.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.21843003412969283, |
|
"grad_norm": 0.40967629179586657, |
|
"learning_rate": 4.661807818744568e-05, |
|
"loss": 0.5345, |
|
"num_tokens": 122331818.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.22013651877133106, |
|
"grad_norm": 0.4516013972993547, |
|
"learning_rate": 4.6550811854204896e-05, |
|
"loss": 0.545, |
|
"num_tokens": 123276577.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.22184300341296928, |
|
"grad_norm": 0.4167635858598471, |
|
"learning_rate": 4.6482938609031406e-05, |
|
"loss": 0.5574, |
|
"num_tokens": 124260967.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2235494880546075, |
|
"grad_norm": 0.4027183249033178, |
|
"learning_rate": 4.6414460614199614e-05, |
|
"loss": 0.558, |
|
"num_tokens": 125178584.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.22525597269624573, |
|
"grad_norm": 0.3943619941232963, |
|
"learning_rate": 4.6345380051249726e-05, |
|
"loss": 0.5359, |
|
"num_tokens": 126115279.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.22696245733788395, |
|
"grad_norm": 0.41605588299949164, |
|
"learning_rate": 4.627569912091829e-05, |
|
"loss": 0.5308, |
|
"num_tokens": 127123510.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.22866894197952217, |
|
"grad_norm": 0.4232494441201061, |
|
"learning_rate": 4.620542004306808e-05, |
|
"loss": 0.5291, |
|
"num_tokens": 128096244.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.23037542662116042, |
|
"grad_norm": 0.36709719294748555, |
|
"learning_rate": 4.613454505661738e-05, |
|
"loss": 0.545, |
|
"num_tokens": 129070712.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.23208191126279865, |
|
"grad_norm": 0.4284861781533593, |
|
"learning_rate": 4.606307641946867e-05, |
|
"loss": 0.5639, |
|
"num_tokens": 129992439.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.23378839590443687, |
|
"grad_norm": 0.4278163699139823, |
|
"learning_rate": 4.599101640843664e-05, |
|
"loss": 0.539, |
|
"num_tokens": 130917322.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.2354948805460751, |
|
"grad_norm": 0.3880253078166962, |
|
"learning_rate": 4.591836731917573e-05, |
|
"loss": 0.5683, |
|
"num_tokens": 131869001.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.23720136518771331, |
|
"grad_norm": 0.48519543443409835, |
|
"learning_rate": 4.584513146610694e-05, |
|
"loss": 0.5578, |
|
"num_tokens": 132871820.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.23890784982935154, |
|
"grad_norm": 0.4331997241836761, |
|
"learning_rate": 4.577131118234413e-05, |
|
"loss": 0.5642, |
|
"num_tokens": 133787994.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.24061433447098976, |
|
"grad_norm": 0.4124588160404072, |
|
"learning_rate": 4.569690881961967e-05, |
|
"loss": 0.531, |
|
"num_tokens": 134665258.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.24232081911262798, |
|
"grad_norm": 0.40688160744028257, |
|
"learning_rate": 4.562192674820957e-05, |
|
"loss": 0.536, |
|
"num_tokens": 135563718.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2440273037542662, |
|
"grad_norm": 0.3855522562994955, |
|
"learning_rate": 4.554636735685786e-05, |
|
"loss": 0.5366, |
|
"num_tokens": 136530274.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.24573378839590443, |
|
"grad_norm": 0.4249521897707511, |
|
"learning_rate": 4.547023305270064e-05, |
|
"loss": 0.5475, |
|
"num_tokens": 137544925.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.24744027303754265, |
|
"grad_norm": 0.6254856649079702, |
|
"learning_rate": 4.539352626118926e-05, |
|
"loss": 0.5417, |
|
"num_tokens": 138475799.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.24914675767918087, |
|
"grad_norm": 0.3911156875271076, |
|
"learning_rate": 4.5316249426013126e-05, |
|
"loss": 0.5201, |
|
"num_tokens": 139435802.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2508532423208191, |
|
"grad_norm": 0.4688833261931172, |
|
"learning_rate": 4.523840500902183e-05, |
|
"loss": 0.5373, |
|
"num_tokens": 140314284.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.2525597269624573, |
|
"grad_norm": 0.4261015924614589, |
|
"learning_rate": 4.515999549014673e-05, |
|
"loss": 0.5329, |
|
"num_tokens": 141219364.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.25426621160409557, |
|
"grad_norm": 0.46451445077951103, |
|
"learning_rate": 4.5081023367321916e-05, |
|
"loss": 0.5369, |
|
"num_tokens": 142303539.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.25597269624573377, |
|
"grad_norm": 0.4899621819611558, |
|
"learning_rate": 4.500149115640468e-05, |
|
"loss": 0.5736, |
|
"num_tokens": 143301347.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.257679180887372, |
|
"grad_norm": 0.40185507186448355, |
|
"learning_rate": 4.492140139109533e-05, |
|
"loss": 0.529, |
|
"num_tokens": 144231893.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.2593856655290102, |
|
"grad_norm": 0.3751786581425354, |
|
"learning_rate": 4.484075662285647e-05, |
|
"loss": 0.5366, |
|
"num_tokens": 145160611.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.26109215017064846, |
|
"grad_norm": 0.383945095710832, |
|
"learning_rate": 4.475955942083176e-05, |
|
"loss": 0.5286, |
|
"num_tokens": 146121565.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.2627986348122867, |
|
"grad_norm": 0.39293769351911984, |
|
"learning_rate": 4.4677812371764e-05, |
|
"loss": 0.5177, |
|
"num_tokens": 147031619.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2645051194539249, |
|
"grad_norm": 0.36541237345446015, |
|
"learning_rate": 4.45955180799128e-05, |
|
"loss": 0.5289, |
|
"num_tokens": 147981269.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.26621160409556316, |
|
"grad_norm": 0.4692171769339349, |
|
"learning_rate": 4.4512679166971553e-05, |
|
"loss": 0.5489, |
|
"num_tokens": 148964661.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.26791808873720135, |
|
"grad_norm": 0.3687554090744247, |
|
"learning_rate": 4.442929827198395e-05, |
|
"loss": 0.5471, |
|
"num_tokens": 150008239.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.2696245733788396, |
|
"grad_norm": 0.45244715310494926, |
|
"learning_rate": 4.43453780512599e-05, |
|
"loss": 0.5466, |
|
"num_tokens": 150937307.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2713310580204778, |
|
"grad_norm": 0.4452320820857673, |
|
"learning_rate": 4.4260921178290866e-05, |
|
"loss": 0.5407, |
|
"num_tokens": 151860116.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.27303754266211605, |
|
"grad_norm": 0.42732274617096166, |
|
"learning_rate": 4.417593034366478e-05, |
|
"loss": 0.5311, |
|
"num_tokens": 152834849.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.27474402730375425, |
|
"grad_norm": 0.4052649603454169, |
|
"learning_rate": 4.409040825498024e-05, |
|
"loss": 0.5115, |
|
"num_tokens": 153761800.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2764505119453925, |
|
"grad_norm": 0.4397536584781063, |
|
"learning_rate": 4.40043576367603e-05, |
|
"loss": 0.5268, |
|
"num_tokens": 154739335.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2781569965870307, |
|
"grad_norm": 0.42381590061263635, |
|
"learning_rate": 4.3917781230365677e-05, |
|
"loss": 0.5554, |
|
"num_tokens": 155726110.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.27986348122866894, |
|
"grad_norm": 0.4189254439755074, |
|
"learning_rate": 4.383068179390739e-05, |
|
"loss": 0.5435, |
|
"num_tokens": 156709373.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2815699658703072, |
|
"grad_norm": 0.4194447248910214, |
|
"learning_rate": 4.3743062102158896e-05, |
|
"loss": 0.5318, |
|
"num_tokens": 157605031.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.2832764505119454, |
|
"grad_norm": 0.3731419235770229, |
|
"learning_rate": 4.3654924946467724e-05, |
|
"loss": 0.517, |
|
"num_tokens": 158541316.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.28498293515358364, |
|
"grad_norm": 0.3749285811277496, |
|
"learning_rate": 4.3566273134666525e-05, |
|
"loss": 0.5494, |
|
"num_tokens": 159525622.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.28668941979522183, |
|
"grad_norm": 0.4309015308982524, |
|
"learning_rate": 4.3477109490983626e-05, |
|
"loss": 0.5424, |
|
"num_tokens": 160459756.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2883959044368601, |
|
"grad_norm": 0.3986942602299617, |
|
"learning_rate": 4.338743685595304e-05, |
|
"loss": 0.5228, |
|
"num_tokens": 161382919.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.2901023890784983, |
|
"grad_norm": 0.4258708102475343, |
|
"learning_rate": 4.329725808632403e-05, |
|
"loss": 0.5365, |
|
"num_tokens": 162358277.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.29180887372013653, |
|
"grad_norm": 0.38918974623711616, |
|
"learning_rate": 4.320657605497001e-05, |
|
"loss": 0.5522, |
|
"num_tokens": 163332894.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.2935153583617747, |
|
"grad_norm": 0.41923964286389137, |
|
"learning_rate": 4.3115393650797095e-05, |
|
"loss": 0.5384, |
|
"num_tokens": 164401378.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.295221843003413, |
|
"grad_norm": 0.4156858487356743, |
|
"learning_rate": 4.3023713778652074e-05, |
|
"loss": 0.5049, |
|
"num_tokens": 165316411.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.29692832764505117, |
|
"grad_norm": 0.39774241581331693, |
|
"learning_rate": 4.2931539359229804e-05, |
|
"loss": 0.5192, |
|
"num_tokens": 166276916.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2986348122866894, |
|
"grad_norm": 0.35406058199730683, |
|
"learning_rate": 4.283887332898019e-05, |
|
"loss": 0.5127, |
|
"num_tokens": 167298025.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3003412969283277, |
|
"grad_norm": 0.3843676536025804, |
|
"learning_rate": 4.2745718640014696e-05, |
|
"loss": 0.5318, |
|
"num_tokens": 168250987.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.30204778156996587, |
|
"grad_norm": 0.38653674163211976, |
|
"learning_rate": 4.265207826001219e-05, |
|
"loss": 0.5336, |
|
"num_tokens": 169245557.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.3037542662116041, |
|
"grad_norm": 0.4322065456917861, |
|
"learning_rate": 4.255795517212451e-05, |
|
"loss": 0.5489, |
|
"num_tokens": 170217424.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.3054607508532423, |
|
"grad_norm": 0.41164873108292127, |
|
"learning_rate": 4.246335237488136e-05, |
|
"loss": 0.5171, |
|
"num_tokens": 171143325.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.30716723549488056, |
|
"grad_norm": 0.4104853255150601, |
|
"learning_rate": 4.236827288209478e-05, |
|
"loss": 0.5223, |
|
"num_tokens": 172160313.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.30887372013651876, |
|
"grad_norm": 0.43043638343636487, |
|
"learning_rate": 4.2272719722763197e-05, |
|
"loss": 0.5246, |
|
"num_tokens": 173195128.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.310580204778157, |
|
"grad_norm": 0.41502896344603557, |
|
"learning_rate": 4.217669594097485e-05, |
|
"loss": 0.5379, |
|
"num_tokens": 174112017.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3122866894197952, |
|
"grad_norm": 0.40961938646342305, |
|
"learning_rate": 4.208020459581087e-05, |
|
"loss": 0.5343, |
|
"num_tokens": 175151908.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.31399317406143346, |
|
"grad_norm": 0.4009346397822526, |
|
"learning_rate": 4.19832487612478e-05, |
|
"loss": 0.5057, |
|
"num_tokens": 176115268.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.31569965870307165, |
|
"grad_norm": 0.38967085585332806, |
|
"learning_rate": 4.1885831526059674e-05, |
|
"loss": 0.5108, |
|
"num_tokens": 177084976.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3174061433447099, |
|
"grad_norm": 0.3386319476796638, |
|
"learning_rate": 4.178795599371961e-05, |
|
"loss": 0.4975, |
|
"num_tokens": 178049137.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3191126279863481, |
|
"grad_norm": 0.430097609031975, |
|
"learning_rate": 4.168962528230096e-05, |
|
"loss": 0.5321, |
|
"num_tokens": 178990489.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.32081911262798635, |
|
"grad_norm": 0.3918509038411313, |
|
"learning_rate": 4.1590842524377914e-05, |
|
"loss": 0.5297, |
|
"num_tokens": 179947208.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3225255972696246, |
|
"grad_norm": 0.4113406051465179, |
|
"learning_rate": 4.149161086692581e-05, |
|
"loss": 0.5375, |
|
"num_tokens": 180895300.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.3242320819112628, |
|
"grad_norm": 0.3812963085578539, |
|
"learning_rate": 4.139193347122077e-05, |
|
"loss": 0.5323, |
|
"num_tokens": 181891310.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.32593856655290104, |
|
"grad_norm": 0.427163248722511, |
|
"learning_rate": 4.1291813512739074e-05, |
|
"loss": 0.53, |
|
"num_tokens": 182829455.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.32764505119453924, |
|
"grad_norm": 0.3684699313638392, |
|
"learning_rate": 4.1191254181055936e-05, |
|
"loss": 0.52, |
|
"num_tokens": 183776326.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3293515358361775, |
|
"grad_norm": 0.36070078090014446, |
|
"learning_rate": 4.1090258679743934e-05, |
|
"loss": 0.5176, |
|
"num_tokens": 184739434.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.3310580204778157, |
|
"grad_norm": 0.45870874889230046, |
|
"learning_rate": 4.098883022627094e-05, |
|
"loss": 0.5657, |
|
"num_tokens": 185721070.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.33276450511945393, |
|
"grad_norm": 0.4188344945747834, |
|
"learning_rate": 4.0886972051897594e-05, |
|
"loss": 0.533, |
|
"num_tokens": 186739113.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.33447098976109213, |
|
"grad_norm": 0.45500040021056365, |
|
"learning_rate": 4.078468740157439e-05, |
|
"loss": 0.537, |
|
"num_tokens": 187730193.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3361774744027304, |
|
"grad_norm": 0.40761293819970623, |
|
"learning_rate": 4.068197953383832e-05, |
|
"loss": 0.5221, |
|
"num_tokens": 188652348.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.3378839590443686, |
|
"grad_norm": 0.4263098333005298, |
|
"learning_rate": 4.0578851720709e-05, |
|
"loss": 0.519, |
|
"num_tokens": 189611145.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3395904436860068, |
|
"grad_norm": 0.35780922785817854, |
|
"learning_rate": 4.047530724758451e-05, |
|
"loss": 0.5263, |
|
"num_tokens": 190593764.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 0.3827403926446559, |
|
"learning_rate": 4.037134941313668e-05, |
|
"loss": 0.5182, |
|
"num_tokens": 191543591.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3430034129692833, |
|
"grad_norm": 0.391417552124784, |
|
"learning_rate": 4.026698152920599e-05, |
|
"loss": 0.514, |
|
"num_tokens": 192536034.0, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.3447098976109215, |
|
"grad_norm": 0.3750506973668557, |
|
"learning_rate": 4.016220692069612e-05, |
|
"loss": 0.5227, |
|
"num_tokens": 193451364.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3464163822525597, |
|
"grad_norm": 0.40025338962482404, |
|
"learning_rate": 4.005702892546798e-05, |
|
"loss": 0.534, |
|
"num_tokens": 194391841.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.34812286689419797, |
|
"grad_norm": 0.36297817083661593, |
|
"learning_rate": 3.9951450894233365e-05, |
|
"loss": 0.5183, |
|
"num_tokens": 195399830.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.34982935153583616, |
|
"grad_norm": 0.3598098656728241, |
|
"learning_rate": 3.984547619044827e-05, |
|
"loss": 0.5115, |
|
"num_tokens": 196363387.0, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.3515358361774744, |
|
"grad_norm": 0.38480064839017475, |
|
"learning_rate": 3.973910819020567e-05, |
|
"loss": 0.5009, |
|
"num_tokens": 197289380.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3532423208191126, |
|
"grad_norm": 0.37900568253469097, |
|
"learning_rate": 3.963235028212802e-05, |
|
"loss": 0.5334, |
|
"num_tokens": 198263603.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.35494880546075086, |
|
"grad_norm": 0.35505267262656565, |
|
"learning_rate": 3.9525205867259246e-05, |
|
"loss": 0.4977, |
|
"num_tokens": 199248080.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.35665529010238906, |
|
"grad_norm": 0.4178322580187183, |
|
"learning_rate": 3.941767835895647e-05, |
|
"loss": 0.5247, |
|
"num_tokens": 200182740.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.3583617747440273, |
|
"grad_norm": 0.3886991806809196, |
|
"learning_rate": 3.9309771182781194e-05, |
|
"loss": 0.5592, |
|
"num_tokens": 201135153.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.36006825938566556, |
|
"grad_norm": 0.3729591088065611, |
|
"learning_rate": 3.9201487776390215e-05, |
|
"loss": 0.5174, |
|
"num_tokens": 202034499.0, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.36177474402730375, |
|
"grad_norm": 0.43050604928861597, |
|
"learning_rate": 3.90928315894261e-05, |
|
"loss": 0.5203, |
|
"num_tokens": 202936389.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.363481228668942, |
|
"grad_norm": 0.3903868868255724, |
|
"learning_rate": 3.898380608340728e-05, |
|
"loss": 0.5121, |
|
"num_tokens": 203808949.0, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.3651877133105802, |
|
"grad_norm": 0.43315506398310644, |
|
"learning_rate": 3.887441473161779e-05, |
|
"loss": 0.5268, |
|
"num_tokens": 204803047.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.36689419795221845, |
|
"grad_norm": 0.3826063463200933, |
|
"learning_rate": 3.87646610189966e-05, |
|
"loss": 0.526, |
|
"num_tokens": 205764413.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.36860068259385664, |
|
"grad_norm": 0.3804292887775751, |
|
"learning_rate": 3.8654548442026615e-05, |
|
"loss": 0.5121, |
|
"num_tokens": 206795183.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3703071672354949, |
|
"grad_norm": 0.386486637072244, |
|
"learning_rate": 3.854408050862326e-05, |
|
"loss": 0.5197, |
|
"num_tokens": 207776278.0, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.3720136518771331, |
|
"grad_norm": 0.3679656863557538, |
|
"learning_rate": 3.843326073802275e-05, |
|
"loss": 0.5109, |
|
"num_tokens": 208806680.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.37372013651877134, |
|
"grad_norm": 0.36341929778692034, |
|
"learning_rate": 3.832209266066996e-05, |
|
"loss": 0.5117, |
|
"num_tokens": 209705965.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.37542662116040953, |
|
"grad_norm": 0.36095775324919993, |
|
"learning_rate": 3.821057981810597e-05, |
|
"loss": 0.5173, |
|
"num_tokens": 210620229.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3771331058020478, |
|
"grad_norm": 0.37920120200226964, |
|
"learning_rate": 3.809872576285522e-05, |
|
"loss": 0.5278, |
|
"num_tokens": 211518941.0, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.378839590443686, |
|
"grad_norm": 0.37125338141855646, |
|
"learning_rate": 3.798653405831236e-05, |
|
"loss": 0.5213, |
|
"num_tokens": 212481056.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.38054607508532423, |
|
"grad_norm": 0.39363430670991295, |
|
"learning_rate": 3.78740082786287e-05, |
|
"loss": 0.5081, |
|
"num_tokens": 213401212.0, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.3822525597269625, |
|
"grad_norm": 0.41301840155723674, |
|
"learning_rate": 3.7761152008598356e-05, |
|
"loss": 0.5262, |
|
"num_tokens": 214296967.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3839590443686007, |
|
"grad_norm": 0.4004388299714662, |
|
"learning_rate": 3.764796884354408e-05, |
|
"loss": 0.5295, |
|
"num_tokens": 215306580.0, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.3856655290102389, |
|
"grad_norm": 0.35857577455632367, |
|
"learning_rate": 3.7534462389202655e-05, |
|
"loss": 0.5328, |
|
"num_tokens": 216266686.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3873720136518771, |
|
"grad_norm": 0.357927002022941, |
|
"learning_rate": 3.742063626161011e-05, |
|
"loss": 0.5307, |
|
"num_tokens": 217244190.0, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.3890784982935154, |
|
"grad_norm": 0.4316739204850572, |
|
"learning_rate": 3.7306494086986424e-05, |
|
"loss": 0.5115, |
|
"num_tokens": 218179883.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.39078498293515357, |
|
"grad_norm": 0.3236368199361241, |
|
"learning_rate": 3.7192039501620114e-05, |
|
"loss": 0.5265, |
|
"num_tokens": 219217201.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.3924914675767918, |
|
"grad_norm": 0.38665727290508983, |
|
"learning_rate": 3.7077276151752274e-05, |
|
"loss": 0.5137, |
|
"num_tokens": 220144855.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.39419795221843, |
|
"grad_norm": 0.37991237354351537, |
|
"learning_rate": 3.696220769346052e-05, |
|
"loss": 0.515, |
|
"num_tokens": 221131861.0, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.39590443686006827, |
|
"grad_norm": 0.4047905746984203, |
|
"learning_rate": 3.6846837792542446e-05, |
|
"loss": 0.5289, |
|
"num_tokens": 222093783.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.39761092150170646, |
|
"grad_norm": 0.4226911202426522, |
|
"learning_rate": 3.673117012439889e-05, |
|
"loss": 0.5267, |
|
"num_tokens": 223054352.0, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3993174061433447, |
|
"grad_norm": 0.37522461139025454, |
|
"learning_rate": 3.6615208373916775e-05, |
|
"loss": 0.4879, |
|
"num_tokens": 223929151.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.40102389078498296, |
|
"grad_norm": 0.40207189529772014, |
|
"learning_rate": 3.6498956235351815e-05, |
|
"loss": 0.5245, |
|
"num_tokens": 224865728.0, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.40273037542662116, |
|
"grad_norm": 0.3930165821540444, |
|
"learning_rate": 3.6382417412210744e-05, |
|
"loss": 0.5087, |
|
"num_tokens": 225865685.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4044368600682594, |
|
"grad_norm": 0.3578549306424923, |
|
"learning_rate": 3.6265595617133366e-05, |
|
"loss": 0.4939, |
|
"num_tokens": 226749326.0, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.4061433447098976, |
|
"grad_norm": 0.36807883381787004, |
|
"learning_rate": 3.6148494571774275e-05, |
|
"loss": 0.5286, |
|
"num_tokens": 227786006.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.40784982935153585, |
|
"grad_norm": 0.41611237665417367, |
|
"learning_rate": 3.603111800668428e-05, |
|
"loss": 0.5099, |
|
"num_tokens": 228763631.0, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.40955631399317405, |
|
"grad_norm": 0.3799547117691007, |
|
"learning_rate": 3.591346966119159e-05, |
|
"loss": 0.5094, |
|
"num_tokens": 229748231.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4112627986348123, |
|
"grad_norm": 0.3478677676406051, |
|
"learning_rate": 3.579555328328265e-05, |
|
"loss": 0.5117, |
|
"num_tokens": 230738165.0, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.4129692832764505, |
|
"grad_norm": 0.3296304695850409, |
|
"learning_rate": 3.5677372629482775e-05, |
|
"loss": 0.521, |
|
"num_tokens": 231716185.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.41467576791808874, |
|
"grad_norm": 0.382979600202463, |
|
"learning_rate": 3.555893146473644e-05, |
|
"loss": 0.5262, |
|
"num_tokens": 232698142.0, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.41638225255972694, |
|
"grad_norm": 0.35116928864638025, |
|
"learning_rate": 3.5440233562287376e-05, |
|
"loss": 0.5417, |
|
"num_tokens": 233655900.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.4180887372013652, |
|
"grad_norm": 0.3728136743436132, |
|
"learning_rate": 3.532128270355832e-05, |
|
"loss": 0.516, |
|
"num_tokens": 234596302.0, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.4197952218430034, |
|
"grad_norm": 0.4116335593681845, |
|
"learning_rate": 3.520208267803059e-05, |
|
"loss": 0.5242, |
|
"num_tokens": 235502719.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.42150170648464164, |
|
"grad_norm": 0.38580680323921696, |
|
"learning_rate": 3.508263728312336e-05, |
|
"loss": 0.5278, |
|
"num_tokens": 236475023.0, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.4232081911262799, |
|
"grad_norm": 0.38551884251450047, |
|
"learning_rate": 3.496295032407263e-05, |
|
"loss": 0.5229, |
|
"num_tokens": 237433481.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4249146757679181, |
|
"grad_norm": 0.38557583134333984, |
|
"learning_rate": 3.484302561381007e-05, |
|
"loss": 0.5029, |
|
"num_tokens": 238378423.0, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.42662116040955633, |
|
"grad_norm": 0.40297119331954867, |
|
"learning_rate": 3.47228669728415e-05, |
|
"loss": 0.5288, |
|
"num_tokens": 239310469.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4283276450511945, |
|
"grad_norm": 0.36323193941329485, |
|
"learning_rate": 3.4602478229125197e-05, |
|
"loss": 0.5178, |
|
"num_tokens": 240265629.0, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.4300341296928328, |
|
"grad_norm": 0.4010782882289214, |
|
"learning_rate": 3.4481863217949964e-05, |
|
"loss": 0.5211, |
|
"num_tokens": 241153898.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.431740614334471, |
|
"grad_norm": 0.3725440207487266, |
|
"learning_rate": 3.43610257818129e-05, |
|
"loss": 0.5339, |
|
"num_tokens": 242074086.0, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.4334470989761092, |
|
"grad_norm": 0.3573796647038546, |
|
"learning_rate": 3.4239969770297033e-05, |
|
"loss": 0.5275, |
|
"num_tokens": 243032696.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.4351535836177474, |
|
"grad_norm": 0.3500180473141626, |
|
"learning_rate": 3.411869903994867e-05, |
|
"loss": 0.5237, |
|
"num_tokens": 244052484.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.43686006825938567, |
|
"grad_norm": 0.3630134268573173, |
|
"learning_rate": 3.399721745415451e-05, |
|
"loss": 0.4863, |
|
"num_tokens": 245008254.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.43856655290102387, |
|
"grad_norm": 0.3276111054395765, |
|
"learning_rate": 3.38755288830186e-05, |
|
"loss": 0.5239, |
|
"num_tokens": 246076299.0, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.4402730375426621, |
|
"grad_norm": 0.37922721950544974, |
|
"learning_rate": 3.375363720323904e-05, |
|
"loss": 0.5558, |
|
"num_tokens": 247016964.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.44197952218430037, |
|
"grad_norm": 0.4053020528274456, |
|
"learning_rate": 3.363154629798444e-05, |
|
"loss": 0.4991, |
|
"num_tokens": 247913243.0, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.44368600682593856, |
|
"grad_norm": 0.39968767139036077, |
|
"learning_rate": 3.350926005677027e-05, |
|
"loss": 0.5163, |
|
"num_tokens": 248791992.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4453924914675768, |
|
"grad_norm": 0.4196675646397248, |
|
"learning_rate": 3.338678237533491e-05, |
|
"loss": 0.5155, |
|
"num_tokens": 249736240.0, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.447098976109215, |
|
"grad_norm": 0.36733233226120704, |
|
"learning_rate": 3.326411715551559e-05, |
|
"loss": 0.5187, |
|
"num_tokens": 250713070.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.44880546075085326, |
|
"grad_norm": 0.3526872487151841, |
|
"learning_rate": 3.314126830512397e-05, |
|
"loss": 0.5183, |
|
"num_tokens": 251635307.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.45051194539249145, |
|
"grad_norm": 0.3666740114223966, |
|
"learning_rate": 3.3018239737821806e-05, |
|
"loss": 0.4913, |
|
"num_tokens": 252648795.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.4522184300341297, |
|
"grad_norm": 0.3865152760583026, |
|
"learning_rate": 3.289503537299616e-05, |
|
"loss": 0.5326, |
|
"num_tokens": 253618343.0, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.4539249146757679, |
|
"grad_norm": 0.3882700849297493, |
|
"learning_rate": 3.2771659135634564e-05, |
|
"loss": 0.5033, |
|
"num_tokens": 254539106.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.45563139931740615, |
|
"grad_norm": 0.37283505962982216, |
|
"learning_rate": 3.2648114956200005e-05, |
|
"loss": 0.5134, |
|
"num_tokens": 255475551.0, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.45733788395904434, |
|
"grad_norm": 0.3736180838966003, |
|
"learning_rate": 3.2524406770505675e-05, |
|
"loss": 0.5212, |
|
"num_tokens": 256460069.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.4590443686006826, |
|
"grad_norm": 0.3711824529096275, |
|
"learning_rate": 3.240053851958961e-05, |
|
"loss": 0.4986, |
|
"num_tokens": 257384246.0, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.46075085324232085, |
|
"grad_norm": 0.35194816416518127, |
|
"learning_rate": 3.227651414958912e-05, |
|
"loss": 0.4996, |
|
"num_tokens": 258439462.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.46245733788395904, |
|
"grad_norm": 0.3636023146472485, |
|
"learning_rate": 3.2152337611615096e-05, |
|
"loss": 0.5128, |
|
"num_tokens": 259419905.0, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.4641638225255973, |
|
"grad_norm": 0.3499395456709178, |
|
"learning_rate": 3.202801286162611e-05, |
|
"loss": 0.529, |
|
"num_tokens": 260499223.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4658703071672355, |
|
"grad_norm": 0.344200324854834, |
|
"learning_rate": 3.1903543860302445e-05, |
|
"loss": 0.4954, |
|
"num_tokens": 261442637.0, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.46757679180887374, |
|
"grad_norm": 0.34635826868295416, |
|
"learning_rate": 3.1778934572919805e-05, |
|
"loss": 0.5053, |
|
"num_tokens": 262428104.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.46928327645051193, |
|
"grad_norm": 0.3434947744560547, |
|
"learning_rate": 3.165418896922313e-05, |
|
"loss": 0.4892, |
|
"num_tokens": 263310660.0, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.4709897610921502, |
|
"grad_norm": 0.36603548416607035, |
|
"learning_rate": 3.152931102330002e-05, |
|
"loss": 0.5193, |
|
"num_tokens": 264331327.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4726962457337884, |
|
"grad_norm": 0.36277672500545255, |
|
"learning_rate": 3.140430471345419e-05, |
|
"loss": 0.5103, |
|
"num_tokens": 265270147.0, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.47440273037542663, |
|
"grad_norm": 0.36093915798723425, |
|
"learning_rate": 3.127917402207871e-05, |
|
"loss": 0.5125, |
|
"num_tokens": 266242185.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.4761092150170648, |
|
"grad_norm": 0.3391219737377559, |
|
"learning_rate": 3.115392293552915e-05, |
|
"loss": 0.5119, |
|
"num_tokens": 267191130.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.4778156996587031, |
|
"grad_norm": 0.35339290653989003, |
|
"learning_rate": 3.1028555443996544e-05, |
|
"loss": 0.5099, |
|
"num_tokens": 268142845.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.47952218430034127, |
|
"grad_norm": 0.3520073614880941, |
|
"learning_rate": 3.090307554138033e-05, |
|
"loss": 0.527, |
|
"num_tokens": 269116555.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.4812286689419795, |
|
"grad_norm": 0.32746517176816037, |
|
"learning_rate": 3.0777487225161096e-05, |
|
"loss": 0.5171, |
|
"num_tokens": 270078357.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.48293515358361777, |
|
"grad_norm": 0.3801899716265933, |
|
"learning_rate": 3.065179449627316e-05, |
|
"loss": 0.5179, |
|
"num_tokens": 271065401.0, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.48464163822525597, |
|
"grad_norm": 0.34972336470548876, |
|
"learning_rate": 3.0526001358977254e-05, |
|
"loss": 0.5192, |
|
"num_tokens": 272018748.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.4863481228668942, |
|
"grad_norm": 0.4088433608953109, |
|
"learning_rate": 3.0400111820732802e-05, |
|
"loss": 0.5202, |
|
"num_tokens": 273051158.0, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.4880546075085324, |
|
"grad_norm": 0.4045530258228776, |
|
"learning_rate": 3.0274129892070368e-05, |
|
"loss": 0.5363, |
|
"num_tokens": 274027158.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.48976109215017066, |
|
"grad_norm": 0.37760264633069307, |
|
"learning_rate": 3.014805958646383e-05, |
|
"loss": 0.5071, |
|
"num_tokens": 274976608.0, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.49146757679180886, |
|
"grad_norm": 0.41544044167849326, |
|
"learning_rate": 3.002190492020255e-05, |
|
"loss": 0.5336, |
|
"num_tokens": 275897357.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4931740614334471, |
|
"grad_norm": 0.3621173202789576, |
|
"learning_rate": 2.9895669912263393e-05, |
|
"loss": 0.4884, |
|
"num_tokens": 276767022.0, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.4948805460750853, |
|
"grad_norm": 0.35971250539401595, |
|
"learning_rate": 2.9769358584182732e-05, |
|
"loss": 0.4929, |
|
"num_tokens": 277733458.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.49658703071672355, |
|
"grad_norm": 0.3333198556836057, |
|
"learning_rate": 2.9642974959928293e-05, |
|
"loss": 0.5181, |
|
"num_tokens": 278655070.0, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.49829351535836175, |
|
"grad_norm": 0.40269115529983035, |
|
"learning_rate": 2.9516523065771e-05, |
|
"loss": 0.5092, |
|
"num_tokens": 279550428.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.3375843000024675, |
|
"learning_rate": 2.9390006930156683e-05, |
|
"loss": 0.5035, |
|
"num_tokens": 280592599.0, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.5017064846416383, |
|
"grad_norm": 0.33837995836306645, |
|
"learning_rate": 2.9263430583577715e-05, |
|
"loss": 0.4936, |
|
"num_tokens": 281502549.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.5034129692832765, |
|
"grad_norm": 0.3491138042125671, |
|
"learning_rate": 2.9136798058444704e-05, |
|
"loss": 0.5186, |
|
"num_tokens": 282554594.0, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.5051194539249146, |
|
"grad_norm": 0.36149705403685856, |
|
"learning_rate": 2.9010113388957906e-05, |
|
"loss": 0.4996, |
|
"num_tokens": 283508120.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5068259385665529, |
|
"grad_norm": 0.3682570368717468, |
|
"learning_rate": 2.8883380610978804e-05, |
|
"loss": 0.4868, |
|
"num_tokens": 284430674.0, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.5085324232081911, |
|
"grad_norm": 0.3450199220270282, |
|
"learning_rate": 2.875660376190149e-05, |
|
"loss": 0.5225, |
|
"num_tokens": 285480194.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5102389078498294, |
|
"grad_norm": 0.35852992172619397, |
|
"learning_rate": 2.8629786880524057e-05, |
|
"loss": 0.5044, |
|
"num_tokens": 286426656.0, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.5119453924914675, |
|
"grad_norm": 0.35758605357343837, |
|
"learning_rate": 2.8502934006919908e-05, |
|
"loss": 0.531, |
|
"num_tokens": 287419124.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5136518771331058, |
|
"grad_norm": 0.3679994961058525, |
|
"learning_rate": 2.83760491823091e-05, |
|
"loss": 0.4891, |
|
"num_tokens": 288343301.0, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.515358361774744, |
|
"grad_norm": 0.386078898523489, |
|
"learning_rate": 2.824913644892955e-05, |
|
"loss": 0.4912, |
|
"num_tokens": 289306762.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5170648464163823, |
|
"grad_norm": 0.33205328887110974, |
|
"learning_rate": 2.8122199849908286e-05, |
|
"loss": 0.5047, |
|
"num_tokens": 290236538.0, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.5187713310580204, |
|
"grad_norm": 0.36598920383011924, |
|
"learning_rate": 2.7995243429132644e-05, |
|
"loss": 0.5082, |
|
"num_tokens": 291105578.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5204778156996587, |
|
"grad_norm": 0.3577740364047028, |
|
"learning_rate": 2.7868271231121406e-05, |
|
"loss": 0.5271, |
|
"num_tokens": 292089939.0, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5221843003412969, |
|
"grad_norm": 0.3395160116353141, |
|
"learning_rate": 2.7741287300896013e-05, |
|
"loss": 0.4958, |
|
"num_tokens": 293082816.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.5238907849829352, |
|
"grad_norm": 0.3750202797810289, |
|
"learning_rate": 2.7614295683851637e-05, |
|
"loss": 0.5043, |
|
"num_tokens": 293957075.0, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.5255972696245734, |
|
"grad_norm": 0.36593612437784134, |
|
"learning_rate": 2.7487300425628347e-05, |
|
"loss": 0.4999, |
|
"num_tokens": 294930434.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5273037542662116, |
|
"grad_norm": 0.36077082410017175, |
|
"learning_rate": 2.7360305571982213e-05, |
|
"loss": 0.517, |
|
"num_tokens": 295898443.0, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.5290102389078498, |
|
"grad_norm": 0.34084324496634494, |
|
"learning_rate": 2.723331516865641e-05, |
|
"loss": 0.5042, |
|
"num_tokens": 296842807.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5307167235494881, |
|
"grad_norm": 0.3473414935833505, |
|
"learning_rate": 2.7106333261252342e-05, |
|
"loss": 0.5141, |
|
"num_tokens": 297874811.0, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.5324232081911263, |
|
"grad_norm": 0.37129914119401464, |
|
"learning_rate": 2.697936389510073e-05, |
|
"loss": 0.5019, |
|
"num_tokens": 298726998.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5341296928327645, |
|
"grad_norm": 0.36028226696611454, |
|
"learning_rate": 2.685241111513281e-05, |
|
"loss": 0.5116, |
|
"num_tokens": 299723782.0, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.5358361774744027, |
|
"grad_norm": 0.3655240725721465, |
|
"learning_rate": 2.6725478965751378e-05, |
|
"loss": 0.4864, |
|
"num_tokens": 300660125.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.537542662116041, |
|
"grad_norm": 0.3527263064768574, |
|
"learning_rate": 2.6598571490702013e-05, |
|
"loss": 0.4997, |
|
"num_tokens": 301489572.0, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.5392491467576792, |
|
"grad_norm": 0.3514385599593041, |
|
"learning_rate": 2.6471692732944227e-05, |
|
"loss": 0.4773, |
|
"num_tokens": 302437719.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5409556313993175, |
|
"grad_norm": 0.5417409716600186, |
|
"learning_rate": 2.634484673452265e-05, |
|
"loss": 0.5256, |
|
"num_tokens": 303463770.0, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.5426621160409556, |
|
"grad_norm": 0.3711273299009024, |
|
"learning_rate": 2.6218037536438315e-05, |
|
"loss": 0.5067, |
|
"num_tokens": 304343518.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.5443686006825939, |
|
"grad_norm": 0.38356094617087266, |
|
"learning_rate": 2.6091269178519885e-05, |
|
"loss": 0.5195, |
|
"num_tokens": 305270656.0, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.5460750853242321, |
|
"grad_norm": 0.36905989432450675, |
|
"learning_rate": 2.5964545699294906e-05, |
|
"loss": 0.5049, |
|
"num_tokens": 306180961.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5477815699658704, |
|
"grad_norm": 0.3692380599855385, |
|
"learning_rate": 2.583787113586126e-05, |
|
"loss": 0.5315, |
|
"num_tokens": 307152419.0, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.5494880546075085, |
|
"grad_norm": 0.3565584564996635, |
|
"learning_rate": 2.571124952375845e-05, |
|
"loss": 0.5028, |
|
"num_tokens": 308076053.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.5511945392491467, |
|
"grad_norm": 0.6182642964929036, |
|
"learning_rate": 2.55846848968391e-05, |
|
"loss": 0.5168, |
|
"num_tokens": 309029777.0, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.552901023890785, |
|
"grad_norm": 0.3804959240952762, |
|
"learning_rate": 2.545818128714043e-05, |
|
"loss": 0.4985, |
|
"num_tokens": 310003006.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.5546075085324232, |
|
"grad_norm": 0.3585357836902996, |
|
"learning_rate": 2.533174272475579e-05, |
|
"loss": 0.4889, |
|
"num_tokens": 310946881.0, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.5563139931740614, |
|
"grad_norm": 0.36197226313264375, |
|
"learning_rate": 2.52053732377063e-05, |
|
"loss": 0.5011, |
|
"num_tokens": 311908102.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.5580204778156996, |
|
"grad_norm": 0.40032263570721643, |
|
"learning_rate": 2.5079076851812476e-05, |
|
"loss": 0.5089, |
|
"num_tokens": 312808887.0, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.5597269624573379, |
|
"grad_norm": 0.3451971824355068, |
|
"learning_rate": 2.4952857590566043e-05, |
|
"loss": 0.493, |
|
"num_tokens": 313777123.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.5614334470989761, |
|
"grad_norm": 0.3900504406034111, |
|
"learning_rate": 2.4826719475001714e-05, |
|
"loss": 0.5094, |
|
"num_tokens": 314739056.0, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.5631399317406144, |
|
"grad_norm": 0.3493313787920549, |
|
"learning_rate": 2.4700666523569106e-05, |
|
"loss": 0.4898, |
|
"num_tokens": 315742426.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5648464163822525, |
|
"grad_norm": 0.3438326732935695, |
|
"learning_rate": 2.4574702752004703e-05, |
|
"loss": 0.5175, |
|
"num_tokens": 316737000.0, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.5665529010238908, |
|
"grad_norm": 0.35276865905225485, |
|
"learning_rate": 2.444883217320395e-05, |
|
"loss": 0.495, |
|
"num_tokens": 317577413.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.568259385665529, |
|
"grad_norm": 0.3407109448961259, |
|
"learning_rate": 2.4323058797093395e-05, |
|
"loss": 0.504, |
|
"num_tokens": 318577669.0, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.5699658703071673, |
|
"grad_norm": 0.4001170454014252, |
|
"learning_rate": 2.4197386630502965e-05, |
|
"loss": 0.4969, |
|
"num_tokens": 319557900.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.5716723549488054, |
|
"grad_norm": 0.34709303207454156, |
|
"learning_rate": 2.407181967703826e-05, |
|
"loss": 0.5009, |
|
"num_tokens": 320511805.0, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.5733788395904437, |
|
"grad_norm": 0.371204319077067, |
|
"learning_rate": 2.3946361936953092e-05, |
|
"loss": 0.5075, |
|
"num_tokens": 321462994.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5750853242320819, |
|
"grad_norm": 0.35657413743817584, |
|
"learning_rate": 2.382101740702199e-05, |
|
"loss": 0.4846, |
|
"num_tokens": 322380429.0, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.5767918088737202, |
|
"grad_norm": 0.3373065447243538, |
|
"learning_rate": 2.369579008041286e-05, |
|
"loss": 0.5064, |
|
"num_tokens": 323355363.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.5784982935153583, |
|
"grad_norm": 0.34242628423967963, |
|
"learning_rate": 2.3570683946559835e-05, |
|
"loss": 0.5057, |
|
"num_tokens": 324276849.0, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.5802047781569966, |
|
"grad_norm": 0.3424829522431512, |
|
"learning_rate": 2.3445702991036138e-05, |
|
"loss": 0.4915, |
|
"num_tokens": 325155802.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5819112627986348, |
|
"grad_norm": 0.38418791267218005, |
|
"learning_rate": 2.332085119542711e-05, |
|
"loss": 0.4747, |
|
"num_tokens": 325996402.0, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.5836177474402731, |
|
"grad_norm": 0.3720571745285186, |
|
"learning_rate": 2.319613253720338e-05, |
|
"loss": 0.5314, |
|
"num_tokens": 326956942.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5853242320819113, |
|
"grad_norm": 0.35719133127267255, |
|
"learning_rate": 2.3071550989594133e-05, |
|
"loss": 0.5122, |
|
"num_tokens": 327985119.0, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.5870307167235495, |
|
"grad_norm": 0.38622831687102893, |
|
"learning_rate": 2.2947110521460567e-05, |
|
"loss": 0.4888, |
|
"num_tokens": 328885222.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.5887372013651877, |
|
"grad_norm": 0.35596529449616776, |
|
"learning_rate": 2.2822815097169447e-05, |
|
"loss": 0.5065, |
|
"num_tokens": 329923181.0, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.590443686006826, |
|
"grad_norm": 0.36830928128106777, |
|
"learning_rate": 2.269866867646675e-05, |
|
"loss": 0.4908, |
|
"num_tokens": 330878184.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5921501706484642, |
|
"grad_norm": 0.333509463984278, |
|
"learning_rate": 2.2574675214351622e-05, |
|
"loss": 0.4683, |
|
"num_tokens": 331849770.0, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.5938566552901023, |
|
"grad_norm": 0.3657723804929488, |
|
"learning_rate": 2.245083866095029e-05, |
|
"loss": 0.498, |
|
"num_tokens": 332821824.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5955631399317406, |
|
"grad_norm": 0.3709027281629384, |
|
"learning_rate": 2.2327162961390254e-05, |
|
"loss": 0.5101, |
|
"num_tokens": 333794769.0, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.5972696245733788, |
|
"grad_norm": 0.34103457669200804, |
|
"learning_rate": 2.2203652055674633e-05, |
|
"loss": 0.4935, |
|
"num_tokens": 334798404.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5989761092150171, |
|
"grad_norm": 0.3545726343474071, |
|
"learning_rate": 2.20803098785566e-05, |
|
"loss": 0.4833, |
|
"num_tokens": 335687213.0, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.6006825938566553, |
|
"grad_norm": 0.3340674324445749, |
|
"learning_rate": 2.1957140359414063e-05, |
|
"loss": 0.4651, |
|
"num_tokens": 336651049.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6023890784982935, |
|
"grad_norm": 0.36127627803351964, |
|
"learning_rate": 2.1834147422124463e-05, |
|
"loss": 0.4772, |
|
"num_tokens": 337519072.0, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.6040955631399317, |
|
"grad_norm": 0.3835855921099779, |
|
"learning_rate": 2.1711334984939767e-05, |
|
"loss": 0.5155, |
|
"num_tokens": 338438000.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.60580204778157, |
|
"grad_norm": 0.30765262583419745, |
|
"learning_rate": 2.1588706960361682e-05, |
|
"loss": 0.5165, |
|
"num_tokens": 339506233.0, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6075085324232082, |
|
"grad_norm": 0.3248110534125549, |
|
"learning_rate": 2.146626725501697e-05, |
|
"loss": 0.4952, |
|
"num_tokens": 340414967.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6092150170648464, |
|
"grad_norm": 0.3774926928186119, |
|
"learning_rate": 2.134401976953299e-05, |
|
"loss": 0.5206, |
|
"num_tokens": 341348316.0, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.6109215017064846, |
|
"grad_norm": 0.3532666917926633, |
|
"learning_rate": 2.1221968398413477e-05, |
|
"loss": 0.4882, |
|
"num_tokens": 342244656.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6126279863481229, |
|
"grad_norm": 0.3502010878134099, |
|
"learning_rate": 2.1100117029914434e-05, |
|
"loss": 0.4849, |
|
"num_tokens": 343244894.0, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.6143344709897611, |
|
"grad_norm": 0.34456710514750377, |
|
"learning_rate": 2.0978469545920254e-05, |
|
"loss": 0.5066, |
|
"num_tokens": 344295726.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6160409556313993, |
|
"grad_norm": 0.32734359098421567, |
|
"learning_rate": 2.0857029821820113e-05, |
|
"loss": 0.5014, |
|
"num_tokens": 345312852.0, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.6177474402730375, |
|
"grad_norm": 0.37196057474243177, |
|
"learning_rate": 2.0735801726384436e-05, |
|
"loss": 0.5103, |
|
"num_tokens": 346263433.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6194539249146758, |
|
"grad_norm": 0.32459376752536473, |
|
"learning_rate": 2.0614789121641688e-05, |
|
"loss": 0.5038, |
|
"num_tokens": 347219412.0, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.621160409556314, |
|
"grad_norm": 0.36986659746774475, |
|
"learning_rate": 2.0493995862755333e-05, |
|
"loss": 0.4975, |
|
"num_tokens": 348137882.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6228668941979523, |
|
"grad_norm": 0.4026711619598764, |
|
"learning_rate": 2.0373425797901024e-05, |
|
"loss": 0.5169, |
|
"num_tokens": 349064203.0, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.6245733788395904, |
|
"grad_norm": 0.3497696696697358, |
|
"learning_rate": 2.0253082768143976e-05, |
|
"loss": 0.4985, |
|
"num_tokens": 349987787.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.6262798634812287, |
|
"grad_norm": 0.35282576206861677, |
|
"learning_rate": 2.0132970607316677e-05, |
|
"loss": 0.4961, |
|
"num_tokens": 350963679.0, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.6279863481228669, |
|
"grad_norm": 0.33944555479530397, |
|
"learning_rate": 2.0013093141896634e-05, |
|
"loss": 0.4743, |
|
"num_tokens": 351875623.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.6296928327645052, |
|
"grad_norm": 0.3720204410754147, |
|
"learning_rate": 1.989345419088458e-05, |
|
"loss": 0.4853, |
|
"num_tokens": 352834128.0, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.6313993174061433, |
|
"grad_norm": 0.3288718937132713, |
|
"learning_rate": 1.9774057565682768e-05, |
|
"loss": 0.4954, |
|
"num_tokens": 353796065.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.6331058020477816, |
|
"grad_norm": 0.3227249405857713, |
|
"learning_rate": 1.965490706997351e-05, |
|
"loss": 0.4869, |
|
"num_tokens": 354752780.0, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.6348122866894198, |
|
"grad_norm": 0.3312191918205175, |
|
"learning_rate": 1.9536006499598085e-05, |
|
"loss": 0.4953, |
|
"num_tokens": 355697743.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.636518771331058, |
|
"grad_norm": 0.3273924968177549, |
|
"learning_rate": 1.941735964243574e-05, |
|
"loss": 0.4905, |
|
"num_tokens": 356707970.0, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.6382252559726962, |
|
"grad_norm": 0.3733694588578265, |
|
"learning_rate": 1.9298970278283046e-05, |
|
"loss": 0.5312, |
|
"num_tokens": 357607500.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.6399317406143344, |
|
"grad_norm": 0.3377666062483733, |
|
"learning_rate": 1.918084217873349e-05, |
|
"loss": 0.5072, |
|
"num_tokens": 358549752.0, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.6416382252559727, |
|
"grad_norm": 0.3795132924337565, |
|
"learning_rate": 1.90629791070573e-05, |
|
"loss": 0.5073, |
|
"num_tokens": 359496768.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.643344709897611, |
|
"grad_norm": 0.3393948612318191, |
|
"learning_rate": 1.8945384818081574e-05, |
|
"loss": 0.4666, |
|
"num_tokens": 360449477.0, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.6450511945392492, |
|
"grad_norm": 0.3362849089493312, |
|
"learning_rate": 1.882806305807067e-05, |
|
"loss": 0.4991, |
|
"num_tokens": 361389017.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.6467576791808873, |
|
"grad_norm": 0.34043285210954316, |
|
"learning_rate": 1.871101756460682e-05, |
|
"loss": 0.4755, |
|
"num_tokens": 362299106.0, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.6484641638225256, |
|
"grad_norm": 0.3496235222336366, |
|
"learning_rate": 1.8594252066471108e-05, |
|
"loss": 0.4994, |
|
"num_tokens": 363249804.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6501706484641638, |
|
"grad_norm": 0.35219312570924277, |
|
"learning_rate": 1.847777028352463e-05, |
|
"loss": 0.505, |
|
"num_tokens": 364170107.0, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.6518771331058021, |
|
"grad_norm": 0.3675703568778994, |
|
"learning_rate": 1.8361575926590034e-05, |
|
"loss": 0.4798, |
|
"num_tokens": 365084839.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.6535836177474402, |
|
"grad_norm": 0.33143812395942357, |
|
"learning_rate": 1.8245672697333288e-05, |
|
"loss": 0.4933, |
|
"num_tokens": 365997466.0, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.6552901023890785, |
|
"grad_norm": 0.32956885993414253, |
|
"learning_rate": 1.8130064288145737e-05, |
|
"loss": 0.4724, |
|
"num_tokens": 366923656.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.6569965870307167, |
|
"grad_norm": 0.3282570628204338, |
|
"learning_rate": 1.801475438202648e-05, |
|
"loss": 0.5023, |
|
"num_tokens": 367958683.0, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.658703071672355, |
|
"grad_norm": 0.33433791259781837, |
|
"learning_rate": 1.789974665246507e-05, |
|
"loss": 0.5161, |
|
"num_tokens": 368897813.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.6604095563139932, |
|
"grad_norm": 0.33188996124132675, |
|
"learning_rate": 1.7785044763324415e-05, |
|
"loss": 0.4924, |
|
"num_tokens": 369808844.0, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.6621160409556314, |
|
"grad_norm": 0.3305327165261152, |
|
"learning_rate": 1.7670652368724144e-05, |
|
"loss": 0.4928, |
|
"num_tokens": 370786942.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.6638225255972696, |
|
"grad_norm": 0.3292219960944428, |
|
"learning_rate": 1.7556573112924135e-05, |
|
"loss": 0.4675, |
|
"num_tokens": 371657863.0, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.6655290102389079, |
|
"grad_norm": 0.31411324035322963, |
|
"learning_rate": 1.7442810630208446e-05, |
|
"loss": 0.4831, |
|
"num_tokens": 372630696.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6672354948805461, |
|
"grad_norm": 0.3217654495266495, |
|
"learning_rate": 1.7329368544769487e-05, |
|
"loss": 0.5029, |
|
"num_tokens": 373650740.0, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.6689419795221843, |
|
"grad_norm": 0.30398529228133475, |
|
"learning_rate": 1.721625047059265e-05, |
|
"loss": 0.4927, |
|
"num_tokens": 374628223.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.6706484641638225, |
|
"grad_norm": 0.3690454794887621, |
|
"learning_rate": 1.7103460011341084e-05, |
|
"loss": 0.4882, |
|
"num_tokens": 375573909.0, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.6723549488054608, |
|
"grad_norm": 0.32789625269764505, |
|
"learning_rate": 1.699100076024099e-05, |
|
"loss": 0.4697, |
|
"num_tokens": 376493989.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.674061433447099, |
|
"grad_norm": 0.3425335012653658, |
|
"learning_rate": 1.6878876299967018e-05, |
|
"loss": 0.4706, |
|
"num_tokens": 377479804.0, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.6757679180887372, |
|
"grad_norm": 0.3506912584928773, |
|
"learning_rate": 1.6767090202528268e-05, |
|
"loss": 0.4884, |
|
"num_tokens": 378392822.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.6774744027303754, |
|
"grad_norm": 0.3366127448224504, |
|
"learning_rate": 1.6655646029154402e-05, |
|
"loss": 0.4757, |
|
"num_tokens": 379328234.0, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.6791808873720137, |
|
"grad_norm": 0.33730121762767445, |
|
"learning_rate": 1.6544547330182234e-05, |
|
"loss": 0.4683, |
|
"num_tokens": 380308538.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.6808873720136519, |
|
"grad_norm": 0.3150426444385526, |
|
"learning_rate": 1.6433797644942633e-05, |
|
"loss": 0.4975, |
|
"num_tokens": 381210797.0, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 0.31875516644753304, |
|
"learning_rate": 1.63234005016477e-05, |
|
"loss": 0.4942, |
|
"num_tokens": 382166430.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6843003412969283, |
|
"grad_norm": 0.32666915132106794, |
|
"learning_rate": 1.6213359417278473e-05, |
|
"loss": 0.5085, |
|
"num_tokens": 383179056.0, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.6860068259385665, |
|
"grad_norm": 0.32712173146500084, |
|
"learning_rate": 1.6103677897472794e-05, |
|
"loss": 0.5003, |
|
"num_tokens": 384075218.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.6877133105802048, |
|
"grad_norm": 0.3287315519317715, |
|
"learning_rate": 1.599435943641368e-05, |
|
"loss": 0.4702, |
|
"num_tokens": 384999949.0, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.689419795221843, |
|
"grad_norm": 0.38670376122956074, |
|
"learning_rate": 1.5885407516717987e-05, |
|
"loss": 0.4908, |
|
"num_tokens": 385900887.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.6911262798634812, |
|
"grad_norm": 0.35663261838972243, |
|
"learning_rate": 1.577682560932547e-05, |
|
"loss": 0.4978, |
|
"num_tokens": 386870114.0, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.6928327645051194, |
|
"grad_norm": 0.42642527854196, |
|
"learning_rate": 1.566861717338819e-05, |
|
"loss": 0.4906, |
|
"num_tokens": 387782669.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.6945392491467577, |
|
"grad_norm": 0.37343767857022897, |
|
"learning_rate": 1.556078565616034e-05, |
|
"loss": 0.4902, |
|
"num_tokens": 388715961.0, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.6962457337883959, |
|
"grad_norm": 0.3284331766719929, |
|
"learning_rate": 1.5453334492888428e-05, |
|
"loss": 0.4776, |
|
"num_tokens": 389650899.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.6979522184300341, |
|
"grad_norm": 0.3483829695120365, |
|
"learning_rate": 1.5346267106701762e-05, |
|
"loss": 0.4836, |
|
"num_tokens": 390610942.0, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.6996587030716723, |
|
"grad_norm": 0.32022610770787235, |
|
"learning_rate": 1.5239586908503533e-05, |
|
"loss": 0.5172, |
|
"num_tokens": 391632321.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7013651877133106, |
|
"grad_norm": 0.3514714730074271, |
|
"learning_rate": 1.513329729686203e-05, |
|
"loss": 0.4854, |
|
"num_tokens": 392626976.0, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.7030716723549488, |
|
"grad_norm": 0.3413532016336787, |
|
"learning_rate": 1.502740165790244e-05, |
|
"loss": 0.4856, |
|
"num_tokens": 393493604.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.7047781569965871, |
|
"grad_norm": 0.3422857376051543, |
|
"learning_rate": 1.4921903365198914e-05, |
|
"loss": 0.5084, |
|
"num_tokens": 394371570.0, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.7064846416382252, |
|
"grad_norm": 0.3572958103887209, |
|
"learning_rate": 1.481680577966717e-05, |
|
"loss": 0.4963, |
|
"num_tokens": 395329185.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.7081911262798635, |
|
"grad_norm": 0.34758264074134787, |
|
"learning_rate": 1.471211224945736e-05, |
|
"loss": 0.4905, |
|
"num_tokens": 396285000.0, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.7098976109215017, |
|
"grad_norm": 0.3419972923917919, |
|
"learning_rate": 1.4607826109847458e-05, |
|
"loss": 0.5266, |
|
"num_tokens": 397224172.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.71160409556314, |
|
"grad_norm": 0.337893094968607, |
|
"learning_rate": 1.4503950683136936e-05, |
|
"loss": 0.4857, |
|
"num_tokens": 398210109.0, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.7133105802047781, |
|
"grad_norm": 0.33901468478209035, |
|
"learning_rate": 1.4400489278540985e-05, |
|
"loss": 0.4749, |
|
"num_tokens": 399158135.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.7150170648464164, |
|
"grad_norm": 0.3835956412834383, |
|
"learning_rate": 1.429744519208508e-05, |
|
"loss": 0.4936, |
|
"num_tokens": 400075133.0, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.7167235494880546, |
|
"grad_norm": 0.34331595173677726, |
|
"learning_rate": 1.4194821706499955e-05, |
|
"loss": 0.5031, |
|
"num_tokens": 400990040.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7184300341296929, |
|
"grad_norm": 0.3171937953485873, |
|
"learning_rate": 1.4092622091117041e-05, |
|
"loss": 0.4815, |
|
"num_tokens": 401912436.0, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.7201365187713311, |
|
"grad_norm": 0.3410195330730544, |
|
"learning_rate": 1.399084960176431e-05, |
|
"loss": 0.4741, |
|
"num_tokens": 402861165.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.7218430034129693, |
|
"grad_norm": 0.3465187152384934, |
|
"learning_rate": 1.3889507480662545e-05, |
|
"loss": 0.4913, |
|
"num_tokens": 403763990.0, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.7235494880546075, |
|
"grad_norm": 0.3292708548169215, |
|
"learning_rate": 1.3788598956322068e-05, |
|
"loss": 0.4858, |
|
"num_tokens": 404752387.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.7252559726962458, |
|
"grad_norm": 0.36695144477657526, |
|
"learning_rate": 1.3688127243439863e-05, |
|
"loss": 0.4838, |
|
"num_tokens": 405691554.0, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.726962457337884, |
|
"grad_norm": 0.3372811093945545, |
|
"learning_rate": 1.3588095542797186e-05, |
|
"loss": 0.4947, |
|
"num_tokens": 406680793.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.7286689419795221, |
|
"grad_norm": 0.3443503709289555, |
|
"learning_rate": 1.3488507041157584e-05, |
|
"loss": 0.4921, |
|
"num_tokens": 407683383.0, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.7303754266211604, |
|
"grad_norm": 0.3480092507121765, |
|
"learning_rate": 1.3389364911165375e-05, |
|
"loss": 0.4846, |
|
"num_tokens": 408650161.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.7320819112627986, |
|
"grad_norm": 0.3353926863066775, |
|
"learning_rate": 1.3290672311244584e-05, |
|
"loss": 0.5006, |
|
"num_tokens": 409549494.0, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.7337883959044369, |
|
"grad_norm": 0.37867503643842687, |
|
"learning_rate": 1.3192432385498305e-05, |
|
"loss": 0.4921, |
|
"num_tokens": 410510602.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.735494880546075, |
|
"grad_norm": 0.3592680929736747, |
|
"learning_rate": 1.3094648263608533e-05, |
|
"loss": 0.4981, |
|
"num_tokens": 411492905.0, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.7372013651877133, |
|
"grad_norm": 0.3322392394703493, |
|
"learning_rate": 1.299732306073652e-05, |
|
"loss": 0.487, |
|
"num_tokens": 412454003.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.7389078498293515, |
|
"grad_norm": 0.3756417144546959, |
|
"learning_rate": 1.2900459877423457e-05, |
|
"loss": 0.5106, |
|
"num_tokens": 413421190.0, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.7406143344709898, |
|
"grad_norm": 0.3276238813240706, |
|
"learning_rate": 1.2804061799491734e-05, |
|
"loss": 0.4945, |
|
"num_tokens": 414425737.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.742320819112628, |
|
"grad_norm": 0.29735177345606206, |
|
"learning_rate": 1.2708131897946621e-05, |
|
"loss": 0.478, |
|
"num_tokens": 415344538.0, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.7440273037542662, |
|
"grad_norm": 0.3125870087788144, |
|
"learning_rate": 1.261267322887845e-05, |
|
"loss": 0.5041, |
|
"num_tokens": 416440659.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.7457337883959044, |
|
"grad_norm": 0.32461771570892695, |
|
"learning_rate": 1.251768883336526e-05, |
|
"loss": 0.4919, |
|
"num_tokens": 417360385.0, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.7474402730375427, |
|
"grad_norm": 0.3442194296895731, |
|
"learning_rate": 1.2423181737375899e-05, |
|
"loss": 0.4836, |
|
"num_tokens": 418334906.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.7491467576791809, |
|
"grad_norm": 0.3446911329824469, |
|
"learning_rate": 1.2329154951673598e-05, |
|
"loss": 0.4646, |
|
"num_tokens": 419196059.0, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.7508532423208191, |
|
"grad_norm": 0.33767235227444237, |
|
"learning_rate": 1.2235611471720123e-05, |
|
"loss": 0.4856, |
|
"num_tokens": 420121223.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7525597269624573, |
|
"grad_norm": 0.31616744714729916, |
|
"learning_rate": 1.2142554277580288e-05, |
|
"loss": 0.4867, |
|
"num_tokens": 421062594.0, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.7542662116040956, |
|
"grad_norm": 0.3288211934138168, |
|
"learning_rate": 1.2049986333827048e-05, |
|
"loss": 0.4672, |
|
"num_tokens": 421975487.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.7559726962457338, |
|
"grad_norm": 0.3496778301287439, |
|
"learning_rate": 1.1957910589447043e-05, |
|
"loss": 0.4861, |
|
"num_tokens": 422820853.0, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.757679180887372, |
|
"grad_norm": 0.3475005774602, |
|
"learning_rate": 1.1866329977746656e-05, |
|
"loss": 0.4882, |
|
"num_tokens": 423755589.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.7593856655290102, |
|
"grad_norm": 0.35706393638603534, |
|
"learning_rate": 1.177524741625856e-05, |
|
"loss": 0.4887, |
|
"num_tokens": 424688821.0, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.7610921501706485, |
|
"grad_norm": 0.3461078636788691, |
|
"learning_rate": 1.1684665806648772e-05, |
|
"loss": 0.4684, |
|
"num_tokens": 425585640.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.7627986348122867, |
|
"grad_norm": 0.3356141138518873, |
|
"learning_rate": 1.1594588034624228e-05, |
|
"loss": 0.4813, |
|
"num_tokens": 426547476.0, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.764505119453925, |
|
"grad_norm": 0.3105226360165998, |
|
"learning_rate": 1.1505016969840823e-05, |
|
"loss": 0.4745, |
|
"num_tokens": 427476418.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.7662116040955631, |
|
"grad_norm": 0.32709156257769095, |
|
"learning_rate": 1.1415955465812023e-05, |
|
"loss": 0.4887, |
|
"num_tokens": 428405822.0, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.7679180887372014, |
|
"grad_norm": 0.30756496657170446, |
|
"learning_rate": 1.1327406359817933e-05, |
|
"loss": 0.4774, |
|
"num_tokens": 429400796.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.7696245733788396, |
|
"grad_norm": 0.3111754417183422, |
|
"learning_rate": 1.1239372472814927e-05, |
|
"loss": 0.4805, |
|
"num_tokens": 430392694.0, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.7713310580204779, |
|
"grad_norm": 0.34095902728307337, |
|
"learning_rate": 1.1151856609345774e-05, |
|
"loss": 0.4716, |
|
"num_tokens": 431359520.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.773037542662116, |
|
"grad_norm": 0.34936226176674845, |
|
"learning_rate": 1.1064861557450256e-05, |
|
"loss": 0.4894, |
|
"num_tokens": 432294915.0, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.7747440273037542, |
|
"grad_norm": 0.3155937127951743, |
|
"learning_rate": 1.0978390088576437e-05, |
|
"loss": 0.481, |
|
"num_tokens": 433284774.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.7764505119453925, |
|
"grad_norm": 0.3202827342005119, |
|
"learning_rate": 1.0892444957492276e-05, |
|
"loss": 0.4891, |
|
"num_tokens": 434284592.0, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.7781569965870307, |
|
"grad_norm": 0.3369162033659964, |
|
"learning_rate": 1.0807028902197925e-05, |
|
"loss": 0.4654, |
|
"num_tokens": 435149765.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.7798634812286689, |
|
"grad_norm": 0.3582446923094639, |
|
"learning_rate": 1.0722144643838461e-05, |
|
"loss": 0.4866, |
|
"num_tokens": 436158148.0, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.7815699658703071, |
|
"grad_norm": 0.3311676374288052, |
|
"learning_rate": 1.063779488661724e-05, |
|
"loss": 0.4776, |
|
"num_tokens": 437135437.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.7832764505119454, |
|
"grad_norm": 0.32325241567650026, |
|
"learning_rate": 1.0553982317709741e-05, |
|
"loss": 0.4654, |
|
"num_tokens": 438061307.0, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.7849829351535836, |
|
"grad_norm": 0.3043727173598455, |
|
"learning_rate": 1.047070960717793e-05, |
|
"loss": 0.4932, |
|
"num_tokens": 439079263.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.7866894197952219, |
|
"grad_norm": 0.3361341921228823, |
|
"learning_rate": 1.0387979407885198e-05, |
|
"loss": 0.506, |
|
"num_tokens": 440154096.0, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.78839590443686, |
|
"grad_norm": 0.31103435909845967, |
|
"learning_rate": 1.03057943554119e-05, |
|
"loss": 0.4848, |
|
"num_tokens": 441195758.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.7901023890784983, |
|
"grad_norm": 0.32529003714436683, |
|
"learning_rate": 1.022415706797133e-05, |
|
"loss": 0.4941, |
|
"num_tokens": 442194379.0, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.7918088737201365, |
|
"grad_norm": 0.34129914389197974, |
|
"learning_rate": 1.0143070146326347e-05, |
|
"loss": 0.4965, |
|
"num_tokens": 443118717.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.7935153583617748, |
|
"grad_norm": 0.34574447344569287, |
|
"learning_rate": 1.0062536173706519e-05, |
|
"loss": 0.4833, |
|
"num_tokens": 444049001.0, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.7952218430034129, |
|
"grad_norm": 0.39058472684776835, |
|
"learning_rate": 9.982557715725807e-06, |
|
"loss": 0.4855, |
|
"num_tokens": 444948197.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.7969283276450512, |
|
"grad_norm": 0.3223678497972808, |
|
"learning_rate": 9.903137320300852e-06, |
|
"loss": 0.4923, |
|
"num_tokens": 445993006.0, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.7986348122866894, |
|
"grad_norm": 0.3581110760327946, |
|
"learning_rate": 9.824277517569791e-06, |
|
"loss": 0.4714, |
|
"num_tokens": 446925677.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.8003412969283277, |
|
"grad_norm": 0.35940172044204516, |
|
"learning_rate": 9.745980819811668e-06, |
|
"loss": 0.4838, |
|
"num_tokens": 447799196.0, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.8020477815699659, |
|
"grad_norm": 0.34500268650073257, |
|
"learning_rate": 9.66824972136638e-06, |
|
"loss": 0.493, |
|
"num_tokens": 448739177.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8037542662116041, |
|
"grad_norm": 0.32399229631376597, |
|
"learning_rate": 9.59108669855523e-06, |
|
"loss": 0.5037, |
|
"num_tokens": 449751958.0, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.8054607508532423, |
|
"grad_norm": 0.29766759242151386, |
|
"learning_rate": 9.514494209602023e-06, |
|
"loss": 0.5071, |
|
"num_tokens": 450761568.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.8071672354948806, |
|
"grad_norm": 0.3430178270787923, |
|
"learning_rate": 9.438474694554775e-06, |
|
"loss": 0.4935, |
|
"num_tokens": 451740507.0, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.8088737201365188, |
|
"grad_norm": 0.3471410052900583, |
|
"learning_rate": 9.36303057520795e-06, |
|
"loss": 0.4713, |
|
"num_tokens": 452625293.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.810580204778157, |
|
"grad_norm": 0.3493958783029142, |
|
"learning_rate": 9.288164255025334e-06, |
|
"loss": 0.4823, |
|
"num_tokens": 453642012.0, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.8122866894197952, |
|
"grad_norm": 0.33853142356519106, |
|
"learning_rate": 9.21387811906344e-06, |
|
"loss": 0.4823, |
|
"num_tokens": 454558803.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.8139931740614335, |
|
"grad_norm": 0.3227168370953717, |
|
"learning_rate": 9.14017453389556e-06, |
|
"loss": 0.476, |
|
"num_tokens": 455523392.0, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.8156996587030717, |
|
"grad_norm": 0.29404894296430983, |
|
"learning_rate": 9.067055847536346e-06, |
|
"loss": 0.4596, |
|
"num_tokens": 456494011.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.8174061433447098, |
|
"grad_norm": 0.3308718804211504, |
|
"learning_rate": 8.994524389367001e-06, |
|
"loss": 0.4891, |
|
"num_tokens": 457401137.0, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.8191126279863481, |
|
"grad_norm": 0.3300071384128006, |
|
"learning_rate": 8.922582470061099e-06, |
|
"loss": 0.4961, |
|
"num_tokens": 458401399.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8208191126279863, |
|
"grad_norm": 0.44818729004437646, |
|
"learning_rate": 8.851232381510961e-06, |
|
"loss": 0.504, |
|
"num_tokens": 459365515.0, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.8225255972696246, |
|
"grad_norm": 0.3842623998501234, |
|
"learning_rate": 8.780476396754633e-06, |
|
"loss": 0.4931, |
|
"num_tokens": 460303546.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.8242320819112628, |
|
"grad_norm": 0.33418576004104283, |
|
"learning_rate": 8.710316769903471e-06, |
|
"loss": 0.4868, |
|
"num_tokens": 461172152.0, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.825938566552901, |
|
"grad_norm": 0.368646183750318, |
|
"learning_rate": 8.640755736070346e-06, |
|
"loss": 0.4579, |
|
"num_tokens": 462132037.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.8276450511945392, |
|
"grad_norm": 0.32508360537240183, |
|
"learning_rate": 8.571795511298423e-06, |
|
"loss": 0.4853, |
|
"num_tokens": 463170048.0, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.8293515358361775, |
|
"grad_norm": 0.3211106939852037, |
|
"learning_rate": 8.50343829249059e-06, |
|
"loss": 0.4593, |
|
"num_tokens": 464065062.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.8310580204778157, |
|
"grad_norm": 0.3053006101651597, |
|
"learning_rate": 8.435686257339417e-06, |
|
"loss": 0.4831, |
|
"num_tokens": 465056306.0, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.8327645051194539, |
|
"grad_norm": 0.31115117178982893, |
|
"learning_rate": 8.368541564257842e-06, |
|
"loss": 0.4907, |
|
"num_tokens": 466050672.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.8344709897610921, |
|
"grad_norm": 0.3397295723653259, |
|
"learning_rate": 8.302006352310369e-06, |
|
"loss": 0.4966, |
|
"num_tokens": 467046976.0, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.8361774744027304, |
|
"grad_norm": 0.3382052634070017, |
|
"learning_rate": 8.236082741144938e-06, |
|
"loss": 0.4638, |
|
"num_tokens": 468039326.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.8378839590443686, |
|
"grad_norm": 0.3514456973966274, |
|
"learning_rate": 8.170772830925389e-06, |
|
"loss": 0.4653, |
|
"num_tokens": 468922373.0, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.8395904436860068, |
|
"grad_norm": 0.30836591986791617, |
|
"learning_rate": 8.106078702264573e-06, |
|
"loss": 0.4829, |
|
"num_tokens": 469868923.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.841296928327645, |
|
"grad_norm": 0.33389876659214884, |
|
"learning_rate": 8.042002416158047e-06, |
|
"loss": 0.471, |
|
"num_tokens": 470752870.0, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.8430034129692833, |
|
"grad_norm": 0.34528058311355864, |
|
"learning_rate": 7.978546013918428e-06, |
|
"loss": 0.4806, |
|
"num_tokens": 471694644.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.8447098976109215, |
|
"grad_norm": 0.30449852181321135, |
|
"learning_rate": 7.915711517110365e-06, |
|
"loss": 0.4726, |
|
"num_tokens": 472652423.0, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.8464163822525598, |
|
"grad_norm": 0.3010535970843912, |
|
"learning_rate": 7.853500927486129e-06, |
|
"loss": 0.4734, |
|
"num_tokens": 473648633.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.8481228668941979, |
|
"grad_norm": 0.319770912075749, |
|
"learning_rate": 7.791916226921844e-06, |
|
"loss": 0.493, |
|
"num_tokens": 474686021.0, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.8498293515358362, |
|
"grad_norm": 0.3101950532253656, |
|
"learning_rate": 7.730959377354354e-06, |
|
"loss": 0.4811, |
|
"num_tokens": 475597050.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.8515358361774744, |
|
"grad_norm": 0.336474137392747, |
|
"learning_rate": 7.670632320718714e-06, |
|
"loss": 0.4985, |
|
"num_tokens": 476480863.0, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.8532423208191127, |
|
"grad_norm": 0.33239250078197224, |
|
"learning_rate": 7.610936978886332e-06, |
|
"loss": 0.4889, |
|
"num_tokens": 477480036.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8549488054607508, |
|
"grad_norm": 0.31906385732968195, |
|
"learning_rate": 7.551875253603726e-06, |
|
"loss": 0.4913, |
|
"num_tokens": 478441727.0, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.856655290102389, |
|
"grad_norm": 0.2972232303527529, |
|
"learning_rate": 7.493449026431963e-06, |
|
"loss": 0.4846, |
|
"num_tokens": 479450987.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.8583617747440273, |
|
"grad_norm": 0.34482331352077483, |
|
"learning_rate": 7.4356601586867094e-06, |
|
"loss": 0.4872, |
|
"num_tokens": 480480087.0, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.8600682593856656, |
|
"grad_norm": 0.3286565488200801, |
|
"learning_rate": 7.3785104913789284e-06, |
|
"loss": 0.493, |
|
"num_tokens": 481428209.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.8617747440273038, |
|
"grad_norm": 0.3202404807748421, |
|
"learning_rate": 7.322001845156215e-06, |
|
"loss": 0.4634, |
|
"num_tokens": 482333846.0, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.863481228668942, |
|
"grad_norm": 0.3361830515481688, |
|
"learning_rate": 7.2661360202448344e-06, |
|
"loss": 0.4904, |
|
"num_tokens": 483299671.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.8651877133105802, |
|
"grad_norm": 0.3177624111695824, |
|
"learning_rate": 7.2109147963923335e-06, |
|
"loss": 0.4988, |
|
"num_tokens": 484313085.0, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.8668941979522184, |
|
"grad_norm": 0.32624228168571706, |
|
"learning_rate": 7.156339932810871e-06, |
|
"loss": 0.4968, |
|
"num_tokens": 485251856.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.8686006825938567, |
|
"grad_norm": 0.2897757010993598, |
|
"learning_rate": 7.1024131681211455e-06, |
|
"loss": 0.4779, |
|
"num_tokens": 486246381.0, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.8703071672354948, |
|
"grad_norm": 0.3120988592440139, |
|
"learning_rate": 7.0491362202970295e-06, |
|
"loss": 0.4712, |
|
"num_tokens": 487198446.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8720136518771331, |
|
"grad_norm": 0.2815904159190546, |
|
"learning_rate": 6.9965107866108274e-06, |
|
"loss": 0.4722, |
|
"num_tokens": 488156403.0, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.8737201365187713, |
|
"grad_norm": 0.3273103504291178, |
|
"learning_rate": 6.9445385435792095e-06, |
|
"loss": 0.4695, |
|
"num_tokens": 489140124.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.8754266211604096, |
|
"grad_norm": 0.3289263665204276, |
|
"learning_rate": 6.893221146909806e-06, |
|
"loss": 0.4724, |
|
"num_tokens": 490104565.0, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.8771331058020477, |
|
"grad_norm": 0.3146346775894274, |
|
"learning_rate": 6.84256023144845e-06, |
|
"loss": 0.4762, |
|
"num_tokens": 491054868.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.878839590443686, |
|
"grad_norm": 0.29961414202911324, |
|
"learning_rate": 6.792557411127099e-06, |
|
"loss": 0.4704, |
|
"num_tokens": 492078546.0, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.8805460750853242, |
|
"grad_norm": 0.34594580055469915, |
|
"learning_rate": 6.74321427891242e-06, |
|
"loss": 0.4851, |
|
"num_tokens": 492974236.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.8822525597269625, |
|
"grad_norm": 0.3066722562434767, |
|
"learning_rate": 6.694532406755053e-06, |
|
"loss": 0.478, |
|
"num_tokens": 494019470.0, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.8839590443686007, |
|
"grad_norm": 0.2923055504113335, |
|
"learning_rate": 6.646513345539509e-06, |
|
"loss": 0.516, |
|
"num_tokens": 495062198.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.8856655290102389, |
|
"grad_norm": 0.2953358109985866, |
|
"learning_rate": 6.59915862503478e-06, |
|
"loss": 0.4668, |
|
"num_tokens": 496039347.0, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.8873720136518771, |
|
"grad_norm": 0.33489720113078275, |
|
"learning_rate": 6.552469753845601e-06, |
|
"loss": 0.4715, |
|
"num_tokens": 496987511.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.8890784982935154, |
|
"grad_norm": 0.35147052248086963, |
|
"learning_rate": 6.506448219364389e-06, |
|
"loss": 0.4952, |
|
"num_tokens": 497953501.0, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.8907849829351536, |
|
"grad_norm": 0.2976052092258336, |
|
"learning_rate": 6.461095487723852e-06, |
|
"loss": 0.4703, |
|
"num_tokens": 498971917.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.8924914675767918, |
|
"grad_norm": 0.3166781793747034, |
|
"learning_rate": 6.416413003750289e-06, |
|
"loss": 0.4765, |
|
"num_tokens": 499959465.0, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.89419795221843, |
|
"grad_norm": 0.3094928641877226, |
|
"learning_rate": 6.3724021909175636e-06, |
|
"loss": 0.4714, |
|
"num_tokens": 500947010.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.8959044368600683, |
|
"grad_norm": 0.3403135875166433, |
|
"learning_rate": 6.3290644513017496e-06, |
|
"loss": 0.4838, |
|
"num_tokens": 501972930.0, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.8976109215017065, |
|
"grad_norm": 0.35528577423877566, |
|
"learning_rate": 6.286401165536466e-06, |
|
"loss": 0.4974, |
|
"num_tokens": 502958987.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.8993174061433447, |
|
"grad_norm": 0.3318258094487176, |
|
"learning_rate": 6.244413692768893e-06, |
|
"loss": 0.4767, |
|
"num_tokens": 503946765.0, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.9010238907849829, |
|
"grad_norm": 0.30577807197719564, |
|
"learning_rate": 6.2031033706164715e-06, |
|
"loss": 0.471, |
|
"num_tokens": 504893463.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.9027303754266212, |
|
"grad_norm": 0.3384112726283077, |
|
"learning_rate": 6.162471515124292e-06, |
|
"loss": 0.481, |
|
"num_tokens": 505899175.0, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.9044368600682594, |
|
"grad_norm": 0.30699959739486854, |
|
"learning_rate": 6.122519420723182e-06, |
|
"loss": 0.4733, |
|
"num_tokens": 506861395.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9061433447098977, |
|
"grad_norm": 0.3222784803973646, |
|
"learning_rate": 6.083248360188437e-06, |
|
"loss": 0.4825, |
|
"num_tokens": 507817589.0, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.9078498293515358, |
|
"grad_norm": 0.3403142746810214, |
|
"learning_rate": 6.044659584599297e-06, |
|
"loss": 0.4761, |
|
"num_tokens": 508690960.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.909556313993174, |
|
"grad_norm": 0.2953479643563658, |
|
"learning_rate": 6.006754323299088e-06, |
|
"loss": 0.4804, |
|
"num_tokens": 509649233.0, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.9112627986348123, |
|
"grad_norm": 0.30487020552378524, |
|
"learning_rate": 5.969533783856054e-06, |
|
"loss": 0.4777, |
|
"num_tokens": 510627944.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.9129692832764505, |
|
"grad_norm": 0.30704549371766876, |
|
"learning_rate": 5.932999152024885e-06, |
|
"loss": 0.4822, |
|
"num_tokens": 511591407.0, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.9146757679180887, |
|
"grad_norm": 0.33249148261118033, |
|
"learning_rate": 5.897151591708947e-06, |
|
"loss": 0.5016, |
|
"num_tokens": 512558436.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.9163822525597269, |
|
"grad_norm": 0.3229296450202934, |
|
"learning_rate": 5.861992244923199e-06, |
|
"loss": 0.4735, |
|
"num_tokens": 513474763.0, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.9180887372013652, |
|
"grad_norm": 0.31994250178502454, |
|
"learning_rate": 5.827522231757808e-06, |
|
"loss": 0.4609, |
|
"num_tokens": 514407245.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.9197952218430034, |
|
"grad_norm": 0.30727719477408283, |
|
"learning_rate": 5.793742650342482e-06, |
|
"loss": 0.4611, |
|
"num_tokens": 515337057.0, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.9215017064846417, |
|
"grad_norm": 0.3422479555475027, |
|
"learning_rate": 5.760654576811455e-06, |
|
"loss": 0.5085, |
|
"num_tokens": 516301089.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.9232081911262798, |
|
"grad_norm": 0.32317795820730566, |
|
"learning_rate": 5.728259065269248e-06, |
|
"loss": 0.4808, |
|
"num_tokens": 517258131.0, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.9249146757679181, |
|
"grad_norm": 0.31035185685822675, |
|
"learning_rate": 5.696557147757041e-06, |
|
"loss": 0.4989, |
|
"num_tokens": 518223298.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.9266211604095563, |
|
"grad_norm": 0.32127700006025317, |
|
"learning_rate": 5.66554983421983e-06, |
|
"loss": 0.4721, |
|
"num_tokens": 519130985.0, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.9283276450511946, |
|
"grad_norm": 0.3248191734813711, |
|
"learning_rate": 5.635238112474237e-06, |
|
"loss": 0.4878, |
|
"num_tokens": 520051962.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.9300341296928327, |
|
"grad_norm": 0.32016262129429485, |
|
"learning_rate": 5.605622948177032e-06, |
|
"loss": 0.4612, |
|
"num_tokens": 520934447.0, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.931740614334471, |
|
"grad_norm": 0.3075215750119489, |
|
"learning_rate": 5.576705284794404e-06, |
|
"loss": 0.4717, |
|
"num_tokens": 521910187.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.9334470989761092, |
|
"grad_norm": 0.2966943509184171, |
|
"learning_rate": 5.548486043571861e-06, |
|
"loss": 0.4615, |
|
"num_tokens": 522876883.0, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.9351535836177475, |
|
"grad_norm": 0.3823699167375232, |
|
"learning_rate": 5.52096612350491e-06, |
|
"loss": 0.4899, |
|
"num_tokens": 523880084.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.9368600682593856, |
|
"grad_norm": 0.3445674466297448, |
|
"learning_rate": 5.494146401310404e-06, |
|
"loss": 0.4792, |
|
"num_tokens": 524788350.0, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.9385665529010239, |
|
"grad_norm": 0.29579943008870063, |
|
"learning_rate": 5.468027731398621e-06, |
|
"loss": 0.4863, |
|
"num_tokens": 525832920.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.9402730375426621, |
|
"grad_norm": 0.30558147446901934, |
|
"learning_rate": 5.442610945846045e-06, |
|
"loss": 0.4943, |
|
"num_tokens": 526845340.0, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.9419795221843004, |
|
"grad_norm": 0.32100249042860457, |
|
"learning_rate": 5.41789685436884e-06, |
|
"loss": 0.4788, |
|
"num_tokens": 527753736.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.9436860068259386, |
|
"grad_norm": 0.32004490139915764, |
|
"learning_rate": 5.393886244297079e-06, |
|
"loss": 0.4817, |
|
"num_tokens": 528798665.0, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.9453924914675768, |
|
"grad_norm": 0.3325347960685324, |
|
"learning_rate": 5.370579880549647e-06, |
|
"loss": 0.4878, |
|
"num_tokens": 529711197.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.947098976109215, |
|
"grad_norm": 0.3303511181497015, |
|
"learning_rate": 5.347978505609877e-06, |
|
"loss": 0.4693, |
|
"num_tokens": 530632318.0, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.9488054607508533, |
|
"grad_norm": 0.3044920349269044, |
|
"learning_rate": 5.326082839501891e-06, |
|
"loss": 0.4881, |
|
"num_tokens": 531553862.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.9505119453924915, |
|
"grad_norm": 0.30326530639070487, |
|
"learning_rate": 5.304893579767674e-06, |
|
"loss": 0.4935, |
|
"num_tokens": 532532990.0, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.9522184300341296, |
|
"grad_norm": 0.31251416953382594, |
|
"learning_rate": 5.284411401444836e-06, |
|
"loss": 0.4933, |
|
"num_tokens": 533448215.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.9539249146757679, |
|
"grad_norm": 0.32753454088259387, |
|
"learning_rate": 5.264636957045122e-06, |
|
"loss": 0.4824, |
|
"num_tokens": 534401740.0, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.9556313993174061, |
|
"grad_norm": 0.30791599389681806, |
|
"learning_rate": 5.245570876533615e-06, |
|
"loss": 0.4685, |
|
"num_tokens": 535341346.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.9573378839590444, |
|
"grad_norm": 0.313479264368028, |
|
"learning_rate": 5.227213767308668e-06, |
|
"loss": 0.4575, |
|
"num_tokens": 536296941.0, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.9590443686006825, |
|
"grad_norm": 0.29032270836531787, |
|
"learning_rate": 5.209566214182558e-06, |
|
"loss": 0.4742, |
|
"num_tokens": 537336227.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.9607508532423208, |
|
"grad_norm": 0.30878749649756715, |
|
"learning_rate": 5.1926287793628515e-06, |
|
"loss": 0.4843, |
|
"num_tokens": 538331668.0, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.962457337883959, |
|
"grad_norm": 0.3144044223958235, |
|
"learning_rate": 5.176402002434495e-06, |
|
"loss": 0.4596, |
|
"num_tokens": 539226192.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.9641638225255973, |
|
"grad_norm": 0.33929109387452877, |
|
"learning_rate": 5.1608864003426255e-06, |
|
"loss": 0.4783, |
|
"num_tokens": 540188216.0, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.9658703071672355, |
|
"grad_norm": 0.3147575010798021, |
|
"learning_rate": 5.146082467376103e-06, |
|
"loss": 0.4742, |
|
"num_tokens": 541043880.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.9675767918088737, |
|
"grad_norm": 0.29692914861077196, |
|
"learning_rate": 5.131990675151757e-06, |
|
"loss": 0.4915, |
|
"num_tokens": 541979505.0, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.9692832764505119, |
|
"grad_norm": 0.3233760089537912, |
|
"learning_rate": 5.1186114725993754e-06, |
|
"loss": 0.4741, |
|
"num_tokens": 542913487.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.9709897610921502, |
|
"grad_norm": 0.3243453736091082, |
|
"learning_rate": 5.105945285947394e-06, |
|
"loss": 0.4856, |
|
"num_tokens": 543888926.0, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.9726962457337884, |
|
"grad_norm": 0.34332556463465, |
|
"learning_rate": 5.09399251870931e-06, |
|
"loss": 0.5042, |
|
"num_tokens": 544851416.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9744027303754266, |
|
"grad_norm": 0.35805837088581816, |
|
"learning_rate": 5.082753551670843e-06, |
|
"loss": 0.4832, |
|
"num_tokens": 545757943.0, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.9761092150170648, |
|
"grad_norm": 0.3330936930767059, |
|
"learning_rate": 5.072228742877796e-06, |
|
"loss": 0.4861, |
|
"num_tokens": 546668069.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.9778156996587031, |
|
"grad_norm": 0.2996095611574381, |
|
"learning_rate": 5.062418427624646e-06, |
|
"loss": 0.4706, |
|
"num_tokens": 547620964.0, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.9795221843003413, |
|
"grad_norm": 0.3334623473830669, |
|
"learning_rate": 5.053322918443873e-06, |
|
"loss": 0.4815, |
|
"num_tokens": 548580998.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.9812286689419796, |
|
"grad_norm": 0.3134805266587632, |
|
"learning_rate": 5.0449425050959876e-06, |
|
"loss": 0.49, |
|
"num_tokens": 549511852.0, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.9829351535836177, |
|
"grad_norm": 0.3309875762519481, |
|
"learning_rate": 5.0372774545603155e-06, |
|
"loss": 0.4617, |
|
"num_tokens": 550468465.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.984641638225256, |
|
"grad_norm": 0.3215047597552604, |
|
"learning_rate": 5.0303280110264825e-06, |
|
"loss": 0.4681, |
|
"num_tokens": 551376796.0, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.9863481228668942, |
|
"grad_norm": 0.45537977132918783, |
|
"learning_rate": 5.02409439588664e-06, |
|
"loss": 0.4914, |
|
"num_tokens": 552397091.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.9880546075085325, |
|
"grad_norm": 0.30892367299813056, |
|
"learning_rate": 5.018576807728409e-06, |
|
"loss": 0.4632, |
|
"num_tokens": 553378344.0, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.9897610921501706, |
|
"grad_norm": 0.32443914024425874, |
|
"learning_rate": 5.013775422328553e-06, |
|
"loss": 0.466, |
|
"num_tokens": 554315309.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.9914675767918089, |
|
"grad_norm": 0.3310428027199226, |
|
"learning_rate": 5.0096903926473885e-06, |
|
"loss": 0.4724, |
|
"num_tokens": 555217376.0, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.9931740614334471, |
|
"grad_norm": 0.3211087070464618, |
|
"learning_rate": 5.00632184882389e-06, |
|
"loss": 0.4991, |
|
"num_tokens": 556147373.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.9948805460750854, |
|
"grad_norm": 0.3344876290335765, |
|
"learning_rate": 5.00366989817157e-06, |
|
"loss": 0.4827, |
|
"num_tokens": 557076223.0, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.9965870307167235, |
|
"grad_norm": 0.32216989394760803, |
|
"learning_rate": 5.0017346251750415e-06, |
|
"loss": 0.4646, |
|
"num_tokens": 558093284.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.9982935153583617, |
|
"grad_norm": 0.3263881006745613, |
|
"learning_rate": 5.000516091487337e-06, |
|
"loss": 0.4751, |
|
"num_tokens": 558997916.0, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.3026241946213063, |
|
"learning_rate": 5.00001433592793e-06, |
|
"loss": 0.4825, |
|
"num_tokens": 560009809.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 2930, |
|
"total_flos": 1146719603261440.0, |
|
"train_loss": 0.5200651722149637, |
|
"train_runtime": 20826.6168, |
|
"train_samples_per_second": 4.501, |
|
"train_steps_per_second": 0.141 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2930, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1146719603261440.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |