|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 4395, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0034129692832764505, |
|
"grad_norm": 2.9234242474780587, |
|
"learning_rate": 9.09090909090909e-07, |
|
"loss": 0.8721, |
|
"num_tokens": 1933925.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006825938566552901, |
|
"grad_norm": 1.8660906811263716, |
|
"learning_rate": 2.0454545454545457e-06, |
|
"loss": 0.8356, |
|
"num_tokens": 3759146.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.010238907849829351, |
|
"grad_norm": 1.6011343042226884, |
|
"learning_rate": 3.1818181818181817e-06, |
|
"loss": 0.8325, |
|
"num_tokens": 5644524.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.013651877133105802, |
|
"grad_norm": 1.2454863399184872, |
|
"learning_rate": 4.3181818181818185e-06, |
|
"loss": 0.7876, |
|
"num_tokens": 7668808.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.017064846416382253, |
|
"grad_norm": 0.7868446109328924, |
|
"learning_rate": 5.4545454545454545e-06, |
|
"loss": 0.7459, |
|
"num_tokens": 9513010.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.020477815699658702, |
|
"grad_norm": 0.5924484737157731, |
|
"learning_rate": 6.59090909090909e-06, |
|
"loss": 0.6947, |
|
"num_tokens": 11368873.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.023890784982935155, |
|
"grad_norm": 0.5604182612081587, |
|
"learning_rate": 7.727272727272727e-06, |
|
"loss": 0.6899, |
|
"num_tokens": 13219424.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.027303754266211604, |
|
"grad_norm": 0.4895549616398094, |
|
"learning_rate": 8.863636363636365e-06, |
|
"loss": 0.661, |
|
"num_tokens": 15118063.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.030716723549488054, |
|
"grad_norm": 0.42034621767587177, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6604, |
|
"num_tokens": 17042190.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.034129692832764506, |
|
"grad_norm": 0.4180115353544924, |
|
"learning_rate": 1.1136363636363637e-05, |
|
"loss": 0.6408, |
|
"num_tokens": 18906839.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03754266211604096, |
|
"grad_norm": 0.38973426726946303, |
|
"learning_rate": 1.2272727272727273e-05, |
|
"loss": 0.626, |
|
"num_tokens": 20791105.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.040955631399317405, |
|
"grad_norm": 0.3729163992496813, |
|
"learning_rate": 1.340909090909091e-05, |
|
"loss": 0.6319, |
|
"num_tokens": 22641755.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04436860068259386, |
|
"grad_norm": 0.43970034712774936, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 0.6308, |
|
"num_tokens": 24604914.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04778156996587031, |
|
"grad_norm": 0.391042984403886, |
|
"learning_rate": 1.5681818181818182e-05, |
|
"loss": 0.6221, |
|
"num_tokens": 26636629.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.051194539249146756, |
|
"grad_norm": 0.36147517581553934, |
|
"learning_rate": 1.6818181818181818e-05, |
|
"loss": 0.5868, |
|
"num_tokens": 28441260.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.05460750853242321, |
|
"grad_norm": 0.4193365439715771, |
|
"learning_rate": 1.7954545454545454e-05, |
|
"loss": 0.6088, |
|
"num_tokens": 30417967.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.05802047781569966, |
|
"grad_norm": 0.4171218491287503, |
|
"learning_rate": 1.9090909090909094e-05, |
|
"loss": 0.6029, |
|
"num_tokens": 32351323.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06143344709897611, |
|
"grad_norm": 0.39471843177954063, |
|
"learning_rate": 2.022727272727273e-05, |
|
"loss": 0.6014, |
|
"num_tokens": 34231333.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06484641638225255, |
|
"grad_norm": 0.43024779590710627, |
|
"learning_rate": 2.1363636363636362e-05, |
|
"loss": 0.5977, |
|
"num_tokens": 36068371.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06825938566552901, |
|
"grad_norm": 0.46974196929259243, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.609, |
|
"num_tokens": 37961424.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07167235494880546, |
|
"grad_norm": 0.49424283775259037, |
|
"learning_rate": 2.3636363636363637e-05, |
|
"loss": 0.6079, |
|
"num_tokens": 39846738.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.07508532423208192, |
|
"grad_norm": 0.44091065089968234, |
|
"learning_rate": 2.4772727272727277e-05, |
|
"loss": 0.6013, |
|
"num_tokens": 41826860.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.07849829351535836, |
|
"grad_norm": 0.4784434654985161, |
|
"learning_rate": 2.590909090909091e-05, |
|
"loss": 0.596, |
|
"num_tokens": 43675049.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08191126279863481, |
|
"grad_norm": 0.5357275694123541, |
|
"learning_rate": 2.7045454545454545e-05, |
|
"loss": 0.5887, |
|
"num_tokens": 45543403.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.08532423208191127, |
|
"grad_norm": 0.49911922995763447, |
|
"learning_rate": 2.818181818181818e-05, |
|
"loss": 0.6015, |
|
"num_tokens": 47466657.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.08873720136518772, |
|
"grad_norm": 0.5036954772376355, |
|
"learning_rate": 2.9318181818181817e-05, |
|
"loss": 0.5923, |
|
"num_tokens": 49369486.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09215017064846416, |
|
"grad_norm": 0.4507843677307472, |
|
"learning_rate": 3.0454545454545456e-05, |
|
"loss": 0.5874, |
|
"num_tokens": 51193029.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.09556313993174062, |
|
"grad_norm": 0.4181777811200571, |
|
"learning_rate": 3.159090909090909e-05, |
|
"loss": 0.5859, |
|
"num_tokens": 53010874.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.09897610921501707, |
|
"grad_norm": 0.4591933150676979, |
|
"learning_rate": 3.272727272727273e-05, |
|
"loss": 0.5793, |
|
"num_tokens": 54919625.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.10238907849829351, |
|
"grad_norm": 0.41064630834685967, |
|
"learning_rate": 3.3863636363636364e-05, |
|
"loss": 0.5984, |
|
"num_tokens": 56909889.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.10580204778156997, |
|
"grad_norm": 0.4595803439510107, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.5686, |
|
"num_tokens": 58759721.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.10921501706484642, |
|
"grad_norm": 0.47432883262205655, |
|
"learning_rate": 3.613636363636364e-05, |
|
"loss": 0.576, |
|
"num_tokens": 60650570.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.11262798634812286, |
|
"grad_norm": 0.4476702414488316, |
|
"learning_rate": 3.7272727272727276e-05, |
|
"loss": 0.56, |
|
"num_tokens": 62662896.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.11604095563139932, |
|
"grad_norm": 0.4882186117934918, |
|
"learning_rate": 3.840909090909091e-05, |
|
"loss": 0.5721, |
|
"num_tokens": 64564660.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.11945392491467577, |
|
"grad_norm": 0.4419362623733834, |
|
"learning_rate": 3.954545454545455e-05, |
|
"loss": 0.5772, |
|
"num_tokens": 66485190.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.12286689419795221, |
|
"grad_norm": 0.5206425917443764, |
|
"learning_rate": 4.068181818181818e-05, |
|
"loss": 0.5914, |
|
"num_tokens": 68426882.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.12627986348122866, |
|
"grad_norm": 0.4594589383240807, |
|
"learning_rate": 4.181818181818182e-05, |
|
"loss": 0.5807, |
|
"num_tokens": 70332714.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.1296928327645051, |
|
"grad_norm": 0.6185767143477556, |
|
"learning_rate": 4.295454545454546e-05, |
|
"loss": 0.5801, |
|
"num_tokens": 72252819.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.13310580204778158, |
|
"grad_norm": 0.5441186147701085, |
|
"learning_rate": 4.409090909090909e-05, |
|
"loss": 0.5543, |
|
"num_tokens": 74221358.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.13651877133105803, |
|
"grad_norm": 0.5964199268078774, |
|
"learning_rate": 4.522727272727273e-05, |
|
"loss": 0.5801, |
|
"num_tokens": 76160914.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.13993174061433447, |
|
"grad_norm": 0.5161872417575659, |
|
"learning_rate": 4.636363636363636e-05, |
|
"loss": 0.568, |
|
"num_tokens": 78170639.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.14334470989761092, |
|
"grad_norm": 0.539803223391811, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.5717, |
|
"num_tokens": 80152955.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.14675767918088736, |
|
"grad_norm": 0.49826016397215506, |
|
"learning_rate": 4.863636363636364e-05, |
|
"loss": 0.5699, |
|
"num_tokens": 81973764.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.15017064846416384, |
|
"grad_norm": 0.4478810201640095, |
|
"learning_rate": 4.9772727272727275e-05, |
|
"loss": 0.5778, |
|
"num_tokens": 83901702.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.15358361774744028, |
|
"grad_norm": 0.47960005318311366, |
|
"learning_rate": 4.999989808010608e-05, |
|
"loss": 0.5527, |
|
"num_tokens": 85675897.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.15699658703071673, |
|
"grad_norm": 0.4154208311619921, |
|
"learning_rate": 4.999948403211952e-05, |
|
"loss": 0.5934, |
|
"num_tokens": 87691018.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.16040955631399317, |
|
"grad_norm": 0.5045441094237715, |
|
"learning_rate": 4.9998751491903514e-05, |
|
"loss": 0.5624, |
|
"num_tokens": 89579835.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.16382252559726962, |
|
"grad_norm": 0.4828789457579032, |
|
"learning_rate": 4.999770046982755e-05, |
|
"loss": 0.561, |
|
"num_tokens": 91542428.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.16723549488054607, |
|
"grad_norm": 0.4694103846101932, |
|
"learning_rate": 4.999633098076944e-05, |
|
"loss": 0.5619, |
|
"num_tokens": 93492833.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.17064846416382254, |
|
"grad_norm": 0.45281850476362134, |
|
"learning_rate": 4.999464304411499e-05, |
|
"loss": 0.5762, |
|
"num_tokens": 95438170.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.17406143344709898, |
|
"grad_norm": 0.45563179331695774, |
|
"learning_rate": 4.999263668375788e-05, |
|
"loss": 0.5677, |
|
"num_tokens": 97303144.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.17747440273037543, |
|
"grad_norm": 0.594320538310398, |
|
"learning_rate": 4.999031192809919e-05, |
|
"loss": 0.5609, |
|
"num_tokens": 99383692.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.18088737201365188, |
|
"grad_norm": 0.4877418400224137, |
|
"learning_rate": 4.998766881004709e-05, |
|
"loss": 0.5661, |
|
"num_tokens": 101357212.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.18430034129692832, |
|
"grad_norm": 0.5047110175224094, |
|
"learning_rate": 4.998470736701634e-05, |
|
"loss": 0.5607, |
|
"num_tokens": 103223537.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.18771331058020477, |
|
"grad_norm": 0.42926499615516883, |
|
"learning_rate": 4.9981427640927737e-05, |
|
"loss": 0.5712, |
|
"num_tokens": 105044617.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.19112627986348124, |
|
"grad_norm": 0.47060617173242353, |
|
"learning_rate": 4.9977829678207565e-05, |
|
"loss": 0.5799, |
|
"num_tokens": 106901687.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1945392491467577, |
|
"grad_norm": 0.3918775245841588, |
|
"learning_rate": 4.99739135297869e-05, |
|
"loss": 0.5348, |
|
"num_tokens": 108801293.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.19795221843003413, |
|
"grad_norm": 0.4423073930617194, |
|
"learning_rate": 4.996967925110093e-05, |
|
"loss": 0.5586, |
|
"num_tokens": 110840758.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.20136518771331058, |
|
"grad_norm": 0.40496989215921997, |
|
"learning_rate": 4.996512690208813e-05, |
|
"loss": 0.5611, |
|
"num_tokens": 112713541.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.20477815699658702, |
|
"grad_norm": 0.4624172582770661, |
|
"learning_rate": 4.996025654718942e-05, |
|
"loss": 0.5616, |
|
"num_tokens": 114521504.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.20819112627986347, |
|
"grad_norm": 0.4157666838153778, |
|
"learning_rate": 4.99550682553473e-05, |
|
"loss": 0.5464, |
|
"num_tokens": 116445027.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.21160409556313994, |
|
"grad_norm": 0.41867883871171135, |
|
"learning_rate": 4.994956210000481e-05, |
|
"loss": 0.546, |
|
"num_tokens": 118445759.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2150170648464164, |
|
"grad_norm": 0.44173080298578826, |
|
"learning_rate": 4.994373815910452e-05, |
|
"loss": 0.5529, |
|
"num_tokens": 120303683.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.21843003412969283, |
|
"grad_norm": 0.4355421923299714, |
|
"learning_rate": 4.9937596515087434e-05, |
|
"loss": 0.5571, |
|
"num_tokens": 122311693.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.22184300341296928, |
|
"grad_norm": 0.36598314246434627, |
|
"learning_rate": 4.993113725489179e-05, |
|
"loss": 0.559, |
|
"num_tokens": 124240526.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.22525597269624573, |
|
"grad_norm": 0.35614304303048105, |
|
"learning_rate": 4.9924360469951894e-05, |
|
"loss": 0.5545, |
|
"num_tokens": 126094524.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.22866894197952217, |
|
"grad_norm": 0.3812958674545772, |
|
"learning_rate": 4.991726625619675e-05, |
|
"loss": 0.5377, |
|
"num_tokens": 128075174.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.23208191126279865, |
|
"grad_norm": 0.39183276636151587, |
|
"learning_rate": 4.990985471404874e-05, |
|
"loss": 0.5626, |
|
"num_tokens": 129971056.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.2354948805460751, |
|
"grad_norm": 0.4268019296022907, |
|
"learning_rate": 4.990212594842222e-05, |
|
"loss": 0.5613, |
|
"num_tokens": 131847305.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.23890784982935154, |
|
"grad_norm": 0.4201099497432233, |
|
"learning_rate": 4.989408006872199e-05, |
|
"loss": 0.5692, |
|
"num_tokens": 133765982.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.24232081911262798, |
|
"grad_norm": 0.3970069501554659, |
|
"learning_rate": 4.98857171888418e-05, |
|
"loss": 0.541, |
|
"num_tokens": 135541389.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.24573378839590443, |
|
"grad_norm": 0.43166762110766976, |
|
"learning_rate": 4.9877037427162664e-05, |
|
"loss": 0.5498, |
|
"num_tokens": 137522281.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.24914675767918087, |
|
"grad_norm": 0.391722053079388, |
|
"learning_rate": 4.986804090655126e-05, |
|
"loss": 0.5384, |
|
"num_tokens": 139412841.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2525597269624573, |
|
"grad_norm": 0.4313722584733286, |
|
"learning_rate": 4.9858727754358156e-05, |
|
"loss": 0.5415, |
|
"num_tokens": 141196084.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.25597269624573377, |
|
"grad_norm": 0.3759279885998296, |
|
"learning_rate": 4.984909810241598e-05, |
|
"loss": 0.5622, |
|
"num_tokens": 143277757.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2593856655290102, |
|
"grad_norm": 0.4247856463406985, |
|
"learning_rate": 4.9839152087037634e-05, |
|
"loss": 0.54, |
|
"num_tokens": 145136704.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2627986348122867, |
|
"grad_norm": 0.40601821570497126, |
|
"learning_rate": 4.982888984901427e-05, |
|
"loss": 0.5305, |
|
"num_tokens": 147007396.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.26621160409556316, |
|
"grad_norm": 0.4310619885316953, |
|
"learning_rate": 4.9818311533613356e-05, |
|
"loss": 0.5472, |
|
"num_tokens": 148940122.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2696245733788396, |
|
"grad_norm": 0.45012436714415344, |
|
"learning_rate": 4.9807417290576604e-05, |
|
"loss": 0.5548, |
|
"num_tokens": 150912454.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.27303754266211605, |
|
"grad_norm": 0.44666759037455367, |
|
"learning_rate": 4.979620727411785e-05, |
|
"loss": 0.5419, |
|
"num_tokens": 152809678.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2764505119453925, |
|
"grad_norm": 0.46491094622823037, |
|
"learning_rate": 4.978468164292087e-05, |
|
"loss": 0.5264, |
|
"num_tokens": 154713852.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.27986348122866894, |
|
"grad_norm": 0.4241223884347823, |
|
"learning_rate": 4.977284056013714e-05, |
|
"loss": 0.5567, |
|
"num_tokens": 156683573.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2832764505119454, |
|
"grad_norm": 0.38952457164726123, |
|
"learning_rate": 4.976068419338352e-05, |
|
"loss": 0.5322, |
|
"num_tokens": 158515202.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.28668941979522183, |
|
"grad_norm": 0.41276839941571136, |
|
"learning_rate": 4.974821271473989e-05, |
|
"loss": 0.554, |
|
"num_tokens": 160433326.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2901023890784983, |
|
"grad_norm": 0.46776340260588906, |
|
"learning_rate": 4.973542630074669e-05, |
|
"loss": 0.5379, |
|
"num_tokens": 162331530.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2935153583617747, |
|
"grad_norm": 0.3942126322792489, |
|
"learning_rate": 4.9722325132402456e-05, |
|
"loss": 0.5544, |
|
"num_tokens": 164374316.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.29692832764505117, |
|
"grad_norm": 0.42485340574568903, |
|
"learning_rate": 4.970890939516122e-05, |
|
"loss": 0.5212, |
|
"num_tokens": 166249544.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3003412969283277, |
|
"grad_norm": 0.36931812888506377, |
|
"learning_rate": 4.969517927892993e-05, |
|
"loss": 0.5307, |
|
"num_tokens": 168223299.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3037542662116041, |
|
"grad_norm": 0.37893051071556805, |
|
"learning_rate": 4.968113497806571e-05, |
|
"loss": 0.5495, |
|
"num_tokens": 170189422.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.30716723549488056, |
|
"grad_norm": 0.391417342854194, |
|
"learning_rate": 4.966677669137318e-05, |
|
"loss": 0.5273, |
|
"num_tokens": 172132000.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.310580204778157, |
|
"grad_norm": 0.43359270381089615, |
|
"learning_rate": 4.965210462210153e-05, |
|
"loss": 0.5391, |
|
"num_tokens": 174083393.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.31399317406143346, |
|
"grad_norm": 0.3814563282884313, |
|
"learning_rate": 4.963711897794177e-05, |
|
"loss": 0.5283, |
|
"num_tokens": 176086331.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3174061433447099, |
|
"grad_norm": 0.3646920542467526, |
|
"learning_rate": 4.962181997102371e-05, |
|
"loss": 0.5127, |
|
"num_tokens": 178019887.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.32081911262798635, |
|
"grad_norm": 0.4136924968720306, |
|
"learning_rate": 4.9606207817912956e-05, |
|
"loss": 0.5394, |
|
"num_tokens": 179917640.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3242320819112628, |
|
"grad_norm": 0.3583143442753951, |
|
"learning_rate": 4.95902827396079e-05, |
|
"loss": 0.5432, |
|
"num_tokens": 181861425.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.32764505119453924, |
|
"grad_norm": 0.36301898613611383, |
|
"learning_rate": 4.957404496153654e-05, |
|
"loss": 0.5331, |
|
"num_tokens": 183746129.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3310580204778157, |
|
"grad_norm": 0.4058100440696107, |
|
"learning_rate": 4.95574947135533e-05, |
|
"loss": 0.55, |
|
"num_tokens": 185690559.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.33447098976109213, |
|
"grad_norm": 0.3963433287989491, |
|
"learning_rate": 4.95406322299358e-05, |
|
"loss": 0.5431, |
|
"num_tokens": 187699370.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3378839590443686, |
|
"grad_norm": 0.3358571679241484, |
|
"learning_rate": 4.952345774938151e-05, |
|
"loss": 0.5282, |
|
"num_tokens": 189580008.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 0.3876973392915836, |
|
"learning_rate": 4.95059715150044e-05, |
|
"loss": 0.5304, |
|
"num_tokens": 191512142.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3447098976109215, |
|
"grad_norm": 0.39675038817247194, |
|
"learning_rate": 4.948817377433145e-05, |
|
"loss": 0.5264, |
|
"num_tokens": 193419599.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.34812286689419797, |
|
"grad_norm": 0.352402123633437, |
|
"learning_rate": 4.947006477929921e-05, |
|
"loss": 0.5347, |
|
"num_tokens": 195367749.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3515358361774744, |
|
"grad_norm": 0.3811400127650845, |
|
"learning_rate": 4.945164478625018e-05, |
|
"loss": 0.5146, |
|
"num_tokens": 197256984.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.35494880546075086, |
|
"grad_norm": 0.3515851376722721, |
|
"learning_rate": 4.943291405592923e-05, |
|
"loss": 0.524, |
|
"num_tokens": 199215371.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3583617747440273, |
|
"grad_norm": 0.36260566573089453, |
|
"learning_rate": 4.9413872853479845e-05, |
|
"loss": 0.5503, |
|
"num_tokens": 201102127.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.36177474402730375, |
|
"grad_norm": 0.3952389227344783, |
|
"learning_rate": 4.9394521448440445e-05, |
|
"loss": 0.5273, |
|
"num_tokens": 202903048.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3651877133105802, |
|
"grad_norm": 0.40864664488422103, |
|
"learning_rate": 4.9374860114740504e-05, |
|
"loss": 0.5286, |
|
"num_tokens": 204769389.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.36860068259385664, |
|
"grad_norm": 0.33805338721530187, |
|
"learning_rate": 4.9354889130696724e-05, |
|
"loss": 0.5277, |
|
"num_tokens": 206761213.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3720136518771331, |
|
"grad_norm": 0.3627209335431525, |
|
"learning_rate": 4.933460877900907e-05, |
|
"loss": 0.524, |
|
"num_tokens": 208772401.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.37542662116040953, |
|
"grad_norm": 0.3611582489390528, |
|
"learning_rate": 4.931401934675675e-05, |
|
"loss": 0.5239, |
|
"num_tokens": 210585632.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.378839590443686, |
|
"grad_norm": 0.347893449988355, |
|
"learning_rate": 4.9293121125394203e-05, |
|
"loss": 0.5334, |
|
"num_tokens": 212446145.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3822525597269625, |
|
"grad_norm": 0.3819699892037328, |
|
"learning_rate": 4.927191441074692e-05, |
|
"loss": 0.5256, |
|
"num_tokens": 214261738.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3856655290102389, |
|
"grad_norm": 0.36539693889703045, |
|
"learning_rate": 4.92503995030073e-05, |
|
"loss": 0.54, |
|
"num_tokens": 216231144.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.3890784982935154, |
|
"grad_norm": 0.4184658120058824, |
|
"learning_rate": 4.9228576706730355e-05, |
|
"loss": 0.5292, |
|
"num_tokens": 218144024.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3924914675767918, |
|
"grad_norm": 0.35963118361907315, |
|
"learning_rate": 4.9206446330829456e-05, |
|
"loss": 0.5286, |
|
"num_tokens": 220108684.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.39590443686006827, |
|
"grad_norm": 0.3683166909297301, |
|
"learning_rate": 4.9184008688571884e-05, |
|
"loss": 0.5311, |
|
"num_tokens": 222057295.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3993174061433447, |
|
"grad_norm": 0.3340559497592815, |
|
"learning_rate": 4.9161264097574483e-05, |
|
"loss": 0.5167, |
|
"num_tokens": 223892351.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.40273037542662116, |
|
"grad_norm": 0.33966650506262697, |
|
"learning_rate": 4.913821287979909e-05, |
|
"loss": 0.5253, |
|
"num_tokens": 225828573.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4061433447098976, |
|
"grad_norm": 0.31277344378918026, |
|
"learning_rate": 4.911485536154804e-05, |
|
"loss": 0.5205, |
|
"num_tokens": 227748575.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.40955631399317405, |
|
"grad_norm": 0.36134254991055687, |
|
"learning_rate": 4.909119187345948e-05, |
|
"loss": 0.5193, |
|
"num_tokens": 229710487.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4129692832764505, |
|
"grad_norm": 0.3344543802661992, |
|
"learning_rate": 4.906722275050274e-05, |
|
"loss": 0.5262, |
|
"num_tokens": 231678126.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.41638225255972694, |
|
"grad_norm": 0.3554138286052159, |
|
"learning_rate": 4.904294833197358e-05, |
|
"loss": 0.5437, |
|
"num_tokens": 233617525.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.4197952218430034, |
|
"grad_norm": 0.3486512810565428, |
|
"learning_rate": 4.901836896148938e-05, |
|
"loss": 0.5302, |
|
"num_tokens": 235464027.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4232081911262799, |
|
"grad_norm": 0.3466494965430672, |
|
"learning_rate": 4.8993484986984265e-05, |
|
"loss": 0.5353, |
|
"num_tokens": 237394471.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.42662116040955633, |
|
"grad_norm": 0.3503225313716609, |
|
"learning_rate": 4.896829676070421e-05, |
|
"loss": 0.5251, |
|
"num_tokens": 239271145.0, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4300341296928328, |
|
"grad_norm": 0.35994178207363314, |
|
"learning_rate": 4.894280463920201e-05, |
|
"loss": 0.5296, |
|
"num_tokens": 241114261.0, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.4334470989761092, |
|
"grad_norm": 0.33650308530603384, |
|
"learning_rate": 4.8917008983332307e-05, |
|
"loss": 0.5402, |
|
"num_tokens": 242992742.0, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.43686006825938567, |
|
"grad_norm": 0.321344393861645, |
|
"learning_rate": 4.889091015824639e-05, |
|
"loss": 0.5151, |
|
"num_tokens": 244967987.0, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4402730375426621, |
|
"grad_norm": 0.3663954539776663, |
|
"learning_rate": 4.886450853338709e-05, |
|
"loss": 0.5502, |
|
"num_tokens": 246976387.0, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.44368600682593856, |
|
"grad_norm": 0.45975821746996437, |
|
"learning_rate": 4.883780448248353e-05, |
|
"loss": 0.5183, |
|
"num_tokens": 248751095.0, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.447098976109215, |
|
"grad_norm": 0.3555087234083296, |
|
"learning_rate": 4.881079838354582e-05, |
|
"loss": 0.5277, |
|
"num_tokens": 250671856.0, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.45051194539249145, |
|
"grad_norm": 0.37726144835555214, |
|
"learning_rate": 4.878349061885974e-05, |
|
"loss": 0.5157, |
|
"num_tokens": 252607265.0, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4539249146757679, |
|
"grad_norm": 0.3616385768897013, |
|
"learning_rate": 4.875588157498131e-05, |
|
"loss": 0.5288, |
|
"num_tokens": 254497259.0, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.45733788395904434, |
|
"grad_norm": 0.3450279758346968, |
|
"learning_rate": 4.87279716427313e-05, |
|
"loss": 0.5279, |
|
"num_tokens": 256417909.0, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.46075085324232085, |
|
"grad_norm": 0.33204144900991867, |
|
"learning_rate": 4.8699761217189735e-05, |
|
"loss": 0.5102, |
|
"num_tokens": 258396985.0, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.4641638225255973, |
|
"grad_norm": 0.3431329045509487, |
|
"learning_rate": 4.867125069769027e-05, |
|
"loss": 0.5322, |
|
"num_tokens": 260456429.0, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.46757679180887374, |
|
"grad_norm": 0.3809746183245989, |
|
"learning_rate": 4.864244048781458e-05, |
|
"loss": 0.5119, |
|
"num_tokens": 262384995.0, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.4709897610921502, |
|
"grad_norm": 0.3712668805710275, |
|
"learning_rate": 4.861333099538656e-05, |
|
"loss": 0.5154, |
|
"num_tokens": 264287905.0, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.47440273037542663, |
|
"grad_norm": 0.36608602643678284, |
|
"learning_rate": 4.858392263246666e-05, |
|
"loss": 0.5226, |
|
"num_tokens": 266198448.0, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.4778156996587031, |
|
"grad_norm": 0.40431967372937866, |
|
"learning_rate": 4.8554215815345985e-05, |
|
"loss": 0.5223, |
|
"num_tokens": 268098790.0, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4812286689419795, |
|
"grad_norm": 0.3571318972995455, |
|
"learning_rate": 4.852421096454041e-05, |
|
"loss": 0.5339, |
|
"num_tokens": 270033989.0, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.48464163822525597, |
|
"grad_norm": 0.3432800376569745, |
|
"learning_rate": 4.849390850478465e-05, |
|
"loss": 0.5301, |
|
"num_tokens": 271974065.0, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.4880546075085324, |
|
"grad_norm": 0.40999627806439953, |
|
"learning_rate": 4.846330886502625e-05, |
|
"loss": 0.5401, |
|
"num_tokens": 273982161.0, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.49146757679180886, |
|
"grad_norm": 0.3829538235957624, |
|
"learning_rate": 4.843241247841944e-05, |
|
"loss": 0.5317, |
|
"num_tokens": 275852045.0, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4948805460750853, |
|
"grad_norm": 0.32021121908893335, |
|
"learning_rate": 4.8401219782319114e-05, |
|
"loss": 0.502, |
|
"num_tokens": 277687831.0, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.49829351535836175, |
|
"grad_norm": 0.30949205453341727, |
|
"learning_rate": 4.8369731218274567e-05, |
|
"loss": 0.5252, |
|
"num_tokens": 279504484.0, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5017064846416383, |
|
"grad_norm": 0.31840825885431656, |
|
"learning_rate": 4.833794723202327e-05, |
|
"loss": 0.5098, |
|
"num_tokens": 281456293.0, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5051194539249146, |
|
"grad_norm": 0.3167489912000843, |
|
"learning_rate": 4.8305868273484537e-05, |
|
"loss": 0.521, |
|
"num_tokens": 283461546.0, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5085324232081911, |
|
"grad_norm": 0.30595968475470414, |
|
"learning_rate": 4.8273494796753175e-05, |
|
"loss": 0.5165, |
|
"num_tokens": 285433304.0, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5119453924914675, |
|
"grad_norm": 0.3746798875916052, |
|
"learning_rate": 4.824082726009308e-05, |
|
"loss": 0.53, |
|
"num_tokens": 287371918.0, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.515358361774744, |
|
"grad_norm": 0.345045842082928, |
|
"learning_rate": 4.8207866125930696e-05, |
|
"loss": 0.502, |
|
"num_tokens": 289259239.0, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5187713310580204, |
|
"grad_norm": 0.38441521290672914, |
|
"learning_rate": 4.81746118608485e-05, |
|
"loss": 0.5184, |
|
"num_tokens": 291057738.0, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5221843003412969, |
|
"grad_norm": 0.3459495605757955, |
|
"learning_rate": 4.8141064935578404e-05, |
|
"loss": 0.5238, |
|
"num_tokens": 293034663.0, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5255972696245734, |
|
"grad_norm": 0.3465676375928197, |
|
"learning_rate": 4.810722582499508e-05, |
|
"loss": 0.5142, |
|
"num_tokens": 294881963.0, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5290102389078498, |
|
"grad_norm": 0.3431332856561044, |
|
"learning_rate": 4.8073095008109234e-05, |
|
"loss": 0.5229, |
|
"num_tokens": 296794020.0, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5324232081911263, |
|
"grad_norm": 0.34723558691112266, |
|
"learning_rate": 4.8038672968060824e-05, |
|
"loss": 0.5205, |
|
"num_tokens": 298677895.0, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5358361774744027, |
|
"grad_norm": 0.3661333544764211, |
|
"learning_rate": 4.800396019211224e-05, |
|
"loss": 0.5115, |
|
"num_tokens": 300610709.0, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5392491467576792, |
|
"grad_norm": 0.3261317551653855, |
|
"learning_rate": 4.79689571716414e-05, |
|
"loss": 0.5005, |
|
"num_tokens": 302387985.0, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5426621160409556, |
|
"grad_norm": 0.44334818660500824, |
|
"learning_rate": 4.793366440213477e-05, |
|
"loss": 0.5294, |
|
"num_tokens": 304293467.0, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5460750853242321, |
|
"grad_norm": 0.3081814955106798, |
|
"learning_rate": 4.789808238318037e-05, |
|
"loss": 0.5258, |
|
"num_tokens": 306130593.0, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5494880546075085, |
|
"grad_norm": 0.3576761308612535, |
|
"learning_rate": 4.786221161846072e-05, |
|
"loss": 0.5309, |
|
"num_tokens": 308025370.0, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.552901023890785, |
|
"grad_norm": 0.3270050276920037, |
|
"learning_rate": 4.782605261574568e-05, |
|
"loss": 0.5209, |
|
"num_tokens": 309952008.0, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5563139931740614, |
|
"grad_norm": 0.31786503490293877, |
|
"learning_rate": 4.778960588688527e-05, |
|
"loss": 0.5082, |
|
"num_tokens": 311856787.0, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.5597269624573379, |
|
"grad_norm": 0.33017454672312996, |
|
"learning_rate": 4.775287194780241e-05, |
|
"loss": 0.5147, |
|
"num_tokens": 313725489.0, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5631399317406144, |
|
"grad_norm": 0.3604312004052758, |
|
"learning_rate": 4.771585131848569e-05, |
|
"loss": 0.5133, |
|
"num_tokens": 315690477.0, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.5665529010238908, |
|
"grad_norm": 0.333952918780597, |
|
"learning_rate": 4.767854452298188e-05, |
|
"loss": 0.5204, |
|
"num_tokens": 317525148.0, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5699658703071673, |
|
"grad_norm": 0.3267362876583091, |
|
"learning_rate": 4.764095208938865e-05, |
|
"loss": 0.5149, |
|
"num_tokens": 319505322.0, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.5733788395904437, |
|
"grad_norm": 0.337839892043544, |
|
"learning_rate": 4.7603074549846986e-05, |
|
"loss": 0.5182, |
|
"num_tokens": 321410101.0, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5767918088737202, |
|
"grad_norm": 0.37147409137067106, |
|
"learning_rate": 4.7564912440533734e-05, |
|
"loss": 0.5097, |
|
"num_tokens": 323302156.0, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5802047781569966, |
|
"grad_norm": 0.3385281078202508, |
|
"learning_rate": 4.752646630165393e-05, |
|
"loss": 0.5126, |
|
"num_tokens": 325102278.0, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5836177474402731, |
|
"grad_norm": 0.4040596596048006, |
|
"learning_rate": 4.7487736677433256e-05, |
|
"loss": 0.5172, |
|
"num_tokens": 326903105.0, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5870307167235495, |
|
"grad_norm": 0.39849331627110846, |
|
"learning_rate": 4.7448724116110264e-05, |
|
"loss": 0.5146, |
|
"num_tokens": 328831071.0, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.590443686006826, |
|
"grad_norm": 0.31821166064088463, |
|
"learning_rate": 4.74094291699286e-05, |
|
"loss": 0.5133, |
|
"num_tokens": 330823718.0, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5938566552901023, |
|
"grad_norm": 0.32627133466823544, |
|
"learning_rate": 4.736985239512925e-05, |
|
"loss": 0.4976, |
|
"num_tokens": 332767044.0, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5972696245733788, |
|
"grad_norm": 0.3177734291813305, |
|
"learning_rate": 4.732999435194265e-05, |
|
"loss": 0.5165, |
|
"num_tokens": 334743310.0, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6006825938566553, |
|
"grad_norm": 0.30605344292447234, |
|
"learning_rate": 4.728985560458068e-05, |
|
"loss": 0.4884, |
|
"num_tokens": 336595638.0, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6040955631399317, |
|
"grad_norm": 0.32401153361842944, |
|
"learning_rate": 4.7249436721228795e-05, |
|
"loss": 0.5108, |
|
"num_tokens": 338382273.0, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6075085324232082, |
|
"grad_norm": 0.31304402477648574, |
|
"learning_rate": 4.720873827403791e-05, |
|
"loss": 0.5198, |
|
"num_tokens": 340358925.0, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6109215017064846, |
|
"grad_norm": 0.3218084504074191, |
|
"learning_rate": 4.716776083911631e-05, |
|
"loss": 0.5189, |
|
"num_tokens": 342188299.0, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6143344709897611, |
|
"grad_norm": 0.3510785346642464, |
|
"learning_rate": 4.7126504996521484e-05, |
|
"loss": 0.5096, |
|
"num_tokens": 344239058.0, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6177474402730375, |
|
"grad_norm": 0.36297363440496044, |
|
"learning_rate": 4.7084971330251974e-05, |
|
"loss": 0.5204, |
|
"num_tokens": 346206452.0, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.621160409556314, |
|
"grad_norm": 0.3068962752186269, |
|
"learning_rate": 4.704316042823902e-05, |
|
"loss": 0.5152, |
|
"num_tokens": 348080585.0, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6245733788395904, |
|
"grad_norm": 0.31633950180676207, |
|
"learning_rate": 4.700107288233831e-05, |
|
"loss": 0.5224, |
|
"num_tokens": 349930176.0, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6279863481228669, |
|
"grad_norm": 0.31119199537147196, |
|
"learning_rate": 4.695870928832156e-05, |
|
"loss": 0.4999, |
|
"num_tokens": 351817695.0, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6313993174061433, |
|
"grad_norm": 0.30951025365629076, |
|
"learning_rate": 4.6916070245868085e-05, |
|
"loss": 0.5052, |
|
"num_tokens": 353737820.0, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6348122866894198, |
|
"grad_norm": 0.34341366500684045, |
|
"learning_rate": 4.6873156358556375e-05, |
|
"loss": 0.5059, |
|
"num_tokens": 355639183.0, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6382252559726962, |
|
"grad_norm": 0.33780786043507366, |
|
"learning_rate": 4.682996823385541e-05, |
|
"loss": 0.5263, |
|
"num_tokens": 357548627.0, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.6416382252559727, |
|
"grad_norm": 0.3602258225812933, |
|
"learning_rate": 4.678650648311624e-05, |
|
"loss": 0.5235, |
|
"num_tokens": 359437581.0, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6450511945392492, |
|
"grad_norm": 0.3623769577177757, |
|
"learning_rate": 4.6742771721563146e-05, |
|
"loss": 0.4995, |
|
"num_tokens": 361329513.0, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.6484641638225256, |
|
"grad_norm": 0.36738029330256716, |
|
"learning_rate": 4.66987645682851e-05, |
|
"loss": 0.5039, |
|
"num_tokens": 363189983.0, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6518771331058021, |
|
"grad_norm": 0.2994153691319221, |
|
"learning_rate": 4.665448564622687e-05, |
|
"loss": 0.5085, |
|
"num_tokens": 365024705.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.6552901023890785, |
|
"grad_norm": 0.30495566025924037, |
|
"learning_rate": 4.660993558218028e-05, |
|
"loss": 0.499, |
|
"num_tokens": 366863209.0, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.658703071672355, |
|
"grad_norm": 0.3396370071116669, |
|
"learning_rate": 4.6565115006775335e-05, |
|
"loss": 0.5251, |
|
"num_tokens": 368837047.0, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.6621160409556314, |
|
"grad_norm": 0.3174353904822871, |
|
"learning_rate": 4.6520024554471224e-05, |
|
"loss": 0.5087, |
|
"num_tokens": 370725860.0, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6655290102389079, |
|
"grad_norm": 0.28442067468175136, |
|
"learning_rate": 4.647466486354743e-05, |
|
"loss": 0.4907, |
|
"num_tokens": 372569301.0, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.6689419795221843, |
|
"grad_norm": 0.2919040706685906, |
|
"learning_rate": 4.642903657609463e-05, |
|
"loss": 0.5137, |
|
"num_tokens": 374566515.0, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6723549488054608, |
|
"grad_norm": 0.29173671828185904, |
|
"learning_rate": 4.638314033800564e-05, |
|
"loss": 0.4944, |
|
"num_tokens": 376431969.0, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.6757679180887372, |
|
"grad_norm": 0.31461039206966784, |
|
"learning_rate": 4.633697679896626e-05, |
|
"loss": 0.495, |
|
"num_tokens": 378330489.0, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6791808873720137, |
|
"grad_norm": 0.2915321504491105, |
|
"learning_rate": 4.629054661244606e-05, |
|
"loss": 0.4871, |
|
"num_tokens": 380245892.0, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 0.286886601156865, |
|
"learning_rate": 4.624385043568917e-05, |
|
"loss": 0.5114, |
|
"num_tokens": 382103468.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6860068259385665, |
|
"grad_norm": 0.31706955102919415, |
|
"learning_rate": 4.6196888929704954e-05, |
|
"loss": 0.5196, |
|
"num_tokens": 384011945.0, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.689419795221843, |
|
"grad_norm": 0.33284742322766536, |
|
"learning_rate": 4.614966275925863e-05, |
|
"loss": 0.4963, |
|
"num_tokens": 385837297.0, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6928327645051194, |
|
"grad_norm": 0.4000880667080671, |
|
"learning_rate": 4.61021725928619e-05, |
|
"loss": 0.5107, |
|
"num_tokens": 387718762.0, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.6962457337883959, |
|
"grad_norm": 0.318145983785095, |
|
"learning_rate": 4.6054419102763476e-05, |
|
"loss": 0.5, |
|
"num_tokens": 389586680.0, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.6996587030716723, |
|
"grad_norm": 0.29446028177317474, |
|
"learning_rate": 4.600640296493953e-05, |
|
"loss": 0.5173, |
|
"num_tokens": 391567792.0, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7030716723549488, |
|
"grad_norm": 0.33638404359083496, |
|
"learning_rate": 4.59581248590842e-05, |
|
"loss": 0.5021, |
|
"num_tokens": 393428760.0, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7064846416382252, |
|
"grad_norm": 0.3178901269025221, |
|
"learning_rate": 4.590958546859988e-05, |
|
"loss": 0.5184, |
|
"num_tokens": 395264025.0, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7098976109215017, |
|
"grad_norm": 0.3175572857551193, |
|
"learning_rate": 4.58607854805876e-05, |
|
"loss": 0.5251, |
|
"num_tokens": 397158700.0, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7133105802047781, |
|
"grad_norm": 0.3108349326509548, |
|
"learning_rate": 4.581172558583729e-05, |
|
"loss": 0.497, |
|
"num_tokens": 399092348.0, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7167235494880546, |
|
"grad_norm": 0.3097661968317486, |
|
"learning_rate": 4.576240647881801e-05, |
|
"loss": 0.5148, |
|
"num_tokens": 400923938.0, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7201365187713311, |
|
"grad_norm": 0.29168496463784344, |
|
"learning_rate": 4.571282885766808e-05, |
|
"loss": 0.4945, |
|
"num_tokens": 402794748.0, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7235494880546075, |
|
"grad_norm": 0.30716631597516236, |
|
"learning_rate": 4.5662993424185244e-05, |
|
"loss": 0.5052, |
|
"num_tokens": 404685655.0, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.726962457337884, |
|
"grad_norm": 0.33227072895463494, |
|
"learning_rate": 4.561290088381672e-05, |
|
"loss": 0.5061, |
|
"num_tokens": 406613744.0, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.7303754266211604, |
|
"grad_norm": 0.3293212560670863, |
|
"learning_rate": 4.5562551945649215e-05, |
|
"loss": 0.5049, |
|
"num_tokens": 408582799.0, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7337883959044369, |
|
"grad_norm": 0.3356632435675788, |
|
"learning_rate": 4.5511947322398855e-05, |
|
"loss": 0.513, |
|
"num_tokens": 410442926.0, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7372013651877133, |
|
"grad_norm": 0.3107589924934008, |
|
"learning_rate": 4.546108773040116e-05, |
|
"loss": 0.5089, |
|
"num_tokens": 412386009.0, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7406143344709898, |
|
"grad_norm": 0.30270204787047783, |
|
"learning_rate": 4.540997388960085e-05, |
|
"loss": 0.5197, |
|
"num_tokens": 414357432.0, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.7440273037542662, |
|
"grad_norm": 0.2818167774666258, |
|
"learning_rate": 4.53586065235417e-05, |
|
"loss": 0.5077, |
|
"num_tokens": 416372039.0, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.7474402730375427, |
|
"grad_norm": 0.3102971120341378, |
|
"learning_rate": 4.530698635935622e-05, |
|
"loss": 0.5046, |
|
"num_tokens": 418265973.0, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.7508532423208191, |
|
"grad_norm": 0.3807798685595783, |
|
"learning_rate": 4.525511412775545e-05, |
|
"loss": 0.4921, |
|
"num_tokens": 420051975.0, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7542662116040956, |
|
"grad_norm": 0.31661653050496685, |
|
"learning_rate": 4.5202990563018565e-05, |
|
"loss": 0.4934, |
|
"num_tokens": 421905923.0, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.757679180887372, |
|
"grad_norm": 0.35276872591605113, |
|
"learning_rate": 4.515061640298251e-05, |
|
"loss": 0.5042, |
|
"num_tokens": 423685709.0, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.7610921501706485, |
|
"grad_norm": 0.3844552713062626, |
|
"learning_rate": 4.509799238903153e-05, |
|
"loss": 0.4947, |
|
"num_tokens": 425515443.0, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.764505119453925, |
|
"grad_norm": 0.31759030359483253, |
|
"learning_rate": 4.504511926608667e-05, |
|
"loss": 0.4949, |
|
"num_tokens": 427405904.0, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7679180887372014, |
|
"grad_norm": 0.3276618424804238, |
|
"learning_rate": 4.4991997782595286e-05, |
|
"loss": 0.5, |
|
"num_tokens": 429329969.0, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.7713310580204779, |
|
"grad_norm": 0.3504691972099564, |
|
"learning_rate": 4.493862869052038e-05, |
|
"loss": 0.4938, |
|
"num_tokens": 431288378.0, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.7747440273037542, |
|
"grad_norm": 0.31911074838089193, |
|
"learning_rate": 4.488501274532998e-05, |
|
"loss": 0.5025, |
|
"num_tokens": 433213317.0, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.7781569965870307, |
|
"grad_norm": 0.346412105131523, |
|
"learning_rate": 4.483115070598647e-05, |
|
"loss": 0.4941, |
|
"num_tokens": 435077995.0, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.7815699658703071, |
|
"grad_norm": 0.3211768432110787, |
|
"learning_rate": 4.4777043334935834e-05, |
|
"loss": 0.499, |
|
"num_tokens": 437063353.0, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.7849829351535836, |
|
"grad_norm": 0.2934813782432162, |
|
"learning_rate": 4.4722691398096845e-05, |
|
"loss": 0.4963, |
|
"num_tokens": 439006864.0, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.78839590443686, |
|
"grad_norm": 0.2838196869214083, |
|
"learning_rate": 4.466809566485022e-05, |
|
"loss": 0.5126, |
|
"num_tokens": 441123044.0, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.7918088737201365, |
|
"grad_norm": 0.30674961764611963, |
|
"learning_rate": 4.461325690802777e-05, |
|
"loss": 0.5124, |
|
"num_tokens": 443045688.0, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.7952218430034129, |
|
"grad_norm": 0.31400026415051735, |
|
"learning_rate": 4.455817590390144e-05, |
|
"loss": 0.5011, |
|
"num_tokens": 444874851.0, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.7986348122866894, |
|
"grad_norm": 0.32725515813051603, |
|
"learning_rate": 4.450285343217228e-05, |
|
"loss": 0.4986, |
|
"num_tokens": 446852018.0, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8020477815699659, |
|
"grad_norm": 0.32318551620334435, |
|
"learning_rate": 4.444729027595948e-05, |
|
"loss": 0.5048, |
|
"num_tokens": 448665204.0, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8054607508532423, |
|
"grad_norm": 0.33687487158543156, |
|
"learning_rate": 4.4391487221789216e-05, |
|
"loss": 0.5224, |
|
"num_tokens": 450687287.0, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8088737201365188, |
|
"grad_norm": 0.35347464715186616, |
|
"learning_rate": 4.433544505958358e-05, |
|
"loss": 0.4992, |
|
"num_tokens": 452550699.0, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.8122866894197952, |
|
"grad_norm": 0.3445779383146421, |
|
"learning_rate": 4.427916458264935e-05, |
|
"loss": 0.4991, |
|
"num_tokens": 454483896.0, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8156996587030717, |
|
"grad_norm": 0.29879424431987656, |
|
"learning_rate": 4.422264658766677e-05, |
|
"loss": 0.4841, |
|
"num_tokens": 456418788.0, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.8191126279863481, |
|
"grad_norm": 0.2991095523769755, |
|
"learning_rate": 4.416589187467828e-05, |
|
"loss": 0.51, |
|
"num_tokens": 458325861.0, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8225255972696246, |
|
"grad_norm": 0.32477512440054085, |
|
"learning_rate": 4.41089012470772e-05, |
|
"loss": 0.5153, |
|
"num_tokens": 460227692.0, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.825938566552901, |
|
"grad_norm": 0.3547912313792265, |
|
"learning_rate": 4.405167551159635e-05, |
|
"loss": 0.4893, |
|
"num_tokens": 462055868.0, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.8293515358361775, |
|
"grad_norm": 0.3016276005058045, |
|
"learning_rate": 4.399421547829661e-05, |
|
"loss": 0.4892, |
|
"num_tokens": 463988579.0, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.8327645051194539, |
|
"grad_norm": 0.309849759273995, |
|
"learning_rate": 4.393652196055548e-05, |
|
"loss": 0.504, |
|
"num_tokens": 465973876.0, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.8361774744027304, |
|
"grad_norm": 0.30189015799475877, |
|
"learning_rate": 4.3878595775055574e-05, |
|
"loss": 0.4969, |
|
"num_tokens": 467962214.0, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.8395904436860068, |
|
"grad_norm": 0.3047425782223173, |
|
"learning_rate": 4.382043774177302e-05, |
|
"loss": 0.4911, |
|
"num_tokens": 469791498.0, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.8430034129692833, |
|
"grad_norm": 0.29428684450637044, |
|
"learning_rate": 4.376204868396588e-05, |
|
"loss": 0.4926, |
|
"num_tokens": 471616902.0, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.8464163822525598, |
|
"grad_norm": 0.3158111338692259, |
|
"learning_rate": 4.370342942816249e-05, |
|
"loss": 0.49, |
|
"num_tokens": 473570581.0, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.8498293515358362, |
|
"grad_norm": 0.3408082001646201, |
|
"learning_rate": 4.3644580804149774e-05, |
|
"loss": 0.5041, |
|
"num_tokens": 475518680.0, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.8532423208191127, |
|
"grad_norm": 0.38616580597842337, |
|
"learning_rate": 4.358550364496148e-05, |
|
"loss": 0.5104, |
|
"num_tokens": 477401353.0, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.856655290102389, |
|
"grad_norm": 0.32247025368304155, |
|
"learning_rate": 4.3526198786866386e-05, |
|
"loss": 0.5048, |
|
"num_tokens": 479371988.0, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.8600682593856656, |
|
"grad_norm": 0.35588546668005355, |
|
"learning_rate": 4.3466667069356465e-05, |
|
"loss": 0.5074, |
|
"num_tokens": 481348892.0, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.863481228668942, |
|
"grad_norm": 0.3410788549802598, |
|
"learning_rate": 4.340690933513504e-05, |
|
"loss": 0.4931, |
|
"num_tokens": 483220037.0, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.8668941979522184, |
|
"grad_norm": 0.28996758799627625, |
|
"learning_rate": 4.334692643010478e-05, |
|
"loss": 0.5144, |
|
"num_tokens": 485171910.0, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.8703071672354948, |
|
"grad_norm": 0.3450006717750541, |
|
"learning_rate": 4.328671920335579e-05, |
|
"loss": 0.4909, |
|
"num_tokens": 487118187.0, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.8737201365187713, |
|
"grad_norm": 0.3168603261439533, |
|
"learning_rate": 4.322628850715357e-05, |
|
"loss": 0.487, |
|
"num_tokens": 489059548.0, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8771331058020477, |
|
"grad_norm": 0.2934471279813396, |
|
"learning_rate": 4.3165635196926935e-05, |
|
"loss": 0.4902, |
|
"num_tokens": 490973976.0, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.8805460750853242, |
|
"grad_norm": 0.30116352534369073, |
|
"learning_rate": 4.310476013125593e-05, |
|
"loss": 0.4939, |
|
"num_tokens": 492893029.0, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.8839590443686007, |
|
"grad_norm": 0.27653712452340234, |
|
"learning_rate": 4.3043664171859676e-05, |
|
"loss": 0.5133, |
|
"num_tokens": 494980675.0, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.8873720136518771, |
|
"grad_norm": 0.3008423365150182, |
|
"learning_rate": 4.298234818358414e-05, |
|
"loss": 0.485, |
|
"num_tokens": 496905674.0, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8907849829351536, |
|
"grad_norm": 0.2986356900020865, |
|
"learning_rate": 4.2920813034389944e-05, |
|
"loss": 0.4984, |
|
"num_tokens": 498889766.0, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.89419795221843, |
|
"grad_norm": 0.274831418663253, |
|
"learning_rate": 4.285905959534002e-05, |
|
"loss": 0.4897, |
|
"num_tokens": 500864542.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.8976109215017065, |
|
"grad_norm": 0.3126326726365892, |
|
"learning_rate": 4.2797088740587324e-05, |
|
"loss": 0.5062, |
|
"num_tokens": 502876203.0, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.9010238907849829, |
|
"grad_norm": 0.3234689945785143, |
|
"learning_rate": 4.273490134736246e-05, |
|
"loss": 0.4892, |
|
"num_tokens": 504810366.0, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9044368600682594, |
|
"grad_norm": 0.2728068836733229, |
|
"learning_rate": 4.267249829596123e-05, |
|
"loss": 0.4927, |
|
"num_tokens": 506777983.0, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9078498293515358, |
|
"grad_norm": 0.3168203159146273, |
|
"learning_rate": 4.2609880469732196e-05, |
|
"loss": 0.4949, |
|
"num_tokens": 508607232.0, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9112627986348123, |
|
"grad_norm": 0.29165446674166984, |
|
"learning_rate": 4.254704875506419e-05, |
|
"loss": 0.4949, |
|
"num_tokens": 510543906.0, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.9146757679180887, |
|
"grad_norm": 0.2873297869381404, |
|
"learning_rate": 4.2484004041373724e-05, |
|
"loss": 0.5078, |
|
"num_tokens": 512474084.0, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9180887372013652, |
|
"grad_norm": 0.3073994896051493, |
|
"learning_rate": 4.242074722109244e-05, |
|
"loss": 0.4827, |
|
"num_tokens": 514322576.0, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.9215017064846417, |
|
"grad_norm": 0.33305778620591386, |
|
"learning_rate": 4.235727918965446e-05, |
|
"loss": 0.5005, |
|
"num_tokens": 516216104.0, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9249146757679181, |
|
"grad_norm": 0.3369378626468833, |
|
"learning_rate": 4.2293600845483715e-05, |
|
"loss": 0.5062, |
|
"num_tokens": 518137999.0, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.9283276450511946, |
|
"grad_norm": 0.3298207620717988, |
|
"learning_rate": 4.222971308998123e-05, |
|
"loss": 0.4961, |
|
"num_tokens": 519966345.0, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.931740614334471, |
|
"grad_norm": 0.30388199133879135, |
|
"learning_rate": 4.216561682751234e-05, |
|
"loss": 0.4818, |
|
"num_tokens": 521824255.0, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.9351535836177475, |
|
"grad_norm": 0.32752381042516066, |
|
"learning_rate": 4.210131296539391e-05, |
|
"loss": 0.4921, |
|
"num_tokens": 523793837.0, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.9385665529010239, |
|
"grad_norm": 0.28445695076252103, |
|
"learning_rate": 4.2036802413881524e-05, |
|
"loss": 0.4982, |
|
"num_tokens": 525746360.0, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.9419795221843004, |
|
"grad_norm": 0.28899230017319383, |
|
"learning_rate": 4.1972086086156525e-05, |
|
"loss": 0.5019, |
|
"num_tokens": 527666864.0, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.9453924914675768, |
|
"grad_norm": 0.32011937903596116, |
|
"learning_rate": 4.190716489831315e-05, |
|
"loss": 0.5005, |
|
"num_tokens": 529624010.0, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.9488054607508533, |
|
"grad_norm": 0.29697819340477716, |
|
"learning_rate": 4.184203976934552e-05, |
|
"loss": 0.4939, |
|
"num_tokens": 531466359.0, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.9522184300341296, |
|
"grad_norm": 0.3027431585655998, |
|
"learning_rate": 4.177671162113468e-05, |
|
"loss": 0.5084, |
|
"num_tokens": 533360398.0, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.9556313993174061, |
|
"grad_norm": 0.32991449610546014, |
|
"learning_rate": 4.17111813784355e-05, |
|
"loss": 0.4906, |
|
"num_tokens": 535253217.0, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9590443686006825, |
|
"grad_norm": 0.33638706750117725, |
|
"learning_rate": 4.16454499688636e-05, |
|
"loss": 0.4806, |
|
"num_tokens": 537247787.0, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.962457337883959, |
|
"grad_norm": 0.2978526679067072, |
|
"learning_rate": 4.1579518322882264e-05, |
|
"loss": 0.4863, |
|
"num_tokens": 539137436.0, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.9658703071672355, |
|
"grad_norm": 0.28420517349753716, |
|
"learning_rate": 4.1513387373789174e-05, |
|
"loss": 0.491, |
|
"num_tokens": 540954808.0, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.9692832764505119, |
|
"grad_norm": 0.2914181751043302, |
|
"learning_rate": 4.1447058057703296e-05, |
|
"loss": 0.4974, |
|
"num_tokens": 542824098.0, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9726962457337884, |
|
"grad_norm": 0.30896035937102595, |
|
"learning_rate": 4.138053131355158e-05, |
|
"loss": 0.51, |
|
"num_tokens": 544761712.0, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.9761092150170648, |
|
"grad_norm": 0.2996642933295098, |
|
"learning_rate": 4.131380808305565e-05, |
|
"loss": 0.4992, |
|
"num_tokens": 546578047.0, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.9795221843003413, |
|
"grad_norm": 0.32391571755782744, |
|
"learning_rate": 4.1246889310718536e-05, |
|
"loss": 0.4907, |
|
"num_tokens": 548490659.0, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.9829351535836177, |
|
"grad_norm": 0.2805330274894754, |
|
"learning_rate": 4.117977594381123e-05, |
|
"loss": 0.4904, |
|
"num_tokens": 550377811.0, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9863481228668942, |
|
"grad_norm": 0.36985148320187977, |
|
"learning_rate": 4.111246893235935e-05, |
|
"loss": 0.4949, |
|
"num_tokens": 552306122.0, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.9897610921501706, |
|
"grad_norm": 0.2744153820711885, |
|
"learning_rate": 4.104496922912963e-05, |
|
"loss": 0.4791, |
|
"num_tokens": 554224024.0, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.9931740614334471, |
|
"grad_norm": 0.3466323284595071, |
|
"learning_rate": 4.097727778961648e-05, |
|
"loss": 0.4995, |
|
"num_tokens": 556055771.0, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.9965870307167235, |
|
"grad_norm": 0.2796639263924327, |
|
"learning_rate": 4.090939557202841e-05, |
|
"loss": 0.4876, |
|
"num_tokens": 558001366.0, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2779393474451887, |
|
"learning_rate": 4.0841323537274544e-05, |
|
"loss": 0.4929, |
|
"num_tokens": 559917574.0, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.0034129692832765, |
|
"grad_norm": 0.3138567800728616, |
|
"learning_rate": 4.0773062648950905e-05, |
|
"loss": 0.4536, |
|
"num_tokens": 561795304.0, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.006825938566553, |
|
"grad_norm": 0.3114289367733724, |
|
"learning_rate": 4.0704613873326895e-05, |
|
"loss": 0.4544, |
|
"num_tokens": 563661756.0, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.0102389078498293, |
|
"grad_norm": 0.32100212575438497, |
|
"learning_rate": 4.0635978179331534e-05, |
|
"loss": 0.4557, |
|
"num_tokens": 565576615.0, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.0136518771331058, |
|
"grad_norm": 0.33784033052194967, |
|
"learning_rate": 4.056715653853977e-05, |
|
"loss": 0.4658, |
|
"num_tokens": 567552980.0, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.0170648464163823, |
|
"grad_norm": 0.2846562551793901, |
|
"learning_rate": 4.0498149925158765e-05, |
|
"loss": 0.4486, |
|
"num_tokens": 569441970.0, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.0204778156996588, |
|
"grad_norm": 0.27181341067200976, |
|
"learning_rate": 4.0428959316013996e-05, |
|
"loss": 0.4571, |
|
"num_tokens": 571408138.0, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.023890784982935, |
|
"grad_norm": 0.29080838948154386, |
|
"learning_rate": 4.0359585690535565e-05, |
|
"loss": 0.4679, |
|
"num_tokens": 573385584.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0273037542662116, |
|
"grad_norm": 0.2862021615209201, |
|
"learning_rate": 4.0290030030744244e-05, |
|
"loss": 0.438, |
|
"num_tokens": 575190202.0, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.030716723549488, |
|
"grad_norm": 0.2972505881497117, |
|
"learning_rate": 4.0220293321237577e-05, |
|
"loss": 0.4564, |
|
"num_tokens": 577207221.0, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.0341296928327646, |
|
"grad_norm": 0.3039253750758028, |
|
"learning_rate": 4.0150376549176e-05, |
|
"loss": 0.4605, |
|
"num_tokens": 579169216.0, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.0375426621160408, |
|
"grad_norm": 0.29255452310430513, |
|
"learning_rate": 4.0080280704268805e-05, |
|
"loss": 0.4518, |
|
"num_tokens": 581176488.0, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.0409556313993173, |
|
"grad_norm": 0.2883720019415548, |
|
"learning_rate": 4.0010006778760175e-05, |
|
"loss": 0.4616, |
|
"num_tokens": 583060493.0, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.0443686006825939, |
|
"grad_norm": 0.2980789185115071, |
|
"learning_rate": 3.993955576741509e-05, |
|
"loss": 0.4491, |
|
"num_tokens": 584967164.0, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.0477815699658704, |
|
"grad_norm": 0.2962427827544438, |
|
"learning_rate": 3.986892866750532e-05, |
|
"loss": 0.469, |
|
"num_tokens": 586896302.0, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.0511945392491469, |
|
"grad_norm": 0.30639817855272605, |
|
"learning_rate": 3.979812647879522e-05, |
|
"loss": 0.4544, |
|
"num_tokens": 588745165.0, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.0546075085324231, |
|
"grad_norm": 0.2773317562699212, |
|
"learning_rate": 3.972715020352763e-05, |
|
"loss": 0.4486, |
|
"num_tokens": 590639322.0, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.0580204778156996, |
|
"grad_norm": 0.29236172696032264, |
|
"learning_rate": 3.9656000846409695e-05, |
|
"loss": 0.4424, |
|
"num_tokens": 592549757.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.0614334470989761, |
|
"grad_norm": 0.2571903473392426, |
|
"learning_rate": 3.9584679414598616e-05, |
|
"loss": 0.4595, |
|
"num_tokens": 594545910.0, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.0648464163822526, |
|
"grad_norm": 0.2961845232409812, |
|
"learning_rate": 3.951318691768741e-05, |
|
"loss": 0.4525, |
|
"num_tokens": 596464710.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.068259385665529, |
|
"grad_norm": 0.25912887664891626, |
|
"learning_rate": 3.944152436769059e-05, |
|
"loss": 0.4563, |
|
"num_tokens": 598527528.0, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.0716723549488054, |
|
"grad_norm": 0.28353901932289044, |
|
"learning_rate": 3.93696927790299e-05, |
|
"loss": 0.4512, |
|
"num_tokens": 600405627.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.075085324232082, |
|
"grad_norm": 0.32788777138253855, |
|
"learning_rate": 3.929769316851987e-05, |
|
"loss": 0.4519, |
|
"num_tokens": 602215607.0, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.0784982935153584, |
|
"grad_norm": 0.3517481812619457, |
|
"learning_rate": 3.92255265553535e-05, |
|
"loss": 0.4673, |
|
"num_tokens": 604135477.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.0819112627986347, |
|
"grad_norm": 0.32122104215438485, |
|
"learning_rate": 3.915319396108781e-05, |
|
"loss": 0.4534, |
|
"num_tokens": 606019690.0, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.0853242320819112, |
|
"grad_norm": 0.29953390066728885, |
|
"learning_rate": 3.9080696409629344e-05, |
|
"loss": 0.4532, |
|
"num_tokens": 607891544.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.0887372013651877, |
|
"grad_norm": 0.312931983732918, |
|
"learning_rate": 3.900803492721971e-05, |
|
"loss": 0.4546, |
|
"num_tokens": 609757887.0, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.0921501706484642, |
|
"grad_norm": 0.3128391047692134, |
|
"learning_rate": 3.8935210542421055e-05, |
|
"loss": 0.4487, |
|
"num_tokens": 611634451.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0955631399317407, |
|
"grad_norm": 0.32796952939352125, |
|
"learning_rate": 3.886222428610149e-05, |
|
"loss": 0.447, |
|
"num_tokens": 613533180.0, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.098976109215017, |
|
"grad_norm": 0.29026332056250814, |
|
"learning_rate": 3.878907719142052e-05, |
|
"loss": 0.4616, |
|
"num_tokens": 615506453.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.1023890784982935, |
|
"grad_norm": 0.2821146739678027, |
|
"learning_rate": 3.871577029381439e-05, |
|
"loss": 0.4414, |
|
"num_tokens": 617472869.0, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.10580204778157, |
|
"grad_norm": 0.33356289539712947, |
|
"learning_rate": 3.864230463098142e-05, |
|
"loss": 0.4556, |
|
"num_tokens": 619275868.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.1092150170648465, |
|
"grad_norm": 0.2989940473508491, |
|
"learning_rate": 3.8568681242867375e-05, |
|
"loss": 0.4526, |
|
"num_tokens": 621299094.0, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.1126279863481228, |
|
"grad_norm": 0.2855743012501097, |
|
"learning_rate": 3.849490117165069e-05, |
|
"loss": 0.4434, |
|
"num_tokens": 623118795.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.1160409556313993, |
|
"grad_norm": 0.29837585943422495, |
|
"learning_rate": 3.842096546172772e-05, |
|
"loss": 0.4651, |
|
"num_tokens": 625108673.0, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.1194539249146758, |
|
"grad_norm": 0.30667205927828434, |
|
"learning_rate": 3.834687515969798e-05, |
|
"loss": 0.4607, |
|
"num_tokens": 627058447.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.1228668941979523, |
|
"grad_norm": 0.28121218504839973, |
|
"learning_rate": 3.827263131434934e-05, |
|
"loss": 0.4542, |
|
"num_tokens": 628986208.0, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.1262798634812285, |
|
"grad_norm": 0.3003909890336984, |
|
"learning_rate": 3.819823497664311e-05, |
|
"loss": 0.4484, |
|
"num_tokens": 630867075.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.129692832764505, |
|
"grad_norm": 0.2672543488419772, |
|
"learning_rate": 3.8123687199699266e-05, |
|
"loss": 0.4582, |
|
"num_tokens": 632732993.0, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.1331058020477816, |
|
"grad_norm": 0.2828052831967036, |
|
"learning_rate": 3.8048989038781435e-05, |
|
"loss": 0.4285, |
|
"num_tokens": 634492738.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.136518771331058, |
|
"grad_norm": 0.2960342355211648, |
|
"learning_rate": 3.797414155128205e-05, |
|
"loss": 0.4462, |
|
"num_tokens": 636253607.0, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.1399317406143346, |
|
"grad_norm": 0.27495101737982175, |
|
"learning_rate": 3.789914579670732e-05, |
|
"loss": 0.4617, |
|
"num_tokens": 638081192.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.1433447098976108, |
|
"grad_norm": 0.30596461849028866, |
|
"learning_rate": 3.7824002836662257e-05, |
|
"loss": 0.4491, |
|
"num_tokens": 640042390.0, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.1467576791808873, |
|
"grad_norm": 0.3312421873485475, |
|
"learning_rate": 3.774871373483565e-05, |
|
"loss": 0.4575, |
|
"num_tokens": 641973230.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.1501706484641638, |
|
"grad_norm": 0.3104278063985918, |
|
"learning_rate": 3.7673279556985e-05, |
|
"loss": 0.4525, |
|
"num_tokens": 643968400.0, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.1535836177474403, |
|
"grad_norm": 0.28162301519749217, |
|
"learning_rate": 3.7597701370921444e-05, |
|
"loss": 0.4621, |
|
"num_tokens": 645975251.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.1569965870307168, |
|
"grad_norm": 0.2861836614250746, |
|
"learning_rate": 3.7521980246494614e-05, |
|
"loss": 0.4521, |
|
"num_tokens": 647924628.0, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.1604095563139931, |
|
"grad_norm": 0.28785239642753035, |
|
"learning_rate": 3.744611725557753e-05, |
|
"loss": 0.4593, |
|
"num_tokens": 649791951.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.1638225255972696, |
|
"grad_norm": 0.27626617912281226, |
|
"learning_rate": 3.7370113472051406e-05, |
|
"loss": 0.4584, |
|
"num_tokens": 651657667.0, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.1672354948805461, |
|
"grad_norm": 0.3386332300169892, |
|
"learning_rate": 3.729396997179044e-05, |
|
"loss": 0.4629, |
|
"num_tokens": 653497824.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.1706484641638226, |
|
"grad_norm": 0.27240329301276556, |
|
"learning_rate": 3.72176878326466e-05, |
|
"loss": 0.4576, |
|
"num_tokens": 655459782.0, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.174061433447099, |
|
"grad_norm": 0.32392454797351683, |
|
"learning_rate": 3.714126813443435e-05, |
|
"loss": 0.4644, |
|
"num_tokens": 657369737.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.1774744027303754, |
|
"grad_norm": 0.29853720401881, |
|
"learning_rate": 3.706471195891541e-05, |
|
"loss": 0.4545, |
|
"num_tokens": 659396733.0, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.180887372013652, |
|
"grad_norm": 0.2773946756633204, |
|
"learning_rate": 3.698802038978337e-05, |
|
"loss": 0.4418, |
|
"num_tokens": 661291458.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.1843003412969284, |
|
"grad_norm": 0.2740460845448324, |
|
"learning_rate": 3.691119451264843e-05, |
|
"loss": 0.4698, |
|
"num_tokens": 663241121.0, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.1877133105802047, |
|
"grad_norm": 0.276547173388332, |
|
"learning_rate": 3.683423541502194e-05, |
|
"loss": 0.4442, |
|
"num_tokens": 665124115.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.1911262798634812, |
|
"grad_norm": 0.2903229118786473, |
|
"learning_rate": 3.675714418630111e-05, |
|
"loss": 0.4662, |
|
"num_tokens": 667040244.0, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.1945392491467577, |
|
"grad_norm": 0.27064420446562926, |
|
"learning_rate": 3.667992191775349e-05, |
|
"loss": 0.452, |
|
"num_tokens": 668907555.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.1979522184300342, |
|
"grad_norm": 0.28593868782908866, |
|
"learning_rate": 3.6602569702501604e-05, |
|
"loss": 0.4531, |
|
"num_tokens": 670890387.0, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.2013651877133107, |
|
"grad_norm": 0.2772025249561031, |
|
"learning_rate": 3.652508863550742e-05, |
|
"loss": 0.4463, |
|
"num_tokens": 672825173.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.204778156996587, |
|
"grad_norm": 0.2904532887461672, |
|
"learning_rate": 3.644747981355689e-05, |
|
"loss": 0.4559, |
|
"num_tokens": 674704748.0, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.2081911262798635, |
|
"grad_norm": 0.33272679002620287, |
|
"learning_rate": 3.636974433524439e-05, |
|
"loss": 0.442, |
|
"num_tokens": 676622447.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.21160409556314, |
|
"grad_norm": 0.3001225018723121, |
|
"learning_rate": 3.629188330095718e-05, |
|
"loss": 0.463, |
|
"num_tokens": 678387581.0, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.2150170648464165, |
|
"grad_norm": 0.2918057402571508, |
|
"learning_rate": 3.621389781285985e-05, |
|
"loss": 0.4652, |
|
"num_tokens": 680273514.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.2184300341296928, |
|
"grad_norm": 0.2715063996631153, |
|
"learning_rate": 3.61357889748787e-05, |
|
"loss": 0.4577, |
|
"num_tokens": 682257311.0, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.2218430034129693, |
|
"grad_norm": 0.29943640199690547, |
|
"learning_rate": 3.6057557892686086e-05, |
|
"loss": 0.4476, |
|
"num_tokens": 684116507.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.2252559726962458, |
|
"grad_norm": 0.30213895944964064, |
|
"learning_rate": 3.597920567368483e-05, |
|
"loss": 0.4545, |
|
"num_tokens": 685965611.0, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.2286689419795223, |
|
"grad_norm": 0.2850709767569452, |
|
"learning_rate": 3.590073342699248e-05, |
|
"loss": 0.4508, |
|
"num_tokens": 687988417.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.2320819112627985, |
|
"grad_norm": 0.28211304977263574, |
|
"learning_rate": 3.582214226342567e-05, |
|
"loss": 0.4528, |
|
"num_tokens": 689788689.0, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.235494880546075, |
|
"grad_norm": 0.30451801514288107, |
|
"learning_rate": 3.574343329548435e-05, |
|
"loss": 0.4684, |
|
"num_tokens": 691706998.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.2389078498293515, |
|
"grad_norm": 0.28161312286707335, |
|
"learning_rate": 3.566460763733606e-05, |
|
"loss": 0.4539, |
|
"num_tokens": 693650650.0, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.242320819112628, |
|
"grad_norm": 0.2855398246780951, |
|
"learning_rate": 3.5585666404800136e-05, |
|
"loss": 0.4481, |
|
"num_tokens": 695583845.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.2457337883959045, |
|
"grad_norm": 0.2685294618091198, |
|
"learning_rate": 3.5506610715331945e-05, |
|
"loss": 0.4454, |
|
"num_tokens": 697598560.0, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.2491467576791808, |
|
"grad_norm": 0.30253366081342353, |
|
"learning_rate": 3.5427441688007056e-05, |
|
"loss": 0.4248, |
|
"num_tokens": 699412264.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.2525597269624573, |
|
"grad_norm": 0.26768192457715556, |
|
"learning_rate": 3.534816044350539e-05, |
|
"loss": 0.4483, |
|
"num_tokens": 701334667.0, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.2559726962457338, |
|
"grad_norm": 0.27314950895844187, |
|
"learning_rate": 3.5268768104095365e-05, |
|
"loss": 0.4314, |
|
"num_tokens": 703234238.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.25938566552901, |
|
"grad_norm": 0.2796629207217079, |
|
"learning_rate": 3.5189265793618e-05, |
|
"loss": 0.4484, |
|
"num_tokens": 705158041.0, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.2627986348122868, |
|
"grad_norm": 0.29048618003370963, |
|
"learning_rate": 3.510965463747103e-05, |
|
"loss": 0.4574, |
|
"num_tokens": 707047974.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.266211604095563, |
|
"grad_norm": 0.2824168313312133, |
|
"learning_rate": 3.5029935762592935e-05, |
|
"loss": 0.4531, |
|
"num_tokens": 708901930.0, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.2696245733788396, |
|
"grad_norm": 0.2950742991938956, |
|
"learning_rate": 3.495011029744703e-05, |
|
"loss": 0.4501, |
|
"num_tokens": 710871136.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.273037542662116, |
|
"grad_norm": 0.2637357244615578, |
|
"learning_rate": 3.4870179372005466e-05, |
|
"loss": 0.4409, |
|
"num_tokens": 712727010.0, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.2764505119453924, |
|
"grad_norm": 0.27499001477652085, |
|
"learning_rate": 3.4790144117733234e-05, |
|
"loss": 0.4573, |
|
"num_tokens": 714757788.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.2798634812286689, |
|
"grad_norm": 0.2786231655621766, |
|
"learning_rate": 3.471000566757216e-05, |
|
"loss": 0.4451, |
|
"num_tokens": 716648944.0, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.2832764505119454, |
|
"grad_norm": 0.28497779850769467, |
|
"learning_rate": 3.462976515592487e-05, |
|
"loss": 0.4458, |
|
"num_tokens": 718518526.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.286689419795222, |
|
"grad_norm": 0.27321208726567636, |
|
"learning_rate": 3.454942371863873e-05, |
|
"loss": 0.4513, |
|
"num_tokens": 720498397.0, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.2901023890784984, |
|
"grad_norm": 0.2576256316628026, |
|
"learning_rate": 3.4468982492989746e-05, |
|
"loss": 0.4442, |
|
"num_tokens": 722394245.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.2935153583617747, |
|
"grad_norm": 0.26515887609716976, |
|
"learning_rate": 3.438844261766648e-05, |
|
"loss": 0.4603, |
|
"num_tokens": 724278291.0, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.2969283276450512, |
|
"grad_norm": 0.3040373005342316, |
|
"learning_rate": 3.4307805232753945e-05, |
|
"loss": 0.4613, |
|
"num_tokens": 726250325.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.3003412969283277, |
|
"grad_norm": 0.31517718265213174, |
|
"learning_rate": 3.4227071479717445e-05, |
|
"loss": 0.4654, |
|
"num_tokens": 728217806.0, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.3037542662116042, |
|
"grad_norm": 0.2605190400337086, |
|
"learning_rate": 3.414624250138645e-05, |
|
"loss": 0.4509, |
|
"num_tokens": 730069746.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.3071672354948807, |
|
"grad_norm": 0.29826014855147326, |
|
"learning_rate": 3.4065319441938355e-05, |
|
"loss": 0.452, |
|
"num_tokens": 731992759.0, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.310580204778157, |
|
"grad_norm": 0.28635184919521356, |
|
"learning_rate": 3.398430344688235e-05, |
|
"loss": 0.4537, |
|
"num_tokens": 733866439.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.3139931740614335, |
|
"grad_norm": 0.29234660944718105, |
|
"learning_rate": 3.390319566304319e-05, |
|
"loss": 0.4589, |
|
"num_tokens": 735858853.0, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.31740614334471, |
|
"grad_norm": 0.2962339298082799, |
|
"learning_rate": 3.3821997238544916e-05, |
|
"loss": 0.4558, |
|
"num_tokens": 737751290.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.3208191126279862, |
|
"grad_norm": 0.3070070098211661, |
|
"learning_rate": 3.374070932279465e-05, |
|
"loss": 0.4543, |
|
"num_tokens": 739700530.0, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.3242320819112627, |
|
"grad_norm": 0.4850349267925565, |
|
"learning_rate": 3.365933306646633e-05, |
|
"loss": 0.4639, |
|
"num_tokens": 741654310.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.3276450511945392, |
|
"grad_norm": 0.27351892148927176, |
|
"learning_rate": 3.357786962148437e-05, |
|
"loss": 0.4313, |
|
"num_tokens": 743544844.0, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.3310580204778157, |
|
"grad_norm": 0.2694056885578761, |
|
"learning_rate": 3.3496320141007406e-05, |
|
"loss": 0.4571, |
|
"num_tokens": 745423520.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.3344709897610922, |
|
"grad_norm": 0.2728262843326819, |
|
"learning_rate": 3.3414685779411945e-05, |
|
"loss": 0.4556, |
|
"num_tokens": 747401769.0, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.3378839590443685, |
|
"grad_norm": 0.30994439761647724, |
|
"learning_rate": 3.333296769227604e-05, |
|
"loss": 0.4587, |
|
"num_tokens": 749294404.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.341296928327645, |
|
"grad_norm": 0.29334073529273996, |
|
"learning_rate": 3.3251167036362915e-05, |
|
"loss": 0.4471, |
|
"num_tokens": 751251487.0, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.3447098976109215, |
|
"grad_norm": 0.33180165168065484, |
|
"learning_rate": 3.31692849696046e-05, |
|
"loss": 0.4487, |
|
"num_tokens": 753055155.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.348122866894198, |
|
"grad_norm": 0.2993222027921042, |
|
"learning_rate": 3.3087322651085554e-05, |
|
"loss": 0.4672, |
|
"num_tokens": 755096420.0, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.3515358361774745, |
|
"grad_norm": 0.2801645429736596, |
|
"learning_rate": 3.3005281241026215e-05, |
|
"loss": 0.4415, |
|
"num_tokens": 756939622.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.3549488054607508, |
|
"grad_norm": 0.2745528396247091, |
|
"learning_rate": 3.2923161900766614e-05, |
|
"loss": 0.4399, |
|
"num_tokens": 758793302.0, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.3583617747440273, |
|
"grad_norm": 0.2612966118749133, |
|
"learning_rate": 3.284096579274995e-05, |
|
"loss": 0.4447, |
|
"num_tokens": 760675723.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.3617747440273038, |
|
"grad_norm": 0.270210002371754, |
|
"learning_rate": 3.275869408050608e-05, |
|
"loss": 0.4503, |
|
"num_tokens": 762483433.0, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.36518771331058, |
|
"grad_norm": 0.2680635125777971, |
|
"learning_rate": 3.267634792863509e-05, |
|
"loss": 0.4574, |
|
"num_tokens": 764466963.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3686006825938566, |
|
"grad_norm": 0.286309419549809, |
|
"learning_rate": 3.259392850279082e-05, |
|
"loss": 0.4449, |
|
"num_tokens": 766382996.0, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.372013651877133, |
|
"grad_norm": 0.2740692290659991, |
|
"learning_rate": 3.2511436969664284e-05, |
|
"loss": 0.4541, |
|
"num_tokens": 768293330.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.3754266211604096, |
|
"grad_norm": 0.2690578067704949, |
|
"learning_rate": 3.2428874496967274e-05, |
|
"loss": 0.455, |
|
"num_tokens": 770247322.0, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.378839590443686, |
|
"grad_norm": 0.2563408299269977, |
|
"learning_rate": 3.234624225341575e-05, |
|
"loss": 0.4459, |
|
"num_tokens": 772205337.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.3822525597269624, |
|
"grad_norm": 0.25655513271782726, |
|
"learning_rate": 3.22635414087133e-05, |
|
"loss": 0.4658, |
|
"num_tokens": 774217733.0, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.3856655290102389, |
|
"grad_norm": 0.29060509201295237, |
|
"learning_rate": 3.218077313353462e-05, |
|
"loss": 0.4488, |
|
"num_tokens": 776079090.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.3890784982935154, |
|
"grad_norm": 0.2720186770597433, |
|
"learning_rate": 3.20979385995089e-05, |
|
"loss": 0.442, |
|
"num_tokens": 777982393.0, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.3924914675767919, |
|
"grad_norm": 0.2861264692108029, |
|
"learning_rate": 3.201503897920327e-05, |
|
"loss": 0.4496, |
|
"num_tokens": 779989886.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.3959044368600684, |
|
"grad_norm": 0.29707981121890303, |
|
"learning_rate": 3.193207544610621e-05, |
|
"loss": 0.4432, |
|
"num_tokens": 781895136.0, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.3993174061433447, |
|
"grad_norm": 0.27309938834165404, |
|
"learning_rate": 3.184904917461088e-05, |
|
"loss": 0.4563, |
|
"num_tokens": 783942187.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.4027303754266212, |
|
"grad_norm": 0.2893396465962206, |
|
"learning_rate": 3.1765961339998565e-05, |
|
"loss": 0.4468, |
|
"num_tokens": 785853334.0, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.4061433447098977, |
|
"grad_norm": 0.2896324408589564, |
|
"learning_rate": 3.1682813118422e-05, |
|
"loss": 0.4484, |
|
"num_tokens": 787909060.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.409556313993174, |
|
"grad_norm": 0.26774630794325843, |
|
"learning_rate": 3.159960568688872e-05, |
|
"loss": 0.4507, |
|
"num_tokens": 789874438.0, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.4129692832764504, |
|
"grad_norm": 0.3167509710093353, |
|
"learning_rate": 3.151634022324444e-05, |
|
"loss": 0.4599, |
|
"num_tokens": 791709412.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.416382252559727, |
|
"grad_norm": 0.27632081551359566, |
|
"learning_rate": 3.1433017906156316e-05, |
|
"loss": 0.4429, |
|
"num_tokens": 793521730.0, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.4197952218430034, |
|
"grad_norm": 0.2766678534542702, |
|
"learning_rate": 3.134963991509631e-05, |
|
"loss": 0.4613, |
|
"num_tokens": 795344918.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.42320819112628, |
|
"grad_norm": 0.28361755145893136, |
|
"learning_rate": 3.126620743032447e-05, |
|
"loss": 0.4467, |
|
"num_tokens": 797189457.0, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.4266211604095562, |
|
"grad_norm": 0.27697812269141797, |
|
"learning_rate": 3.1182721632872254e-05, |
|
"loss": 0.436, |
|
"num_tokens": 799164609.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.4300341296928327, |
|
"grad_norm": 0.2754786232171963, |
|
"learning_rate": 3.109918370452575e-05, |
|
"loss": 0.445, |
|
"num_tokens": 801159658.0, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.4334470989761092, |
|
"grad_norm": 0.2604825606354588, |
|
"learning_rate": 3.101559482780903e-05, |
|
"loss": 0.4408, |
|
"num_tokens": 803102119.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.4368600682593857, |
|
"grad_norm": 0.27648906929611866, |
|
"learning_rate": 3.093195618596735e-05, |
|
"loss": 0.4466, |
|
"num_tokens": 805178655.0, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.4402730375426622, |
|
"grad_norm": 0.265092750892, |
|
"learning_rate": 3.084826896295041e-05, |
|
"loss": 0.4473, |
|
"num_tokens": 807040305.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.4436860068259385, |
|
"grad_norm": 0.2722185832102206, |
|
"learning_rate": 3.07645343433956e-05, |
|
"loss": 0.4669, |
|
"num_tokens": 808944713.0, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.447098976109215, |
|
"grad_norm": 0.28414946820217796, |
|
"learning_rate": 3.068075351261126e-05, |
|
"loss": 0.4374, |
|
"num_tokens": 810867962.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.4505119453924915, |
|
"grad_norm": 0.2650239217756763, |
|
"learning_rate": 3.0596927656559834e-05, |
|
"loss": 0.4422, |
|
"num_tokens": 812796180.0, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.4539249146757678, |
|
"grad_norm": 0.28516975016618823, |
|
"learning_rate": 3.0513057961841175e-05, |
|
"loss": 0.4569, |
|
"num_tokens": 814693135.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.4573378839590443, |
|
"grad_norm": 0.2531012712568243, |
|
"learning_rate": 3.042914561567563e-05, |
|
"loss": 0.4414, |
|
"num_tokens": 816615984.0, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.4607508532423208, |
|
"grad_norm": 0.3105010778606175, |
|
"learning_rate": 3.0345191805887367e-05, |
|
"loss": 0.4502, |
|
"num_tokens": 818449910.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.4641638225255973, |
|
"grad_norm": 0.3002226245659818, |
|
"learning_rate": 3.0261197720887457e-05, |
|
"loss": 0.4257, |
|
"num_tokens": 820318972.0, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.4675767918088738, |
|
"grad_norm": 0.2890933430333858, |
|
"learning_rate": 3.017716454965708e-05, |
|
"loss": 0.4578, |
|
"num_tokens": 822330354.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.47098976109215, |
|
"grad_norm": 0.2740990659865864, |
|
"learning_rate": 3.0093093481730723e-05, |
|
"loss": 0.4485, |
|
"num_tokens": 824181338.0, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.4744027303754266, |
|
"grad_norm": 0.2649593030398301, |
|
"learning_rate": 3.0008985707179326e-05, |
|
"loss": 0.4417, |
|
"num_tokens": 826004744.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.477815699658703, |
|
"grad_norm": 0.28281137311210414, |
|
"learning_rate": 2.9924842416593406e-05, |
|
"loss": 0.4515, |
|
"num_tokens": 827894923.0, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.4812286689419796, |
|
"grad_norm": 0.2759663786994046, |
|
"learning_rate": 2.9840664801066247e-05, |
|
"loss": 0.4499, |
|
"num_tokens": 829837206.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.484641638225256, |
|
"grad_norm": 0.2580667059210538, |
|
"learning_rate": 2.9756454052177012e-05, |
|
"loss": 0.4573, |
|
"num_tokens": 831686870.0, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.4880546075085324, |
|
"grad_norm": 0.2575391507719894, |
|
"learning_rate": 2.96722113619739e-05, |
|
"loss": 0.4392, |
|
"num_tokens": 833569477.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.4914675767918089, |
|
"grad_norm": 0.27408399127884947, |
|
"learning_rate": 2.9587937922957233e-05, |
|
"loss": 0.4452, |
|
"num_tokens": 835500719.0, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.4948805460750854, |
|
"grad_norm": 0.2615339855136178, |
|
"learning_rate": 2.950363492806262e-05, |
|
"loss": 0.4708, |
|
"num_tokens": 837484919.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.4982935153583616, |
|
"grad_norm": 0.25621570699947277, |
|
"learning_rate": 2.941930357064402e-05, |
|
"loss": 0.4501, |
|
"num_tokens": 839486730.0, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.5017064846416384, |
|
"grad_norm": 0.2756773591629605, |
|
"learning_rate": 2.9334945044456923e-05, |
|
"loss": 0.452, |
|
"num_tokens": 841489715.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.5051194539249146, |
|
"grad_norm": 0.27466282153404803, |
|
"learning_rate": 2.925056054364137e-05, |
|
"loss": 0.4485, |
|
"num_tokens": 843365250.0, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.5085324232081911, |
|
"grad_norm": 0.2804813346164974, |
|
"learning_rate": 2.9166151262705105e-05, |
|
"loss": 0.4408, |
|
"num_tokens": 845275912.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.5119453924914676, |
|
"grad_norm": 0.2595259256898971, |
|
"learning_rate": 2.9081718396506635e-05, |
|
"loss": 0.4429, |
|
"num_tokens": 847232214.0, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.515358361774744, |
|
"grad_norm": 0.2663724850178613, |
|
"learning_rate": 2.8997263140238346e-05, |
|
"loss": 0.4495, |
|
"num_tokens": 848987167.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.5187713310580204, |
|
"grad_norm": 0.2589624660607149, |
|
"learning_rate": 2.8912786689409556e-05, |
|
"loss": 0.4464, |
|
"num_tokens": 850922570.0, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.522184300341297, |
|
"grad_norm": 0.2656741346606789, |
|
"learning_rate": 2.88282902398296e-05, |
|
"loss": 0.4517, |
|
"num_tokens": 852822624.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.5255972696245734, |
|
"grad_norm": 0.293689264474386, |
|
"learning_rate": 2.8743774987590916e-05, |
|
"loss": 0.4529, |
|
"num_tokens": 854740425.0, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.52901023890785, |
|
"grad_norm": 0.27142432002975353, |
|
"learning_rate": 2.8659242129052093e-05, |
|
"loss": 0.4414, |
|
"num_tokens": 856603546.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.5324232081911262, |
|
"grad_norm": 0.267870113436132, |
|
"learning_rate": 2.8574692860820974e-05, |
|
"loss": 0.4596, |
|
"num_tokens": 858627881.0, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.5358361774744027, |
|
"grad_norm": 0.26103422109658747, |
|
"learning_rate": 2.849012837973764e-05, |
|
"loss": 0.4475, |
|
"num_tokens": 860521721.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.5392491467576792, |
|
"grad_norm": 0.2601926950553868, |
|
"learning_rate": 2.840554988285755e-05, |
|
"loss": 0.4484, |
|
"num_tokens": 862459060.0, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.5426621160409555, |
|
"grad_norm": 0.27246673405743077, |
|
"learning_rate": 2.8320958567434585e-05, |
|
"loss": 0.447, |
|
"num_tokens": 864416686.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.5460750853242322, |
|
"grad_norm": 0.2881598506293941, |
|
"learning_rate": 2.8236355630904037e-05, |
|
"loss": 0.45, |
|
"num_tokens": 866372213.0, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.5494880546075085, |
|
"grad_norm": 0.28066610110205614, |
|
"learning_rate": 2.8151742270865722e-05, |
|
"loss": 0.4593, |
|
"num_tokens": 868286031.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.552901023890785, |
|
"grad_norm": 0.2862420813113712, |
|
"learning_rate": 2.8067119685067e-05, |
|
"loss": 0.4614, |
|
"num_tokens": 870215475.0, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.5563139931740615, |
|
"grad_norm": 0.2554540585046952, |
|
"learning_rate": 2.798248907138584e-05, |
|
"loss": 0.4484, |
|
"num_tokens": 872204026.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.5597269624573378, |
|
"grad_norm": 0.26746236600835344, |
|
"learning_rate": 2.7897851627813836e-05, |
|
"loss": 0.4508, |
|
"num_tokens": 874200553.0, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.5631399317406145, |
|
"grad_norm": 0.2591829699984779, |
|
"learning_rate": 2.7813208552439257e-05, |
|
"loss": 0.4486, |
|
"num_tokens": 876046353.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.5665529010238908, |
|
"grad_norm": 0.2792969836284745, |
|
"learning_rate": 2.7728561043430118e-05, |
|
"loss": 0.456, |
|
"num_tokens": 877975265.0, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.5699658703071673, |
|
"grad_norm": 0.2861082160674022, |
|
"learning_rate": 2.7643910299017168e-05, |
|
"loss": 0.4399, |
|
"num_tokens": 879857040.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.5733788395904438, |
|
"grad_norm": 0.26577617584751645, |
|
"learning_rate": 2.7559257517476972e-05, |
|
"loss": 0.4584, |
|
"num_tokens": 881775297.0, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.57679180887372, |
|
"grad_norm": 0.304890853130377, |
|
"learning_rate": 2.747460389711492e-05, |
|
"loss": 0.46, |
|
"num_tokens": 883694768.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.5802047781569966, |
|
"grad_norm": 0.27264196953878284, |
|
"learning_rate": 2.7389950636248284e-05, |
|
"loss": 0.4627, |
|
"num_tokens": 885606244.0, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.583617747440273, |
|
"grad_norm": 0.26079561956352626, |
|
"learning_rate": 2.7305298933189255e-05, |
|
"loss": 0.4583, |
|
"num_tokens": 887457723.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.5870307167235493, |
|
"grad_norm": 0.24807936020422583, |
|
"learning_rate": 2.7220649986227964e-05, |
|
"loss": 0.4417, |
|
"num_tokens": 889409099.0, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.590443686006826, |
|
"grad_norm": 0.2651021999880332, |
|
"learning_rate": 2.7136004993615505e-05, |
|
"loss": 0.4465, |
|
"num_tokens": 891339953.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.5938566552901023, |
|
"grad_norm": 0.26971209715318745, |
|
"learning_rate": 2.7051365153547027e-05, |
|
"loss": 0.4424, |
|
"num_tokens": 893325476.0, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.5972696245733788, |
|
"grad_norm": 0.28339562896194187, |
|
"learning_rate": 2.6966731664144733e-05, |
|
"loss": 0.4451, |
|
"num_tokens": 895188323.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.6006825938566553, |
|
"grad_norm": 0.27171867664083665, |
|
"learning_rate": 2.688210572344095e-05, |
|
"loss": 0.4437, |
|
"num_tokens": 897121692.0, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.6040955631399316, |
|
"grad_norm": 0.2813986180164497, |
|
"learning_rate": 2.6797488529361093e-05, |
|
"loss": 0.4447, |
|
"num_tokens": 899043370.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.6075085324232083, |
|
"grad_norm": 0.271610888053345, |
|
"learning_rate": 2.6712881279706814e-05, |
|
"loss": 0.4523, |
|
"num_tokens": 900952717.0, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.6109215017064846, |
|
"grad_norm": 0.28993349676639346, |
|
"learning_rate": 2.662828517213899e-05, |
|
"loss": 0.4467, |
|
"num_tokens": 902743085.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.6143344709897611, |
|
"grad_norm": 0.27680179119800896, |
|
"learning_rate": 2.6543701404160748e-05, |
|
"loss": 0.4453, |
|
"num_tokens": 904703890.0, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.6177474402730376, |
|
"grad_norm": 0.28092901036185763, |
|
"learning_rate": 2.645913117310057e-05, |
|
"loss": 0.4538, |
|
"num_tokens": 906627984.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.621160409556314, |
|
"grad_norm": 0.26517844884302066, |
|
"learning_rate": 2.637457567609531e-05, |
|
"loss": 0.4469, |
|
"num_tokens": 908570138.0, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.6245733788395904, |
|
"grad_norm": 0.26409207116238576, |
|
"learning_rate": 2.6290036110073242e-05, |
|
"loss": 0.4469, |
|
"num_tokens": 910543342.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.627986348122867, |
|
"grad_norm": 0.2583484326247722, |
|
"learning_rate": 2.6205513671737135e-05, |
|
"loss": 0.4468, |
|
"num_tokens": 912533770.0, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.6313993174061432, |
|
"grad_norm": 0.24446056976219688, |
|
"learning_rate": 2.612100955754731e-05, |
|
"loss": 0.4469, |
|
"num_tokens": 914532283.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.63481228668942, |
|
"grad_norm": 0.2693898345306616, |
|
"learning_rate": 2.6036524963704705e-05, |
|
"loss": 0.4428, |
|
"num_tokens": 916447270.0, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.6382252559726962, |
|
"grad_norm": 0.2603116651063788, |
|
"learning_rate": 2.5952061086133915e-05, |
|
"loss": 0.4372, |
|
"num_tokens": 918323773.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.6416382252559727, |
|
"grad_norm": 0.28162088278006275, |
|
"learning_rate": 2.58676191204663e-05, |
|
"loss": 0.4453, |
|
"num_tokens": 920259747.0, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.6450511945392492, |
|
"grad_norm": 0.2710337541109652, |
|
"learning_rate": 2.578320026202306e-05, |
|
"loss": 0.4364, |
|
"num_tokens": 922162828.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.6484641638225255, |
|
"grad_norm": 0.27020261434441295, |
|
"learning_rate": 2.5698805705798273e-05, |
|
"loss": 0.4507, |
|
"num_tokens": 924056073.0, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.6518771331058022, |
|
"grad_norm": 0.2634733100798534, |
|
"learning_rate": 2.5614436646442015e-05, |
|
"loss": 0.4519, |
|
"num_tokens": 925974493.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.6552901023890785, |
|
"grad_norm": 0.255269239584084, |
|
"learning_rate": 2.553009427824345e-05, |
|
"loss": 0.4385, |
|
"num_tokens": 927926823.0, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.658703071672355, |
|
"grad_norm": 0.2853510934721726, |
|
"learning_rate": 2.54457797951139e-05, |
|
"loss": 0.436, |
|
"num_tokens": 929863736.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.6621160409556315, |
|
"grad_norm": 0.27590699133483987, |
|
"learning_rate": 2.5361494390569973e-05, |
|
"loss": 0.4383, |
|
"num_tokens": 931699965.0, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.6655290102389078, |
|
"grad_norm": 0.2672913591423428, |
|
"learning_rate": 2.527723925771664e-05, |
|
"loss": 0.4315, |
|
"num_tokens": 933525777.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.6689419795221843, |
|
"grad_norm": 0.26586385140122987, |
|
"learning_rate": 2.5193015589230374e-05, |
|
"loss": 0.4492, |
|
"num_tokens": 935357508.0, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.6723549488054608, |
|
"grad_norm": 0.26155221487403196, |
|
"learning_rate": 2.5108824577342243e-05, |
|
"loss": 0.4324, |
|
"num_tokens": 937184859.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.675767918088737, |
|
"grad_norm": 0.2604660508130615, |
|
"learning_rate": 2.502466741382105e-05, |
|
"loss": 0.4467, |
|
"num_tokens": 939164719.0, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.6791808873720138, |
|
"grad_norm": 0.2672255765255275, |
|
"learning_rate": 2.494054528995644e-05, |
|
"loss": 0.4403, |
|
"num_tokens": 941017477.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.68259385665529, |
|
"grad_norm": 0.2584274462980594, |
|
"learning_rate": 2.4856459396542092e-05, |
|
"loss": 0.4421, |
|
"num_tokens": 942907050.0, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.6860068259385665, |
|
"grad_norm": 0.24720970294975506, |
|
"learning_rate": 2.477241092385877e-05, |
|
"loss": 0.4418, |
|
"num_tokens": 944808236.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.689419795221843, |
|
"grad_norm": 0.26472526826866016, |
|
"learning_rate": 2.4688401061657563e-05, |
|
"loss": 0.4381, |
|
"num_tokens": 946840703.0, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.6928327645051193, |
|
"grad_norm": 5.0030305349934885, |
|
"learning_rate": 2.4604430999143002e-05, |
|
"loss": 0.4506, |
|
"num_tokens": 948769790.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.696245733788396, |
|
"grad_norm": 0.2680033634853091, |
|
"learning_rate": 2.452050192495624e-05, |
|
"loss": 0.4335, |
|
"num_tokens": 950570738.0, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 1.6996587030716723, |
|
"grad_norm": 0.2839588999603886, |
|
"learning_rate": 2.4436615027158194e-05, |
|
"loss": 0.4463, |
|
"num_tokens": 952490894.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.7030716723549488, |
|
"grad_norm": 0.26446203164353277, |
|
"learning_rate": 2.4352771493212763e-05, |
|
"loss": 0.4532, |
|
"num_tokens": 954445294.0, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 1.7064846416382253, |
|
"grad_norm": 0.2706578896879531, |
|
"learning_rate": 2.4268972509970027e-05, |
|
"loss": 0.438, |
|
"num_tokens": 956373551.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.7098976109215016, |
|
"grad_norm": 0.2510763849552845, |
|
"learning_rate": 2.4185219263649402e-05, |
|
"loss": 0.4369, |
|
"num_tokens": 958195010.0, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 1.713310580204778, |
|
"grad_norm": 0.2833747779105397, |
|
"learning_rate": 2.4101512939822875e-05, |
|
"loss": 0.4416, |
|
"num_tokens": 960041529.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.7167235494880546, |
|
"grad_norm": 0.26481804710093265, |
|
"learning_rate": 2.401785472339823e-05, |
|
"loss": 0.4389, |
|
"num_tokens": 961954801.0, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 1.7201365187713311, |
|
"grad_norm": 0.2647677324608605, |
|
"learning_rate": 2.393424579860228e-05, |
|
"loss": 0.4381, |
|
"num_tokens": 963894044.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.7235494880546076, |
|
"grad_norm": 0.26943728181282595, |
|
"learning_rate": 2.385068734896404e-05, |
|
"loss": 0.4419, |
|
"num_tokens": 965758747.0, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.726962457337884, |
|
"grad_norm": 0.2594300124417837, |
|
"learning_rate": 2.3767180557298074e-05, |
|
"loss": 0.4383, |
|
"num_tokens": 967735319.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.7303754266211604, |
|
"grad_norm": 0.2515723573014202, |
|
"learning_rate": 2.368372660568768e-05, |
|
"loss": 0.421, |
|
"num_tokens": 969551079.0, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 1.733788395904437, |
|
"grad_norm": 0.2593132229076659, |
|
"learning_rate": 2.3600326675468158e-05, |
|
"loss": 0.4351, |
|
"num_tokens": 971469515.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.7372013651877132, |
|
"grad_norm": 0.24852875972861324, |
|
"learning_rate": 2.351698194721013e-05, |
|
"loss": 0.4508, |
|
"num_tokens": 973400224.0, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 1.74061433447099, |
|
"grad_norm": 0.2894838158575043, |
|
"learning_rate": 2.343369360070281e-05, |
|
"loss": 0.4632, |
|
"num_tokens": 975250566.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.7440273037542662, |
|
"grad_norm": 0.25922772428222707, |
|
"learning_rate": 2.335046281493728e-05, |
|
"loss": 0.4327, |
|
"num_tokens": 977140229.0, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 1.7474402730375427, |
|
"grad_norm": 0.26168920953623176, |
|
"learning_rate": 2.326729076808981e-05, |
|
"loss": 0.4477, |
|
"num_tokens": 979159476.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.7508532423208192, |
|
"grad_norm": 0.26615040708871945, |
|
"learning_rate": 2.3184178637505227e-05, |
|
"loss": 0.4375, |
|
"num_tokens": 981111783.0, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 1.7542662116040955, |
|
"grad_norm": 0.25291556202610044, |
|
"learning_rate": 2.310112759968018e-05, |
|
"loss": 0.4464, |
|
"num_tokens": 983196600.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.757679180887372, |
|
"grad_norm": 0.2695038308858752, |
|
"learning_rate": 2.3018138830246516e-05, |
|
"loss": 0.4482, |
|
"num_tokens": 985104722.0, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.7610921501706485, |
|
"grad_norm": 0.26848491423115955, |
|
"learning_rate": 2.2935213503954662e-05, |
|
"loss": 0.4522, |
|
"num_tokens": 986930750.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.764505119453925, |
|
"grad_norm": 0.25839245304492914, |
|
"learning_rate": 2.285235279465696e-05, |
|
"loss": 0.4427, |
|
"num_tokens": 988686155.0, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 1.7679180887372015, |
|
"grad_norm": 0.2399595630712539, |
|
"learning_rate": 2.2769557875291063e-05, |
|
"loss": 0.4424, |
|
"num_tokens": 990711608.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.7713310580204777, |
|
"grad_norm": 0.2544879611742542, |
|
"learning_rate": 2.2686829917863333e-05, |
|
"loss": 0.4441, |
|
"num_tokens": 992694774.0, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 1.7747440273037542, |
|
"grad_norm": 0.2775597763576913, |
|
"learning_rate": 2.2604170093432255e-05, |
|
"loss": 0.437, |
|
"num_tokens": 994568102.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.7781569965870307, |
|
"grad_norm": 0.26757996732416056, |
|
"learning_rate": 2.252157957209185e-05, |
|
"loss": 0.4339, |
|
"num_tokens": 996378479.0, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 1.781569965870307, |
|
"grad_norm": 0.2636448817949683, |
|
"learning_rate": 2.2439059522955107e-05, |
|
"loss": 0.4325, |
|
"num_tokens": 998156090.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.7849829351535837, |
|
"grad_norm": 0.275460716899482, |
|
"learning_rate": 2.2356611114137465e-05, |
|
"loss": 0.4354, |
|
"num_tokens": 999976607.0, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 1.78839590443686, |
|
"grad_norm": 0.2625063656499445, |
|
"learning_rate": 2.2274235512740248e-05, |
|
"loss": 0.4327, |
|
"num_tokens": 1001825873.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.7918088737201365, |
|
"grad_norm": 0.28402951172695334, |
|
"learning_rate": 2.2191933884834148e-05, |
|
"loss": 0.4597, |
|
"num_tokens": 1003678305.0, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.795221843003413, |
|
"grad_norm": 0.2602751829566075, |
|
"learning_rate": 2.2109707395442714e-05, |
|
"loss": 0.4368, |
|
"num_tokens": 1005516007.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.7986348122866893, |
|
"grad_norm": 0.26647070338777407, |
|
"learning_rate": 2.2027557208525883e-05, |
|
"loss": 0.4428, |
|
"num_tokens": 1007344786.0, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 1.802047781569966, |
|
"grad_norm": 0.26794683193288027, |
|
"learning_rate": 2.194548448696349e-05, |
|
"loss": 0.4362, |
|
"num_tokens": 1009075212.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.8054607508532423, |
|
"grad_norm": 0.27816074997567525, |
|
"learning_rate": 2.1863490392538816e-05, |
|
"loss": 0.4438, |
|
"num_tokens": 1010808416.0, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 1.8088737201365188, |
|
"grad_norm": 0.27463994949053044, |
|
"learning_rate": 2.1781576085922083e-05, |
|
"loss": 0.4512, |
|
"num_tokens": 1012683694.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.8122866894197953, |
|
"grad_norm": 0.2707994505036392, |
|
"learning_rate": 2.1699742726654132e-05, |
|
"loss": 0.4376, |
|
"num_tokens": 1014601234.0, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 1.8156996587030716, |
|
"grad_norm": 0.2627965386697017, |
|
"learning_rate": 2.161799147312994e-05, |
|
"loss": 0.439, |
|
"num_tokens": 1016448826.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.819112627986348, |
|
"grad_norm": 0.2522129136280826, |
|
"learning_rate": 2.15363234825822e-05, |
|
"loss": 0.4358, |
|
"num_tokens": 1018365951.0, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 1.8225255972696246, |
|
"grad_norm": 0.252384471077172, |
|
"learning_rate": 2.1454739911065002e-05, |
|
"loss": 0.441, |
|
"num_tokens": 1020273196.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.8259385665529009, |
|
"grad_norm": 0.24823623801093808, |
|
"learning_rate": 2.137324191343743e-05, |
|
"loss": 0.4287, |
|
"num_tokens": 1022225624.0, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.8293515358361776, |
|
"grad_norm": 0.27365445309933684, |
|
"learning_rate": 2.129183064334725e-05, |
|
"loss": 0.4374, |
|
"num_tokens": 1024124126.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.8327645051194539, |
|
"grad_norm": 0.265169099623179, |
|
"learning_rate": 2.1210507253214495e-05, |
|
"loss": 0.4512, |
|
"num_tokens": 1026116910.0, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.8361774744027304, |
|
"grad_norm": 0.2687843686775235, |
|
"learning_rate": 2.1129272894215262e-05, |
|
"loss": 0.4411, |
|
"num_tokens": 1027954118.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.8395904436860069, |
|
"grad_norm": 0.2609011147767087, |
|
"learning_rate": 2.1048128716265357e-05, |
|
"loss": 0.4379, |
|
"num_tokens": 1029884450.0, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 1.8430034129692832, |
|
"grad_norm": 0.27599227786573743, |
|
"learning_rate": 2.0967075868003995e-05, |
|
"loss": 0.4527, |
|
"num_tokens": 1031794352.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.8464163822525599, |
|
"grad_norm": 0.2343269626270317, |
|
"learning_rate": 2.0886115496777598e-05, |
|
"loss": 0.4405, |
|
"num_tokens": 1033740753.0, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 1.8498293515358362, |
|
"grad_norm": 0.26453037363335735, |
|
"learning_rate": 2.0805248748623528e-05, |
|
"loss": 0.4443, |
|
"num_tokens": 1035620201.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.8532423208191127, |
|
"grad_norm": 0.2536404978918816, |
|
"learning_rate": 2.0724476768253854e-05, |
|
"loss": 0.453, |
|
"num_tokens": 1037586913.0, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 1.8566552901023892, |
|
"grad_norm": 0.2544903502546627, |
|
"learning_rate": 2.064380069903914e-05, |
|
"loss": 0.4447, |
|
"num_tokens": 1039498293.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.8600682593856654, |
|
"grad_norm": 0.2702028649002257, |
|
"learning_rate": 2.0563221682992305e-05, |
|
"loss": 0.4411, |
|
"num_tokens": 1041470501.0, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 1.863481228668942, |
|
"grad_norm": 0.26562312719614667, |
|
"learning_rate": 2.048274086075242e-05, |
|
"loss": 0.4429, |
|
"num_tokens": 1043288032.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.8668941979522184, |
|
"grad_norm": 0.24219783619847288, |
|
"learning_rate": 2.0402359371568554e-05, |
|
"loss": 0.4485, |
|
"num_tokens": 1045205381.0, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 1.8703071672354947, |
|
"grad_norm": 0.2506743431312461, |
|
"learning_rate": 2.0322078353283676e-05, |
|
"loss": 0.4404, |
|
"num_tokens": 1047143820.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.8737201365187715, |
|
"grad_norm": 0.2586737586940106, |
|
"learning_rate": 2.0241898942318538e-05, |
|
"loss": 0.4387, |
|
"num_tokens": 1049056564.0, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 1.8771331058020477, |
|
"grad_norm": 0.26391954870509793, |
|
"learning_rate": 2.016182227365559e-05, |
|
"loss": 0.4453, |
|
"num_tokens": 1051032103.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.8805460750853242, |
|
"grad_norm": 0.2564856135718643, |
|
"learning_rate": 2.0081849480822896e-05, |
|
"loss": 0.4338, |
|
"num_tokens": 1052811471.0, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 1.8839590443686007, |
|
"grad_norm": 0.2470202752335773, |
|
"learning_rate": 2.000198169587811e-05, |
|
"loss": 0.4431, |
|
"num_tokens": 1054738378.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.887372013651877, |
|
"grad_norm": 0.2542101749179161, |
|
"learning_rate": 1.9922220049392438e-05, |
|
"loss": 0.4319, |
|
"num_tokens": 1056676264.0, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 1.8907849829351537, |
|
"grad_norm": 0.2435430685322008, |
|
"learning_rate": 1.9842565670434648e-05, |
|
"loss": 0.4226, |
|
"num_tokens": 1058467325.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.89419795221843, |
|
"grad_norm": 0.25562350059297084, |
|
"learning_rate": 1.9763019686555073e-05, |
|
"loss": 0.4376, |
|
"num_tokens": 1060452339.0, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 1.8976109215017065, |
|
"grad_norm": 0.29345078376520056, |
|
"learning_rate": 1.9683583223769658e-05, |
|
"loss": 0.428, |
|
"num_tokens": 1062292098.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.901023890784983, |
|
"grad_norm": 0.2584621207973962, |
|
"learning_rate": 1.9604257406544024e-05, |
|
"loss": 0.4511, |
|
"num_tokens": 1064263837.0, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 1.9044368600682593, |
|
"grad_norm": 0.27128805508680764, |
|
"learning_rate": 1.9525043357777516e-05, |
|
"loss": 0.4462, |
|
"num_tokens": 1066251533.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.9078498293515358, |
|
"grad_norm": 0.2727233705082907, |
|
"learning_rate": 1.9445942198787382e-05, |
|
"loss": 0.4361, |
|
"num_tokens": 1068173478.0, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 1.9112627986348123, |
|
"grad_norm": 0.2625441116715545, |
|
"learning_rate": 1.9366955049292828e-05, |
|
"loss": 0.4338, |
|
"num_tokens": 1070084280.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.9146757679180886, |
|
"grad_norm": 0.2739406236004986, |
|
"learning_rate": 1.9288083027399184e-05, |
|
"loss": 0.4318, |
|
"num_tokens": 1072030507.0, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 1.9180887372013653, |
|
"grad_norm": 0.2763042179831527, |
|
"learning_rate": 1.920932724958211e-05, |
|
"loss": 0.4503, |
|
"num_tokens": 1073943343.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.9215017064846416, |
|
"grad_norm": 0.26164021127778475, |
|
"learning_rate": 1.9130688830671767e-05, |
|
"loss": 0.4441, |
|
"num_tokens": 1075835547.0, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 1.924914675767918, |
|
"grad_norm": 0.27098665794683024, |
|
"learning_rate": 1.9052168883837036e-05, |
|
"loss": 0.4381, |
|
"num_tokens": 1077765498.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.9283276450511946, |
|
"grad_norm": 0.23896063773534829, |
|
"learning_rate": 1.8973768520569736e-05, |
|
"loss": 0.4287, |
|
"num_tokens": 1079685882.0, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 1.9317406143344709, |
|
"grad_norm": 0.25936956300235064, |
|
"learning_rate": 1.889548885066894e-05, |
|
"loss": 0.4391, |
|
"num_tokens": 1081506827.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.9351535836177476, |
|
"grad_norm": 0.2535472219270516, |
|
"learning_rate": 1.8817330982225266e-05, |
|
"loss": 0.4513, |
|
"num_tokens": 1083401425.0, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 1.9385665529010239, |
|
"grad_norm": 0.26568889655316086, |
|
"learning_rate": 1.8739296021605118e-05, |
|
"loss": 0.4427, |
|
"num_tokens": 1085262365.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.9419795221843004, |
|
"grad_norm": 0.266890642029229, |
|
"learning_rate": 1.8661385073435107e-05, |
|
"loss": 0.4353, |
|
"num_tokens": 1087251428.0, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 1.9453924914675769, |
|
"grad_norm": 0.26724914343445294, |
|
"learning_rate": 1.858359924058637e-05, |
|
"loss": 0.4465, |
|
"num_tokens": 1089194392.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.9488054607508531, |
|
"grad_norm": 0.2564504243189192, |
|
"learning_rate": 1.8505939624158974e-05, |
|
"loss": 0.4376, |
|
"num_tokens": 1091066424.0, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 1.9522184300341296, |
|
"grad_norm": 0.25705029863794304, |
|
"learning_rate": 1.8428407323466325e-05, |
|
"loss": 0.4382, |
|
"num_tokens": 1093050518.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.9556313993174061, |
|
"grad_norm": 0.2431513884931927, |
|
"learning_rate": 1.8351003436019594e-05, |
|
"loss": 0.455, |
|
"num_tokens": 1095046925.0, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 1.9590443686006824, |
|
"grad_norm": 0.2664425379728042, |
|
"learning_rate": 1.8273729057512213e-05, |
|
"loss": 0.4329, |
|
"num_tokens": 1096832378.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.9624573378839592, |
|
"grad_norm": 0.2604943224342159, |
|
"learning_rate": 1.8196585281804328e-05, |
|
"loss": 0.4331, |
|
"num_tokens": 1098831283.0, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 1.9658703071672354, |
|
"grad_norm": 0.25398450356166435, |
|
"learning_rate": 1.8119573200907346e-05, |
|
"loss": 0.435, |
|
"num_tokens": 1100801547.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.969283276450512, |
|
"grad_norm": 0.27269389447622255, |
|
"learning_rate": 1.8042693904968466e-05, |
|
"loss": 0.4346, |
|
"num_tokens": 1102602774.0, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 1.9726962457337884, |
|
"grad_norm": 0.2680478392449107, |
|
"learning_rate": 1.7965948482255245e-05, |
|
"loss": 0.4434, |
|
"num_tokens": 1104612875.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.9761092150170647, |
|
"grad_norm": 0.25071872779112125, |
|
"learning_rate": 1.7889338019140155e-05, |
|
"loss": 0.4252, |
|
"num_tokens": 1106487794.0, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 1.9795221843003414, |
|
"grad_norm": 0.24655928594788282, |
|
"learning_rate": 1.7812863600085295e-05, |
|
"loss": 0.4294, |
|
"num_tokens": 1108397131.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.9829351535836177, |
|
"grad_norm": 0.27068290921667937, |
|
"learning_rate": 1.7736526307626984e-05, |
|
"loss": 0.4465, |
|
"num_tokens": 1110328750.0, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 1.9863481228668942, |
|
"grad_norm": 0.26321066071381893, |
|
"learning_rate": 1.766032722236038e-05, |
|
"loss": 0.4421, |
|
"num_tokens": 1112269290.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.9897610921501707, |
|
"grad_norm": 0.24753433514478085, |
|
"learning_rate": 1.7584267422924316e-05, |
|
"loss": 0.4444, |
|
"num_tokens": 1114173750.0, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 1.993174061433447, |
|
"grad_norm": 0.24923481634416494, |
|
"learning_rate": 1.750834798598592e-05, |
|
"loss": 0.4284, |
|
"num_tokens": 1116064691.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.9965870307167235, |
|
"grad_norm": 0.2494698895081856, |
|
"learning_rate": 1.743256998622543e-05, |
|
"loss": 0.4347, |
|
"num_tokens": 1118027431.0, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2710124353887983, |
|
"learning_rate": 1.7356934496320964e-05, |
|
"loss": 0.4343, |
|
"num_tokens": 1119829452.0, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.0034129692832763, |
|
"grad_norm": 0.3039709938451478, |
|
"learning_rate": 1.7281442586933312e-05, |
|
"loss": 0.3991, |
|
"num_tokens": 1121774995.0, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.006825938566553, |
|
"grad_norm": 0.26543870061365027, |
|
"learning_rate": 1.720609532669085e-05, |
|
"loss": 0.3938, |
|
"num_tokens": 1123664582.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.0102389078498293, |
|
"grad_norm": 0.27016391920786065, |
|
"learning_rate": 1.7130893782174333e-05, |
|
"loss": 0.3919, |
|
"num_tokens": 1125654432.0, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.013651877133106, |
|
"grad_norm": 0.27052638008268537, |
|
"learning_rate": 1.7055839017901835e-05, |
|
"loss": 0.4028, |
|
"num_tokens": 1127599030.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.0170648464163823, |
|
"grad_norm": 0.2562182608373477, |
|
"learning_rate": 1.6980932096313697e-05, |
|
"loss": 0.3973, |
|
"num_tokens": 1129551904.0, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.0204778156996586, |
|
"grad_norm": 0.28525693433456295, |
|
"learning_rate": 1.6906174077757448e-05, |
|
"loss": 0.3929, |
|
"num_tokens": 1131280173.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.0238907849829353, |
|
"grad_norm": 0.2743618011639598, |
|
"learning_rate": 1.6831566020472817e-05, |
|
"loss": 0.406, |
|
"num_tokens": 1133218605.0, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 2.0273037542662116, |
|
"grad_norm": 0.253725112327626, |
|
"learning_rate": 1.675710898057677e-05, |
|
"loss": 0.3924, |
|
"num_tokens": 1135165512.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.030716723549488, |
|
"grad_norm": 0.2848097672160611, |
|
"learning_rate": 1.668280401204852e-05, |
|
"loss": 0.4141, |
|
"num_tokens": 1137123206.0, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.0341296928327646, |
|
"grad_norm": 0.28740321034842653, |
|
"learning_rate": 1.6608652166714625e-05, |
|
"loss": 0.4012, |
|
"num_tokens": 1139148605.0, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.037542662116041, |
|
"grad_norm": 0.2499470155743692, |
|
"learning_rate": 1.6534654494234137e-05, |
|
"loss": 0.4023, |
|
"num_tokens": 1141138126.0, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.0409556313993176, |
|
"grad_norm": 0.2471292947781148, |
|
"learning_rate": 1.6460812042083656e-05, |
|
"loss": 0.3882, |
|
"num_tokens": 1142933925.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.044368600682594, |
|
"grad_norm": 0.2507392899848245, |
|
"learning_rate": 1.6387125855542612e-05, |
|
"loss": 0.3836, |
|
"num_tokens": 1144759330.0, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 2.04778156996587, |
|
"grad_norm": 0.2398128538110352, |
|
"learning_rate": 1.6313596977678365e-05, |
|
"loss": 0.4016, |
|
"num_tokens": 1146764722.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.051194539249147, |
|
"grad_norm": 0.26352910950792985, |
|
"learning_rate": 1.624022644933151e-05, |
|
"loss": 0.4162, |
|
"num_tokens": 1148732344.0, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 2.054607508532423, |
|
"grad_norm": 0.25363271732366105, |
|
"learning_rate": 1.6167015309101124e-05, |
|
"loss": 0.3852, |
|
"num_tokens": 1150602940.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.0580204778157, |
|
"grad_norm": 0.2507903258771862, |
|
"learning_rate": 1.6093964593330032e-05, |
|
"loss": 0.3964, |
|
"num_tokens": 1152471105.0, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.061433447098976, |
|
"grad_norm": 0.2582500570099867, |
|
"learning_rate": 1.6021075336090195e-05, |
|
"loss": 0.4066, |
|
"num_tokens": 1154361749.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.0648464163822524, |
|
"grad_norm": 0.25407514009499443, |
|
"learning_rate": 1.5948348569168037e-05, |
|
"loss": 0.3958, |
|
"num_tokens": 1156475739.0, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.068259385665529, |
|
"grad_norm": 0.26754352821223326, |
|
"learning_rate": 1.587578532204983e-05, |
|
"loss": 0.3986, |
|
"num_tokens": 1158363541.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.0716723549488054, |
|
"grad_norm": 0.263192137906748, |
|
"learning_rate": 1.5803386621907145e-05, |
|
"loss": 0.4023, |
|
"num_tokens": 1160236837.0, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 2.0750853242320817, |
|
"grad_norm": 0.25505691061126157, |
|
"learning_rate": 1.573115349358231e-05, |
|
"loss": 0.4002, |
|
"num_tokens": 1162129918.0, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.0784982935153584, |
|
"grad_norm": 0.2711167689164809, |
|
"learning_rate": 1.5659086959573887e-05, |
|
"loss": 0.4018, |
|
"num_tokens": 1164024665.0, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.0819112627986347, |
|
"grad_norm": 0.26572076468891587, |
|
"learning_rate": 1.5587188040022198e-05, |
|
"loss": 0.3969, |
|
"num_tokens": 1165915543.0, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.0853242320819114, |
|
"grad_norm": 0.28019838675386793, |
|
"learning_rate": 1.5515457752694897e-05, |
|
"loss": 0.3945, |
|
"num_tokens": 1167829802.0, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 2.0887372013651877, |
|
"grad_norm": 0.26850496481924274, |
|
"learning_rate": 1.544389711297257e-05, |
|
"loss": 0.395, |
|
"num_tokens": 1169715890.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.092150170648464, |
|
"grad_norm": 0.2791716668257277, |
|
"learning_rate": 1.5372507133834368e-05, |
|
"loss": 0.4012, |
|
"num_tokens": 1171614830.0, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 2.0955631399317407, |
|
"grad_norm": 0.26105997446681856, |
|
"learning_rate": 1.5301288825843584e-05, |
|
"loss": 0.4042, |
|
"num_tokens": 1173464835.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.098976109215017, |
|
"grad_norm": 0.2670165726151138, |
|
"learning_rate": 1.523024319713348e-05, |
|
"loss": 0.4011, |
|
"num_tokens": 1175409243.0, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.1023890784982937, |
|
"grad_norm": 0.2876373869088731, |
|
"learning_rate": 1.5159371253392928e-05, |
|
"loss": 0.4077, |
|
"num_tokens": 1177337387.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.10580204778157, |
|
"grad_norm": 0.26433480908646045, |
|
"learning_rate": 1.5088673997852183e-05, |
|
"loss": 0.3956, |
|
"num_tokens": 1179233879.0, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 2.1092150170648463, |
|
"grad_norm": 0.25917326213961195, |
|
"learning_rate": 1.5018152431268712e-05, |
|
"loss": 0.4027, |
|
"num_tokens": 1181151842.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.112627986348123, |
|
"grad_norm": 0.28427260914311997, |
|
"learning_rate": 1.4947807551913001e-05, |
|
"loss": 0.4043, |
|
"num_tokens": 1183082878.0, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 2.1160409556313993, |
|
"grad_norm": 0.30436740585047983, |
|
"learning_rate": 1.4877640355554454e-05, |
|
"loss": 0.4027, |
|
"num_tokens": 1185016399.0, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.1194539249146755, |
|
"grad_norm": 0.2834148607288468, |
|
"learning_rate": 1.480765183544725e-05, |
|
"loss": 0.4126, |
|
"num_tokens": 1186879823.0, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 2.1228668941979523, |
|
"grad_norm": 0.27439489593858457, |
|
"learning_rate": 1.4737842982316313e-05, |
|
"loss": 0.4134, |
|
"num_tokens": 1188773106.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.1262798634812285, |
|
"grad_norm": 0.26473117800887336, |
|
"learning_rate": 1.4668214784343315e-05, |
|
"loss": 0.3792, |
|
"num_tokens": 1190558029.0, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 2.1296928327645053, |
|
"grad_norm": 0.25563541947968327, |
|
"learning_rate": 1.4598768227152621e-05, |
|
"loss": 0.4037, |
|
"num_tokens": 1192501980.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.1331058020477816, |
|
"grad_norm": 0.2503903645932623, |
|
"learning_rate": 1.4529504293797389e-05, |
|
"loss": 0.3877, |
|
"num_tokens": 1194378327.0, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 2.136518771331058, |
|
"grad_norm": 0.24140775538856318, |
|
"learning_rate": 1.4460423964745649e-05, |
|
"loss": 0.3934, |
|
"num_tokens": 1196267966.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.1399317406143346, |
|
"grad_norm": 0.2600080789731432, |
|
"learning_rate": 1.4391528217866396e-05, |
|
"loss": 0.3973, |
|
"num_tokens": 1198190295.0, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 2.143344709897611, |
|
"grad_norm": 0.27181663294997715, |
|
"learning_rate": 1.4322818028415765e-05, |
|
"loss": 0.3988, |
|
"num_tokens": 1200151285.0, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.1467576791808876, |
|
"grad_norm": 0.2537637030557871, |
|
"learning_rate": 1.4254294369023258e-05, |
|
"loss": 0.3963, |
|
"num_tokens": 1202046752.0, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 2.150170648464164, |
|
"grad_norm": 0.2494321959131866, |
|
"learning_rate": 1.4185958209677901e-05, |
|
"loss": 0.3956, |
|
"num_tokens": 1203977515.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.15358361774744, |
|
"grad_norm": 0.26108243025286193, |
|
"learning_rate": 1.4117810517714575e-05, |
|
"loss": 0.4032, |
|
"num_tokens": 1205900022.0, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 2.156996587030717, |
|
"grad_norm": 0.26473158504792393, |
|
"learning_rate": 1.4049852257800325e-05, |
|
"loss": 0.4023, |
|
"num_tokens": 1207887070.0, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.160409556313993, |
|
"grad_norm": 0.24762994609328232, |
|
"learning_rate": 1.3982084391920641e-05, |
|
"loss": 0.4029, |
|
"num_tokens": 1209821581.0, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 2.1638225255972694, |
|
"grad_norm": 0.26299537685268803, |
|
"learning_rate": 1.391450787936594e-05, |
|
"loss": 0.3893, |
|
"num_tokens": 1211692505.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.167235494880546, |
|
"grad_norm": 0.25375467426023246, |
|
"learning_rate": 1.3847123676717857e-05, |
|
"loss": 0.41, |
|
"num_tokens": 1213608110.0, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 2.1706484641638224, |
|
"grad_norm": 0.2635067571833855, |
|
"learning_rate": 1.3779932737835844e-05, |
|
"loss": 0.4067, |
|
"num_tokens": 1215501666.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.174061433447099, |
|
"grad_norm": 0.2590695796559807, |
|
"learning_rate": 1.371293601384358e-05, |
|
"loss": 0.4057, |
|
"num_tokens": 1217434979.0, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 2.1774744027303754, |
|
"grad_norm": 0.2743187153407509, |
|
"learning_rate": 1.36461344531155e-05, |
|
"loss": 0.3976, |
|
"num_tokens": 1219286362.0, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.1808873720136517, |
|
"grad_norm": 0.2553965380053624, |
|
"learning_rate": 1.3579529001263441e-05, |
|
"loss": 0.4143, |
|
"num_tokens": 1221237480.0, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 2.1843003412969284, |
|
"grad_norm": 0.2530670854295067, |
|
"learning_rate": 1.3513120601123195e-05, |
|
"loss": 0.3923, |
|
"num_tokens": 1223165783.0, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.1877133105802047, |
|
"grad_norm": 0.24797200268758846, |
|
"learning_rate": 1.3446910192741174e-05, |
|
"loss": 0.3729, |
|
"num_tokens": 1224980893.0, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 2.1911262798634814, |
|
"grad_norm": 0.2620732555128451, |
|
"learning_rate": 1.3380898713361128e-05, |
|
"loss": 0.3927, |
|
"num_tokens": 1226871414.0, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.1945392491467577, |
|
"grad_norm": 0.2609322854993342, |
|
"learning_rate": 1.3315087097410835e-05, |
|
"loss": 0.4083, |
|
"num_tokens": 1228764510.0, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 2.197952218430034, |
|
"grad_norm": 0.24742442098811646, |
|
"learning_rate": 1.3249476276488937e-05, |
|
"loss": 0.4007, |
|
"num_tokens": 1230729714.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.2013651877133107, |
|
"grad_norm": 0.2781961602579005, |
|
"learning_rate": 1.3184067179351677e-05, |
|
"loss": 0.3857, |
|
"num_tokens": 1232478639.0, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.204778156996587, |
|
"grad_norm": 0.2562282084423495, |
|
"learning_rate": 1.3118860731899807e-05, |
|
"loss": 0.4118, |
|
"num_tokens": 1234465242.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.2081911262798632, |
|
"grad_norm": 0.2681846880059678, |
|
"learning_rate": 1.305385785716548e-05, |
|
"loss": 0.4051, |
|
"num_tokens": 1236387820.0, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 2.21160409556314, |
|
"grad_norm": 0.25338427225837795, |
|
"learning_rate": 1.2989059475299137e-05, |
|
"loss": 0.3926, |
|
"num_tokens": 1238270889.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.2150170648464163, |
|
"grad_norm": 0.24393136100195462, |
|
"learning_rate": 1.2924466503556523e-05, |
|
"loss": 0.4102, |
|
"num_tokens": 1240290068.0, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 2.218430034129693, |
|
"grad_norm": 0.2528007996358032, |
|
"learning_rate": 1.2860079856285717e-05, |
|
"loss": 0.3913, |
|
"num_tokens": 1242113295.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.2218430034129693, |
|
"grad_norm": 0.2578860500257866, |
|
"learning_rate": 1.279590044491414e-05, |
|
"loss": 0.4018, |
|
"num_tokens": 1244058774.0, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 2.2252559726962455, |
|
"grad_norm": 0.2556123598222948, |
|
"learning_rate": 1.2731929177935664e-05, |
|
"loss": 0.4069, |
|
"num_tokens": 1245986085.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.2286689419795223, |
|
"grad_norm": 0.24628583265829654, |
|
"learning_rate": 1.2668166960897815e-05, |
|
"loss": 0.4175, |
|
"num_tokens": 1248054717.0, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 2.2320819112627985, |
|
"grad_norm": 0.2539187999798937, |
|
"learning_rate": 1.2604614696388855e-05, |
|
"loss": 0.3911, |
|
"num_tokens": 1249915944.0, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.2354948805460753, |
|
"grad_norm": 0.28754364964508844, |
|
"learning_rate": 1.2541273284025088e-05, |
|
"loss": 0.3939, |
|
"num_tokens": 1251667373.0, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 2.2389078498293515, |
|
"grad_norm": 0.26299909392718523, |
|
"learning_rate": 1.247814362043808e-05, |
|
"loss": 0.4014, |
|
"num_tokens": 1253547204.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.242320819112628, |
|
"grad_norm": 0.25498776876513835, |
|
"learning_rate": 1.2415226599261972e-05, |
|
"loss": 0.4059, |
|
"num_tokens": 1255555709.0, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 2.2457337883959045, |
|
"grad_norm": 0.2524752384826022, |
|
"learning_rate": 1.2352523111120858e-05, |
|
"loss": 0.3885, |
|
"num_tokens": 1257467951.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.249146757679181, |
|
"grad_norm": 0.2636786776353791, |
|
"learning_rate": 1.2290034043616148e-05, |
|
"loss": 0.4067, |
|
"num_tokens": 1259415847.0, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 2.252559726962457, |
|
"grad_norm": 0.24510895102259947, |
|
"learning_rate": 1.2227760281314001e-05, |
|
"loss": 0.4056, |
|
"num_tokens": 1261370375.0, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.255972696245734, |
|
"grad_norm": 0.2685313662473033, |
|
"learning_rate": 1.216570270573284e-05, |
|
"loss": 0.396, |
|
"num_tokens": 1263209852.0, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 2.25938566552901, |
|
"grad_norm": 0.2912242466154553, |
|
"learning_rate": 1.2103862195330833e-05, |
|
"loss": 0.4135, |
|
"num_tokens": 1265148638.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.262798634812287, |
|
"grad_norm": 0.2508984374513887, |
|
"learning_rate": 1.2042239625493465e-05, |
|
"loss": 0.3856, |
|
"num_tokens": 1267037306.0, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 2.266211604095563, |
|
"grad_norm": 0.72856347101417, |
|
"learning_rate": 1.1980835868521188e-05, |
|
"loss": 0.4029, |
|
"num_tokens": 1268983314.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.26962457337884, |
|
"grad_norm": 0.25899903691774656, |
|
"learning_rate": 1.1919651793617011e-05, |
|
"loss": 0.3838, |
|
"num_tokens": 1270826829.0, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 2.273037542662116, |
|
"grad_norm": 0.25418536949443005, |
|
"learning_rate": 1.185868826687424e-05, |
|
"loss": 0.3927, |
|
"num_tokens": 1272699748.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.2764505119453924, |
|
"grad_norm": 0.2454564607450951, |
|
"learning_rate": 1.1797946151264186e-05, |
|
"loss": 0.3889, |
|
"num_tokens": 1274682677.0, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 2.279863481228669, |
|
"grad_norm": 0.2401309320335627, |
|
"learning_rate": 1.1737426306623996e-05, |
|
"loss": 0.3964, |
|
"num_tokens": 1276615688.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.2832764505119454, |
|
"grad_norm": 0.262862375486088, |
|
"learning_rate": 1.1677129589644446e-05, |
|
"loss": 0.412, |
|
"num_tokens": 1278546081.0, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 2.2866894197952217, |
|
"grad_norm": 0.2453847358307599, |
|
"learning_rate": 1.1617056853857787e-05, |
|
"loss": 0.3943, |
|
"num_tokens": 1280497422.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.2901023890784984, |
|
"grad_norm": 0.27755459465541255, |
|
"learning_rate": 1.1557208949625736e-05, |
|
"loss": 0.4032, |
|
"num_tokens": 1282340664.0, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 2.2935153583617747, |
|
"grad_norm": 0.24394967766877984, |
|
"learning_rate": 1.1497586724127396e-05, |
|
"loss": 0.3937, |
|
"num_tokens": 1284342162.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.296928327645051, |
|
"grad_norm": 0.23730769947011054, |
|
"learning_rate": 1.143819102134723e-05, |
|
"loss": 0.4012, |
|
"num_tokens": 1286272914.0, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 2.3003412969283277, |
|
"grad_norm": 0.2742813641159952, |
|
"learning_rate": 1.1379022682063195e-05, |
|
"loss": 0.3933, |
|
"num_tokens": 1288199996.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.303754266211604, |
|
"grad_norm": 0.2642748821784269, |
|
"learning_rate": 1.1320082543834764e-05, |
|
"loss": 0.4001, |
|
"num_tokens": 1290121821.0, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 2.3071672354948807, |
|
"grad_norm": 0.2554580567261532, |
|
"learning_rate": 1.1261371440991137e-05, |
|
"loss": 0.4088, |
|
"num_tokens": 1292096399.0, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.310580204778157, |
|
"grad_norm": 0.25534131996398396, |
|
"learning_rate": 1.1202890204619353e-05, |
|
"loss": 0.3855, |
|
"num_tokens": 1293977825.0, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 2.3139931740614337, |
|
"grad_norm": 0.2511343749324971, |
|
"learning_rate": 1.1144639662552592e-05, |
|
"loss": 0.4002, |
|
"num_tokens": 1295850805.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.31740614334471, |
|
"grad_norm": 0.24497341281835422, |
|
"learning_rate": 1.1086620639358442e-05, |
|
"loss": 0.3844, |
|
"num_tokens": 1297791728.0, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 2.3208191126279862, |
|
"grad_norm": 0.24993434873174314, |
|
"learning_rate": 1.1028833956327198e-05, |
|
"loss": 0.393, |
|
"num_tokens": 1299649317.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.324232081911263, |
|
"grad_norm": 0.2766260502313979, |
|
"learning_rate": 1.0971280431460257e-05, |
|
"loss": 0.4048, |
|
"num_tokens": 1301526586.0, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 2.3276450511945392, |
|
"grad_norm": 0.23439107923909808, |
|
"learning_rate": 1.0913960879458557e-05, |
|
"loss": 0.4113, |
|
"num_tokens": 1303510626.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.3310580204778155, |
|
"grad_norm": 0.24827116673836433, |
|
"learning_rate": 1.0856876111711003e-05, |
|
"loss": 0.3924, |
|
"num_tokens": 1305289984.0, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 2.3344709897610922, |
|
"grad_norm": 0.25114000454246466, |
|
"learning_rate": 1.0800026936283011e-05, |
|
"loss": 0.4038, |
|
"num_tokens": 1307208316.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.3378839590443685, |
|
"grad_norm": 0.26280740614744735, |
|
"learning_rate": 1.074341415790507e-05, |
|
"loss": 0.3893, |
|
"num_tokens": 1309117594.0, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 2.3412969283276452, |
|
"grad_norm": 0.25721690855657364, |
|
"learning_rate": 1.0687038577961334e-05, |
|
"loss": 0.3987, |
|
"num_tokens": 1310930129.0, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.3447098976109215, |
|
"grad_norm": 0.2648598938841444, |
|
"learning_rate": 1.0630900994478271e-05, |
|
"loss": 0.4002, |
|
"num_tokens": 1312822051.0, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 2.348122866894198, |
|
"grad_norm": 0.26164289201070806, |
|
"learning_rate": 1.0575002202113422e-05, |
|
"loss": 0.3905, |
|
"num_tokens": 1314614448.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.3515358361774745, |
|
"grad_norm": 0.23672854825147213, |
|
"learning_rate": 1.0519342992144073e-05, |
|
"loss": 0.3897, |
|
"num_tokens": 1316495391.0, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 2.354948805460751, |
|
"grad_norm": 0.25335620270381537, |
|
"learning_rate": 1.0463924152456117e-05, |
|
"loss": 0.3894, |
|
"num_tokens": 1318368009.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.3583617747440275, |
|
"grad_norm": 0.23819804416780682, |
|
"learning_rate": 1.0408746467532864e-05, |
|
"loss": 0.3888, |
|
"num_tokens": 1320174846.0, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 2.361774744027304, |
|
"grad_norm": 0.24041042748052557, |
|
"learning_rate": 1.0353810718443949e-05, |
|
"loss": 0.3917, |
|
"num_tokens": 1322107997.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.36518771331058, |
|
"grad_norm": 0.25302315227083694, |
|
"learning_rate": 1.0299117682834295e-05, |
|
"loss": 0.3926, |
|
"num_tokens": 1323934060.0, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 2.368600682593857, |
|
"grad_norm": 0.24411931442269785, |
|
"learning_rate": 1.0244668134913053e-05, |
|
"loss": 0.3974, |
|
"num_tokens": 1325905507.0, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.372013651877133, |
|
"grad_norm": 0.24821036313437658, |
|
"learning_rate": 1.0190462845442702e-05, |
|
"loss": 0.3924, |
|
"num_tokens": 1327780847.0, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 2.3754266211604094, |
|
"grad_norm": 0.23916482562392274, |
|
"learning_rate": 1.0136502581728109e-05, |
|
"loss": 0.3895, |
|
"num_tokens": 1329668788.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.378839590443686, |
|
"grad_norm": 0.25207319235021297, |
|
"learning_rate": 1.0082788107605665e-05, |
|
"loss": 0.3874, |
|
"num_tokens": 1331519325.0, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 2.3822525597269624, |
|
"grad_norm": 0.2649045025779577, |
|
"learning_rate": 1.0029320183432468e-05, |
|
"loss": 0.387, |
|
"num_tokens": 1333459697.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.385665529010239, |
|
"grad_norm": 0.2722179095902226, |
|
"learning_rate": 9.976099566075591e-06, |
|
"loss": 0.3961, |
|
"num_tokens": 1335261419.0, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 2.3890784982935154, |
|
"grad_norm": 0.2631815719468567, |
|
"learning_rate": 9.923127008901334e-06, |
|
"loss": 0.4047, |
|
"num_tokens": 1337245592.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.3924914675767917, |
|
"grad_norm": 0.24412031355907363, |
|
"learning_rate": 9.87040326176457e-06, |
|
"loss": 0.3937, |
|
"num_tokens": 1339175144.0, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 2.3959044368600684, |
|
"grad_norm": 0.24694376687052613, |
|
"learning_rate": 9.817929070998133e-06, |
|
"loss": 0.3992, |
|
"num_tokens": 1341031356.0, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.3993174061433447, |
|
"grad_norm": 0.24983130243543367, |
|
"learning_rate": 9.765705179402262e-06, |
|
"loss": 0.4056, |
|
"num_tokens": 1342981326.0, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 2.4027303754266214, |
|
"grad_norm": 0.27290814627861304, |
|
"learning_rate": 9.713732326234085e-06, |
|
"loss": 0.4064, |
|
"num_tokens": 1344847445.0, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.4061433447098977, |
|
"grad_norm": 0.2735478214317808, |
|
"learning_rate": 9.662011247197111e-06, |
|
"loss": 0.409, |
|
"num_tokens": 1346880201.0, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 2.409556313993174, |
|
"grad_norm": 0.2676701207069597, |
|
"learning_rate": 9.610542674430893e-06, |
|
"loss": 0.411, |
|
"num_tokens": 1348775492.0, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.4129692832764507, |
|
"grad_norm": 0.2535137200935169, |
|
"learning_rate": 9.559327336500597e-06, |
|
"loss": 0.3916, |
|
"num_tokens": 1350779236.0, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 2.416382252559727, |
|
"grad_norm": 0.2568613906059044, |
|
"learning_rate": 9.508365958386714e-06, |
|
"loss": 0.3925, |
|
"num_tokens": 1352677486.0, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.419795221843003, |
|
"grad_norm": 0.24833002064524512, |
|
"learning_rate": 9.457659261474821e-06, |
|
"loss": 0.3989, |
|
"num_tokens": 1354567655.0, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 2.42320819112628, |
|
"grad_norm": 0.25899774223190003, |
|
"learning_rate": 9.407207963545322e-06, |
|
"loss": 0.4025, |
|
"num_tokens": 1356434352.0, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.426621160409556, |
|
"grad_norm": 0.2464870171227337, |
|
"learning_rate": 9.357012778763327e-06, |
|
"loss": 0.3976, |
|
"num_tokens": 1358296741.0, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 2.430034129692833, |
|
"grad_norm": 0.23706503790371114, |
|
"learning_rate": 9.307074417668519e-06, |
|
"loss": 0.4037, |
|
"num_tokens": 1360327394.0, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.4334470989761092, |
|
"grad_norm": 0.24878886578696077, |
|
"learning_rate": 9.2573935871651e-06, |
|
"loss": 0.397, |
|
"num_tokens": 1362266965.0, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 2.4368600682593855, |
|
"grad_norm": 0.2527659055892693, |
|
"learning_rate": 9.207970990511808e-06, |
|
"loss": 0.3931, |
|
"num_tokens": 1364208448.0, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.4402730375426622, |
|
"grad_norm": 0.2584388941834717, |
|
"learning_rate": 9.158807327311925e-06, |
|
"loss": 0.3982, |
|
"num_tokens": 1366138625.0, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 2.4436860068259385, |
|
"grad_norm": 0.2538965465130803, |
|
"learning_rate": 9.109903293503386e-06, |
|
"loss": 0.4053, |
|
"num_tokens": 1368129796.0, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.4470989761092152, |
|
"grad_norm": 0.2473126356674962, |
|
"learning_rate": 9.061259581348966e-06, |
|
"loss": 0.4024, |
|
"num_tokens": 1370084334.0, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 2.4505119453924915, |
|
"grad_norm": 0.25579986929786075, |
|
"learning_rate": 9.01287687942641e-06, |
|
"loss": 0.3974, |
|
"num_tokens": 1371929041.0, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.453924914675768, |
|
"grad_norm": 0.24496560953291152, |
|
"learning_rate": 8.964755872618739e-06, |
|
"loss": 0.3945, |
|
"num_tokens": 1373930783.0, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 2.4573378839590445, |
|
"grad_norm": 0.23916550521261395, |
|
"learning_rate": 8.916897242104547e-06, |
|
"loss": 0.3964, |
|
"num_tokens": 1375701367.0, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.460750853242321, |
|
"grad_norm": 0.27385157987053893, |
|
"learning_rate": 8.869301665348344e-06, |
|
"loss": 0.3882, |
|
"num_tokens": 1377461734.0, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 2.464163822525597, |
|
"grad_norm": 0.25702930592535705, |
|
"learning_rate": 8.821969816090966e-06, |
|
"loss": 0.3927, |
|
"num_tokens": 1379370664.0, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.467576791808874, |
|
"grad_norm": 0.2525995216753665, |
|
"learning_rate": 8.774902364340062e-06, |
|
"loss": 0.3914, |
|
"num_tokens": 1381309974.0, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 2.47098976109215, |
|
"grad_norm": 0.2513274331105906, |
|
"learning_rate": 8.728099976360573e-06, |
|
"loss": 0.3993, |
|
"num_tokens": 1383131522.0, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.474402730375427, |
|
"grad_norm": 0.23420881905848545, |
|
"learning_rate": 8.68156331466535e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 1385084855.0, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 2.477815699658703, |
|
"grad_norm": 0.28788382628340653, |
|
"learning_rate": 8.635293038005704e-06, |
|
"loss": 0.3957, |
|
"num_tokens": 1386955618.0, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.4812286689419794, |
|
"grad_norm": 0.2587696251636566, |
|
"learning_rate": 8.58928980136216e-06, |
|
"loss": 0.394, |
|
"num_tokens": 1388905153.0, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 2.484641638225256, |
|
"grad_norm": 0.2590768818161258, |
|
"learning_rate": 8.543554255935143e-06, |
|
"loss": 0.4054, |
|
"num_tokens": 1390842205.0, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.4880546075085324, |
|
"grad_norm": 0.24772041281038729, |
|
"learning_rate": 8.498087049135738e-06, |
|
"loss": 0.3858, |
|
"num_tokens": 1392714747.0, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 2.491467576791809, |
|
"grad_norm": 0.24778534706466526, |
|
"learning_rate": 8.452888824576588e-06, |
|
"loss": 0.385, |
|
"num_tokens": 1394469454.0, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.4948805460750854, |
|
"grad_norm": 0.24364215488154967, |
|
"learning_rate": 8.407960222062734e-06, |
|
"loss": 0.3941, |
|
"num_tokens": 1396446146.0, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 2.4982935153583616, |
|
"grad_norm": 0.25058147643890444, |
|
"learning_rate": 8.363301877582572e-06, |
|
"loss": 0.3849, |
|
"num_tokens": 1398333563.0, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.5017064846416384, |
|
"grad_norm": 0.25450013492478507, |
|
"learning_rate": 8.318914423298848e-06, |
|
"loss": 0.399, |
|
"num_tokens": 1400273261.0, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 2.5051194539249146, |
|
"grad_norm": 0.24566497134038118, |
|
"learning_rate": 8.274798487539715e-06, |
|
"loss": 0.4041, |
|
"num_tokens": 1402142031.0, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.508532423208191, |
|
"grad_norm": 0.2611873096022051, |
|
"learning_rate": 8.23095469478984e-06, |
|
"loss": 0.3979, |
|
"num_tokens": 1404063953.0, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 2.5119453924914676, |
|
"grad_norm": 0.24351692997405652, |
|
"learning_rate": 8.18738366568157e-06, |
|
"loss": 0.4078, |
|
"num_tokens": 1406021491.0, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.515358361774744, |
|
"grad_norm": 0.2660498083738705, |
|
"learning_rate": 8.144086016986098e-06, |
|
"loss": 0.3933, |
|
"num_tokens": 1407825519.0, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 2.51877133105802, |
|
"grad_norm": 0.23795381298513307, |
|
"learning_rate": 8.10106236160482e-06, |
|
"loss": 0.3922, |
|
"num_tokens": 1409756663.0, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.522184300341297, |
|
"grad_norm": 0.22685355776603627, |
|
"learning_rate": 8.05831330856058e-06, |
|
"loss": 0.3956, |
|
"num_tokens": 1411714094.0, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 2.5255972696245736, |
|
"grad_norm": 0.24879932954473205, |
|
"learning_rate": 8.01583946298908e-06, |
|
"loss": 0.4137, |
|
"num_tokens": 1413598256.0, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.52901023890785, |
|
"grad_norm": 0.25134610896136816, |
|
"learning_rate": 7.97364142613033e-06, |
|
"loss": 0.3961, |
|
"num_tokens": 1415457816.0, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 2.532423208191126, |
|
"grad_norm": 0.24514468786362295, |
|
"learning_rate": 7.9317197953201e-06, |
|
"loss": 0.397, |
|
"num_tokens": 1417330639.0, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.535836177474403, |
|
"grad_norm": 0.23174407336003164, |
|
"learning_rate": 7.890075163981505e-06, |
|
"loss": 0.3903, |
|
"num_tokens": 1419297418.0, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 2.539249146757679, |
|
"grad_norm": 0.2510853584108354, |
|
"learning_rate": 7.848708121616567e-06, |
|
"loss": 0.4062, |
|
"num_tokens": 1421278815.0, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.5426621160409555, |
|
"grad_norm": 0.25550059792582824, |
|
"learning_rate": 7.807619253797891e-06, |
|
"loss": 0.4022, |
|
"num_tokens": 1423300631.0, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 2.546075085324232, |
|
"grad_norm": 0.24251183778596538, |
|
"learning_rate": 7.766809142160385e-06, |
|
"loss": 0.3899, |
|
"num_tokens": 1425213662.0, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.5494880546075085, |
|
"grad_norm": 0.25490145483738413, |
|
"learning_rate": 7.726278364393e-06, |
|
"loss": 0.3934, |
|
"num_tokens": 1427257799.0, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 2.5529010238907848, |
|
"grad_norm": 0.24924750644971594, |
|
"learning_rate": 7.686027494230566e-06, |
|
"loss": 0.3915, |
|
"num_tokens": 1429239975.0, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.5563139931740615, |
|
"grad_norm": 0.2449260738764112, |
|
"learning_rate": 7.646057101445686e-06, |
|
"loss": 0.386, |
|
"num_tokens": 1431117805.0, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 2.5597269624573378, |
|
"grad_norm": 0.23586099566737673, |
|
"learning_rate": 7.606367751840644e-06, |
|
"loss": 0.4013, |
|
"num_tokens": 1433152174.0, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.5631399317406145, |
|
"grad_norm": 0.23854900024400358, |
|
"learning_rate": 7.566960007239405e-06, |
|
"loss": 0.3911, |
|
"num_tokens": 1435008887.0, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 2.5665529010238908, |
|
"grad_norm": 0.2500987961586113, |
|
"learning_rate": 7.5278344254796764e-06, |
|
"loss": 0.4031, |
|
"num_tokens": 1436955687.0, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.5699658703071675, |
|
"grad_norm": 0.2706863126827021, |
|
"learning_rate": 7.48899156040499e-06, |
|
"loss": 0.3973, |
|
"num_tokens": 1438848557.0, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 2.573378839590444, |
|
"grad_norm": 0.2543704542807227, |
|
"learning_rate": 7.450431961856869e-06, |
|
"loss": 0.4031, |
|
"num_tokens": 1440739976.0, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.57679180887372, |
|
"grad_norm": 0.2669655519777121, |
|
"learning_rate": 7.412156175667064e-06, |
|
"loss": 0.3943, |
|
"num_tokens": 1442599846.0, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 2.580204778156997, |
|
"grad_norm": 0.24836800578210708, |
|
"learning_rate": 7.3741647436497846e-06, |
|
"loss": 0.3946, |
|
"num_tokens": 1444571386.0, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.583617747440273, |
|
"grad_norm": 0.2600600913802552, |
|
"learning_rate": 7.336458203594086e-06, |
|
"loss": 0.3937, |
|
"num_tokens": 1446445368.0, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 2.5870307167235493, |
|
"grad_norm": 0.24666717274669178, |
|
"learning_rate": 7.299037089256197e-06, |
|
"loss": 0.4076, |
|
"num_tokens": 1448440439.0, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.590443686006826, |
|
"grad_norm": 0.25452167411429993, |
|
"learning_rate": 7.2619019303520065e-06, |
|
"loss": 0.4054, |
|
"num_tokens": 1450323674.0, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 2.5938566552901023, |
|
"grad_norm": 0.26154476417880795, |
|
"learning_rate": 7.225053252549556e-06, |
|
"loss": 0.3907, |
|
"num_tokens": 1452237710.0, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.5972696245733786, |
|
"grad_norm": 0.2750355702856808, |
|
"learning_rate": 7.188491577461573e-06, |
|
"loss": 0.4003, |
|
"num_tokens": 1454072198.0, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 2.6006825938566553, |
|
"grad_norm": 0.26966622870777457, |
|
"learning_rate": 7.1522174226381315e-06, |
|
"loss": 0.4121, |
|
"num_tokens": 1455997298.0, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.6040955631399316, |
|
"grad_norm": 0.23947567913720003, |
|
"learning_rate": 7.116231301559292e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 1457990359.0, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 2.6075085324232083, |
|
"grad_norm": 0.25804521315764417, |
|
"learning_rate": 7.080533723627844e-06, |
|
"loss": 0.392, |
|
"num_tokens": 1459817918.0, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.6109215017064846, |
|
"grad_norm": 0.2439185854647888, |
|
"learning_rate": 7.045125194162096e-06, |
|
"loss": 0.3891, |
|
"num_tokens": 1461691998.0, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 2.6143344709897613, |
|
"grad_norm": 0.2332908442050089, |
|
"learning_rate": 7.010006214388713e-06, |
|
"loss": 0.388, |
|
"num_tokens": 1463593827.0, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.6177474402730376, |
|
"grad_norm": 0.25261750552287104, |
|
"learning_rate": 6.975177281435641e-06, |
|
"loss": 0.3915, |
|
"num_tokens": 1465426370.0, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 2.621160409556314, |
|
"grad_norm": 0.23818001032099148, |
|
"learning_rate": 6.9406388883250545e-06, |
|
"loss": 0.3964, |
|
"num_tokens": 1467336173.0, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.6245733788395906, |
|
"grad_norm": 0.25882672475274643, |
|
"learning_rate": 6.906391523966373e-06, |
|
"loss": 0.3931, |
|
"num_tokens": 1469211173.0, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 2.627986348122867, |
|
"grad_norm": 0.2541572268544839, |
|
"learning_rate": 6.872435673149356e-06, |
|
"loss": 0.391, |
|
"num_tokens": 1471172352.0, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.631399317406143, |
|
"grad_norm": 0.2613262919497558, |
|
"learning_rate": 6.838771816537246e-06, |
|
"loss": 0.3925, |
|
"num_tokens": 1473141245.0, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 2.63481228668942, |
|
"grad_norm": 0.2664766003389638, |
|
"learning_rate": 6.805400430659915e-06, |
|
"loss": 0.3871, |
|
"num_tokens": 1474940344.0, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.638225255972696, |
|
"grad_norm": 0.25284839351036326, |
|
"learning_rate": 6.772321987907193e-06, |
|
"loss": 0.3921, |
|
"num_tokens": 1476903937.0, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 2.6416382252559725, |
|
"grad_norm": 0.2719817319285575, |
|
"learning_rate": 6.739536956522123e-06, |
|
"loss": 0.4109, |
|
"num_tokens": 1478813688.0, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.645051194539249, |
|
"grad_norm": 0.2645200858858097, |
|
"learning_rate": 6.707045800594355e-06, |
|
"loss": 0.4038, |
|
"num_tokens": 1480766595.0, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 2.6484641638225255, |
|
"grad_norm": 0.26001001469112844, |
|
"learning_rate": 6.674848980053584e-06, |
|
"loss": 0.3966, |
|
"num_tokens": 1482642747.0, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.651877133105802, |
|
"grad_norm": 0.24500708714170433, |
|
"learning_rate": 6.642946950663017e-06, |
|
"loss": 0.396, |
|
"num_tokens": 1484419324.0, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 2.6552901023890785, |
|
"grad_norm": 0.23023591716339004, |
|
"learning_rate": 6.611340164012951e-06, |
|
"loss": 0.3902, |
|
"num_tokens": 1486408843.0, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.658703071672355, |
|
"grad_norm": 0.2563250156761757, |
|
"learning_rate": 6.580029067514346e-06, |
|
"loss": 0.3975, |
|
"num_tokens": 1488281943.0, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 2.6621160409556315, |
|
"grad_norm": 0.2318318316850287, |
|
"learning_rate": 6.549014104392517e-06, |
|
"loss": 0.3859, |
|
"num_tokens": 1490170627.0, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.6655290102389078, |
|
"grad_norm": 0.24688799503819153, |
|
"learning_rate": 6.518295713680865e-06, |
|
"loss": 0.4052, |
|
"num_tokens": 1492100727.0, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 2.6689419795221845, |
|
"grad_norm": 0.2678221813912207, |
|
"learning_rate": 6.487874330214634e-06, |
|
"loss": 0.3893, |
|
"num_tokens": 1493930672.0, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.6723549488054608, |
|
"grad_norm": 0.24999129402505174, |
|
"learning_rate": 6.4577503846247705e-06, |
|
"loss": 0.3937, |
|
"num_tokens": 1495878482.0, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 2.675767918088737, |
|
"grad_norm": 0.24973780539752083, |
|
"learning_rate": 6.427924303331842e-06, |
|
"loss": 0.3828, |
|
"num_tokens": 1497662915.0, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.6791808873720138, |
|
"grad_norm": 0.2346890895753833, |
|
"learning_rate": 6.398396508539978e-06, |
|
"loss": 0.4061, |
|
"num_tokens": 1499588241.0, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 2.68259385665529, |
|
"grad_norm": 0.24884364729155156, |
|
"learning_rate": 6.369167418230905e-06, |
|
"loss": 0.3998, |
|
"num_tokens": 1501588836.0, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.6860068259385663, |
|
"grad_norm": 0.23641487250123258, |
|
"learning_rate": 6.340237446158029e-06, |
|
"loss": 0.4003, |
|
"num_tokens": 1503502828.0, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 2.689419795221843, |
|
"grad_norm": 0.258701689896867, |
|
"learning_rate": 6.31160700184058e-06, |
|
"loss": 0.3897, |
|
"num_tokens": 1505280996.0, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.6928327645051193, |
|
"grad_norm": 0.24492715549764293, |
|
"learning_rate": 6.283276490557805e-06, |
|
"loss": 0.3874, |
|
"num_tokens": 1507079279.0, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 2.696245733788396, |
|
"grad_norm": 0.258468233589569, |
|
"learning_rate": 6.255246313343244e-06, |
|
"loss": 0.3901, |
|
"num_tokens": 1508954940.0, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.6996587030716723, |
|
"grad_norm": 0.24437266193161206, |
|
"learning_rate": 6.227516866979042e-06, |
|
"loss": 0.3992, |
|
"num_tokens": 1510879092.0, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 2.703071672354949, |
|
"grad_norm": 0.2297482842865935, |
|
"learning_rate": 6.200088543990355e-06, |
|
"loss": 0.3907, |
|
"num_tokens": 1512799299.0, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.7064846416382253, |
|
"grad_norm": 0.2468930948842974, |
|
"learning_rate": 6.1729617326397484e-06, |
|
"loss": 0.3923, |
|
"num_tokens": 1514827859.0, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 2.7098976109215016, |
|
"grad_norm": 0.25174441050573704, |
|
"learning_rate": 6.1461368169217515e-06, |
|
"loss": 0.3992, |
|
"num_tokens": 1516677372.0, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.7133105802047783, |
|
"grad_norm": 0.23550127912898583, |
|
"learning_rate": 6.119614176557399e-06, |
|
"loss": 0.3983, |
|
"num_tokens": 1518630477.0, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 2.7167235494880546, |
|
"grad_norm": 0.25435494543972065, |
|
"learning_rate": 6.093394186988837e-06, |
|
"loss": 0.3941, |
|
"num_tokens": 1520552814.0, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.720136518771331, |
|
"grad_norm": 0.24689339414271427, |
|
"learning_rate": 6.0674772193740485e-06, |
|
"loss": 0.3951, |
|
"num_tokens": 1522494886.0, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 2.7235494880546076, |
|
"grad_norm": 0.23307385388488122, |
|
"learning_rate": 6.041863640581571e-06, |
|
"loss": 0.4008, |
|
"num_tokens": 1524429474.0, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.726962457337884, |
|
"grad_norm": 0.2605356057614315, |
|
"learning_rate": 6.016553813185308e-06, |
|
"loss": 0.3893, |
|
"num_tokens": 1526216361.0, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 2.73037542662116, |
|
"grad_norm": 0.2523506025794084, |
|
"learning_rate": 5.991548095459404e-06, |
|
"loss": 0.4007, |
|
"num_tokens": 1528138941.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.733788395904437, |
|
"grad_norm": 0.2499474707985221, |
|
"learning_rate": 5.966846841373165e-06, |
|
"loss": 0.4069, |
|
"num_tokens": 1530018296.0, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 2.737201365187713, |
|
"grad_norm": 0.24921605460943203, |
|
"learning_rate": 5.942450400586057e-06, |
|
"loss": 0.4049, |
|
"num_tokens": 1532049388.0, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.74061433447099, |
|
"grad_norm": 0.2498586016104822, |
|
"learning_rate": 5.9183591184427425e-06, |
|
"loss": 0.4045, |
|
"num_tokens": 1534051867.0, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 2.744027303754266, |
|
"grad_norm": 0.24618708809713136, |
|
"learning_rate": 5.894573335968203e-06, |
|
"loss": 0.3853, |
|
"num_tokens": 1535873961.0, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.747440273037543, |
|
"grad_norm": 0.2505546224989872, |
|
"learning_rate": 5.8710933898629166e-06, |
|
"loss": 0.3837, |
|
"num_tokens": 1537679516.0, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 2.750853242320819, |
|
"grad_norm": 0.23197077317386236, |
|
"learning_rate": 5.847919612498076e-06, |
|
"loss": 0.3853, |
|
"num_tokens": 1539735224.0, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.7542662116040955, |
|
"grad_norm": 0.2439125129228283, |
|
"learning_rate": 5.825052331910887e-06, |
|
"loss": 0.4079, |
|
"num_tokens": 1541826635.0, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 2.757679180887372, |
|
"grad_norm": 0.23851878381859146, |
|
"learning_rate": 5.8024918717999445e-06, |
|
"loss": 0.3995, |
|
"num_tokens": 1543746445.0, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.7610921501706485, |
|
"grad_norm": 0.24287441981308092, |
|
"learning_rate": 5.780238551520622e-06, |
|
"loss": 0.4013, |
|
"num_tokens": 1545660100.0, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 2.7645051194539247, |
|
"grad_norm": 0.27270907920127346, |
|
"learning_rate": 5.75829268608057e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 1547633390.0, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.7679180887372015, |
|
"grad_norm": 0.2215511372126067, |
|
"learning_rate": 5.7366545861352515e-06, |
|
"loss": 0.3934, |
|
"num_tokens": 1549636484.0, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 2.7713310580204777, |
|
"grad_norm": 0.23824072596286747, |
|
"learning_rate": 5.715324557983544e-06, |
|
"loss": 0.3985, |
|
"num_tokens": 1551617632.0, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.774744027303754, |
|
"grad_norm": 0.2500935931294177, |
|
"learning_rate": 5.694302903563405e-06, |
|
"loss": 0.4064, |
|
"num_tokens": 1553671734.0, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 2.7781569965870307, |
|
"grad_norm": 0.26068724299179286, |
|
"learning_rate": 5.673589920447592e-06, |
|
"loss": 0.3916, |
|
"num_tokens": 1555493757.0, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.781569965870307, |
|
"grad_norm": 0.2541875918112447, |
|
"learning_rate": 5.653185901839459e-06, |
|
"loss": 0.3856, |
|
"num_tokens": 1557404748.0, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 2.7849829351535837, |
|
"grad_norm": 0.24522050182761787, |
|
"learning_rate": 5.6330911365688025e-06, |
|
"loss": 0.387, |
|
"num_tokens": 1559334578.0, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.78839590443686, |
|
"grad_norm": 0.2494978034193831, |
|
"learning_rate": 5.613305909087776e-06, |
|
"loss": 0.4005, |
|
"num_tokens": 1561255667.0, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 2.7918088737201368, |
|
"grad_norm": 0.2585725559042003, |
|
"learning_rate": 5.593830499466846e-06, |
|
"loss": 0.3983, |
|
"num_tokens": 1563224423.0, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.795221843003413, |
|
"grad_norm": 0.24273735328231116, |
|
"learning_rate": 5.574665183390861e-06, |
|
"loss": 0.4039, |
|
"num_tokens": 1565191375.0, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 2.7986348122866893, |
|
"grad_norm": 0.2717130897992212, |
|
"learning_rate": 5.5558102321551155e-06, |
|
"loss": 0.3982, |
|
"num_tokens": 1567008310.0, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.802047781569966, |
|
"grad_norm": 0.25253291196009875, |
|
"learning_rate": 5.537265912661524e-06, |
|
"loss": 0.4122, |
|
"num_tokens": 1568946206.0, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 2.8054607508532423, |
|
"grad_norm": 0.2619145601499863, |
|
"learning_rate": 5.519032487414857e-06, |
|
"loss": 0.3993, |
|
"num_tokens": 1570872759.0, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 2.8088737201365186, |
|
"grad_norm": 0.2686949634633545, |
|
"learning_rate": 5.501110214518992e-06, |
|
"loss": 0.3937, |
|
"num_tokens": 1572860132.0, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 2.8122866894197953, |
|
"grad_norm": 0.23455834216077803, |
|
"learning_rate": 5.483499347673291e-06, |
|
"loss": 0.3819, |
|
"num_tokens": 1574790093.0, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 2.8156996587030716, |
|
"grad_norm": 0.2354856688706412, |
|
"learning_rate": 5.466200136168988e-06, |
|
"loss": 0.3994, |
|
"num_tokens": 1576855613.0, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 2.819112627986348, |
|
"grad_norm": 0.2404835881052928, |
|
"learning_rate": 5.449212824885679e-06, |
|
"loss": 0.3945, |
|
"num_tokens": 1578697231.0, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 2.8225255972696246, |
|
"grad_norm": 0.24782491779127389, |
|
"learning_rate": 5.432537654287839e-06, |
|
"loss": 0.3926, |
|
"num_tokens": 1580557801.0, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 2.825938566552901, |
|
"grad_norm": 0.25709964300656496, |
|
"learning_rate": 5.416174860421423e-06, |
|
"loss": 0.4062, |
|
"num_tokens": 1582409503.0, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 2.8293515358361776, |
|
"grad_norm": 0.24193936395054547, |
|
"learning_rate": 5.400124674910531e-06, |
|
"loss": 0.3838, |
|
"num_tokens": 1584225497.0, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 2.832764505119454, |
|
"grad_norm": 0.24785601077252273, |
|
"learning_rate": 5.384387324954123e-06, |
|
"loss": 0.3823, |
|
"num_tokens": 1586141543.0, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 2.8361774744027306, |
|
"grad_norm": 0.24772540511880872, |
|
"learning_rate": 5.368963033322803e-06, |
|
"loss": 0.395, |
|
"num_tokens": 1588008199.0, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 2.839590443686007, |
|
"grad_norm": 0.2616741371511031, |
|
"learning_rate": 5.353852018355671e-06, |
|
"loss": 0.3858, |
|
"num_tokens": 1589775524.0, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 2.843003412969283, |
|
"grad_norm": 0.23998611616488993, |
|
"learning_rate": 5.339054493957223e-06, |
|
"loss": 0.4036, |
|
"num_tokens": 1591713597.0, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 2.84641638225256, |
|
"grad_norm": 0.2473340920387729, |
|
"learning_rate": 5.324570669594329e-06, |
|
"loss": 0.3987, |
|
"num_tokens": 1593620097.0, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 2.849829351535836, |
|
"grad_norm": 0.24291234707437337, |
|
"learning_rate": 5.310400750293274e-06, |
|
"loss": 0.3885, |
|
"num_tokens": 1595431634.0, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 2.8532423208191124, |
|
"grad_norm": 0.25195102405447184, |
|
"learning_rate": 5.296544936636839e-06, |
|
"loss": 0.3991, |
|
"num_tokens": 1597345689.0, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 2.856655290102389, |
|
"grad_norm": 0.24244585678705705, |
|
"learning_rate": 5.283003424761481e-06, |
|
"loss": 0.4016, |
|
"num_tokens": 1599337430.0, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 2.8600682593856654, |
|
"grad_norm": 0.2420260300363789, |
|
"learning_rate": 5.269776406354538e-06, |
|
"loss": 0.3888, |
|
"num_tokens": 1601383218.0, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 2.8634812286689417, |
|
"grad_norm": 0.23531179171366776, |
|
"learning_rate": 5.25686406865153e-06, |
|
"loss": 0.3986, |
|
"num_tokens": 1603461399.0, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 2.8668941979522184, |
|
"grad_norm": 0.23924316659304262, |
|
"learning_rate": 5.244266594433509e-06, |
|
"loss": 0.403, |
|
"num_tokens": 1605406571.0, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.8703071672354947, |
|
"grad_norm": 0.25159268694642145, |
|
"learning_rate": 5.231984162024453e-06, |
|
"loss": 0.4079, |
|
"num_tokens": 1607383394.0, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 2.8737201365187715, |
|
"grad_norm": 0.26246430303018, |
|
"learning_rate": 5.220016945288762e-06, |
|
"loss": 0.3947, |
|
"num_tokens": 1609294043.0, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 2.8771331058020477, |
|
"grad_norm": 0.2583127082260596, |
|
"learning_rate": 5.208365113628795e-06, |
|
"loss": 0.4073, |
|
"num_tokens": 1611262646.0, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 2.8805460750853245, |
|
"grad_norm": 0.2703860048405685, |
|
"learning_rate": 5.197028831982456e-06, |
|
"loss": 0.3951, |
|
"num_tokens": 1613173875.0, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 2.8839590443686007, |
|
"grad_norm": 0.24985368025540616, |
|
"learning_rate": 5.186008260820875e-06, |
|
"loss": 0.3894, |
|
"num_tokens": 1615060675.0, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 2.887372013651877, |
|
"grad_norm": 0.24347269615277906, |
|
"learning_rate": 5.17530355614613e-06, |
|
"loss": 0.3865, |
|
"num_tokens": 1616973984.0, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 2.8907849829351537, |
|
"grad_norm": 0.25860445333919463, |
|
"learning_rate": 5.164914869489042e-06, |
|
"loss": 0.3886, |
|
"num_tokens": 1618849994.0, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 2.89419795221843, |
|
"grad_norm": 0.25992126090841056, |
|
"learning_rate": 5.154842347907027e-06, |
|
"loss": 0.3949, |
|
"num_tokens": 1620722746.0, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 2.8976109215017063, |
|
"grad_norm": 0.2507997098825153, |
|
"learning_rate": 5.145086133982016e-06, |
|
"loss": 0.4036, |
|
"num_tokens": 1622535501.0, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 2.901023890784983, |
|
"grad_norm": 0.25556096631117053, |
|
"learning_rate": 5.1356463658184294e-06, |
|
"loss": 0.4005, |
|
"num_tokens": 1624444287.0, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 2.9044368600682593, |
|
"grad_norm": 0.247634476621531, |
|
"learning_rate": 5.126523177041238e-06, |
|
"loss": 0.4, |
|
"num_tokens": 1626411719.0, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 2.9078498293515356, |
|
"grad_norm": 0.24948299890290146, |
|
"learning_rate": 5.117716696794059e-06, |
|
"loss": 0.3981, |
|
"num_tokens": 1628332813.0, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 2.9112627986348123, |
|
"grad_norm": 0.23498087131331388, |
|
"learning_rate": 5.109227049737329e-06, |
|
"loss": 0.3856, |
|
"num_tokens": 1630193509.0, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 2.9146757679180886, |
|
"grad_norm": 0.2359135742357886, |
|
"learning_rate": 5.101054356046542e-06, |
|
"loss": 0.4039, |
|
"num_tokens": 1632145825.0, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 2.9180887372013653, |
|
"grad_norm": 0.2453789165871806, |
|
"learning_rate": 5.093198731410548e-06, |
|
"loss": 0.4038, |
|
"num_tokens": 1634164850.0, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 2.9215017064846416, |
|
"grad_norm": 0.2615881798789952, |
|
"learning_rate": 5.085660287029918e-06, |
|
"loss": 0.3906, |
|
"num_tokens": 1635989796.0, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 2.9249146757679183, |
|
"grad_norm": 0.26649221952806657, |
|
"learning_rate": 5.078439129615357e-06, |
|
"loss": 0.383, |
|
"num_tokens": 1637928342.0, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 2.9283276450511946, |
|
"grad_norm": 0.2612187057511615, |
|
"learning_rate": 5.071535361386216e-06, |
|
"loss": 0.3944, |
|
"num_tokens": 1639831973.0, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 2.931740614334471, |
|
"grad_norm": 0.2573017244859236, |
|
"learning_rate": 5.064949080069025e-06, |
|
"loss": 0.3888, |
|
"num_tokens": 1641670758.0, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 2.9351535836177476, |
|
"grad_norm": 0.2513577594767894, |
|
"learning_rate": 5.058680378896119e-06, |
|
"loss": 0.4059, |
|
"num_tokens": 1643634455.0, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.938566552901024, |
|
"grad_norm": 0.24800683186403535, |
|
"learning_rate": 5.0527293466043126e-06, |
|
"loss": 0.3838, |
|
"num_tokens": 1645383276.0, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 2.9419795221843, |
|
"grad_norm": 0.2433798047992357, |
|
"learning_rate": 5.047096067433657e-06, |
|
"loss": 0.394, |
|
"num_tokens": 1647349819.0, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 2.945392491467577, |
|
"grad_norm": 0.24355801889840303, |
|
"learning_rate": 5.0417806211262245e-06, |
|
"loss": 0.3859, |
|
"num_tokens": 1649242582.0, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 2.948805460750853, |
|
"grad_norm": 0.25828743538143506, |
|
"learning_rate": 5.036783082925003e-06, |
|
"loss": 0.4005, |
|
"num_tokens": 1651206051.0, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 2.9522184300341294, |
|
"grad_norm": 0.2514852306063191, |
|
"learning_rate": 5.032103523572822e-06, |
|
"loss": 0.3914, |
|
"num_tokens": 1653037850.0, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 2.955631399317406, |
|
"grad_norm": 0.26158521210090824, |
|
"learning_rate": 5.027742009311342e-06, |
|
"loss": 0.3972, |
|
"num_tokens": 1654950263.0, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 2.9590443686006824, |
|
"grad_norm": 0.2302195005923823, |
|
"learning_rate": 5.023698601880131e-06, |
|
"loss": 0.3953, |
|
"num_tokens": 1656844160.0, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 2.962457337883959, |
|
"grad_norm": 0.25697806948819085, |
|
"learning_rate": 5.019973358515785e-06, |
|
"loss": 0.3881, |
|
"num_tokens": 1658618203.0, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 2.9658703071672354, |
|
"grad_norm": 0.26230031669230236, |
|
"learning_rate": 5.016566331951116e-06, |
|
"loss": 0.3995, |
|
"num_tokens": 1660479956.0, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 2.969283276450512, |
|
"grad_norm": 0.24647721425219105, |
|
"learning_rate": 5.013477570414405e-06, |
|
"loss": 0.3947, |
|
"num_tokens": 1662435735.0, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.9726962457337884, |
|
"grad_norm": 0.24145722179190598, |
|
"learning_rate": 5.010707117628725e-06, |
|
"loss": 0.3984, |
|
"num_tokens": 1664298270.0, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 2.9761092150170647, |
|
"grad_norm": 0.2541130044127246, |
|
"learning_rate": 5.008255012811318e-06, |
|
"loss": 0.3881, |
|
"num_tokens": 1666219228.0, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 2.9795221843003414, |
|
"grad_norm": 0.2554311961056411, |
|
"learning_rate": 5.006121290673037e-06, |
|
"loss": 0.4008, |
|
"num_tokens": 1668243685.0, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 2.9829351535836177, |
|
"grad_norm": 0.236207532281216, |
|
"learning_rate": 5.004305981417863e-06, |
|
"loss": 0.3904, |
|
"num_tokens": 1670202629.0, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 2.986348122866894, |
|
"grad_norm": 0.23352563711926605, |
|
"learning_rate": 5.002809110742464e-06, |
|
"loss": 0.3968, |
|
"num_tokens": 1672186589.0, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 2.9897610921501707, |
|
"grad_norm": 0.26559636339552617, |
|
"learning_rate": 5.001630699835849e-06, |
|
"loss": 0.4007, |
|
"num_tokens": 1674108971.0, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 2.993174061433447, |
|
"grad_norm": 0.23381874902902047, |
|
"learning_rate": 5.000770765379057e-06, |
|
"loss": 0.3962, |
|
"num_tokens": 1676074131.0, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 2.9965870307167233, |
|
"grad_norm": 0.24395543004381443, |
|
"learning_rate": 5.000229319544913e-06, |
|
"loss": 0.3862, |
|
"num_tokens": 1677958772.0, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.24316523321927602, |
|
"learning_rate": 5.0000063699978795e-06, |
|
"loss": 0.3918, |
|
"num_tokens": 1679763258.0, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 4395, |
|
"total_flos": 3436437652111360.0, |
|
"train_loss": 0.4596632290483199, |
|
"train_runtime": 36029.2862, |
|
"train_samples_per_second": 7.805, |
|
"train_steps_per_second": 0.122 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4395, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3436437652111360.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|