{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 2932,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0068212824010914054,
      "grad_norm": 2.541781765483875,
      "learning_rate": 1.360544217687075e-06,
      "loss": 0.8582,
      "num_tokens": 3759146.0,
      "step": 5
    },
    {
      "epoch": 0.013642564802182811,
      "grad_norm": 1.7880590777745466,
      "learning_rate": 3.0612244897959185e-06,
      "loss": 0.852,
      "num_tokens": 7668808.0,
      "step": 10
    },
    {
      "epoch": 0.020463847203274217,
      "grad_norm": 1.1396843211846335,
      "learning_rate": 4.7619047619047615e-06,
      "loss": 0.7985,
      "num_tokens": 11368873.0,
      "step": 15
    },
    {
      "epoch": 0.027285129604365622,
      "grad_norm": 0.7380484470623474,
      "learning_rate": 6.462585034013606e-06,
      "loss": 0.7495,
      "num_tokens": 15118063.0,
      "step": 20
    },
    {
      "epoch": 0.034106412005457026,
      "grad_norm": 0.5940722286411614,
      "learning_rate": 8.163265306122448e-06,
      "loss": 0.7103,
      "num_tokens": 18906839.0,
      "step": 25
    },
    {
      "epoch": 0.040927694406548434,
      "grad_norm": 0.5396036096677221,
      "learning_rate": 9.863945578231292e-06,
      "loss": 0.6796,
      "num_tokens": 22641755.0,
      "step": 30
    },
    {
      "epoch": 0.047748976807639835,
      "grad_norm": 0.48176514966602646,
      "learning_rate": 1.1564625850340138e-05,
      "loss": 0.6702,
      "num_tokens": 26636629.0,
      "step": 35
    },
    {
      "epoch": 0.054570259208731244,
      "grad_norm": 0.4277566165637127,
      "learning_rate": 1.3265306122448982e-05,
      "loss": 0.6354,
      "num_tokens": 30417967.0,
      "step": 40
    },
    {
      "epoch": 0.061391541609822645,
      "grad_norm": 0.32948370527060555,
      "learning_rate": 1.4965986394557824e-05,
      "loss": 0.6353,
      "num_tokens": 34231333.0,
      "step": 45
    },
    {
      "epoch": 0.06821282401091405,
      "grad_norm": 0.30904400396091464,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.6331,
      "num_tokens": 37961424.0,
      "step": 50
    },
    {
      "epoch": 0.07503410641200546,
      "grad_norm": 0.3379471244414341,
      "learning_rate": 1.836734693877551e-05,
      "loss": 0.6307,
      "num_tokens": 41826860.0,
      "step": 55
    },
    {
      "epoch": 0.08185538881309687,
      "grad_norm": 0.3194776627652625,
      "learning_rate": 2.0068027210884355e-05,
      "loss": 0.6154,
      "num_tokens": 45543403.0,
      "step": 60
    },
    {
      "epoch": 0.08867667121418826,
      "grad_norm": 0.3576895809208668,
      "learning_rate": 2.17687074829932e-05,
      "loss": 0.6176,
      "num_tokens": 49369486.0,
      "step": 65
    },
    {
      "epoch": 0.09549795361527967,
      "grad_norm": 0.37184023838165053,
      "learning_rate": 2.3469387755102043e-05,
      "loss": 0.6053,
      "num_tokens": 53010874.0,
      "step": 70
    },
    {
      "epoch": 0.10231923601637108,
      "grad_norm": 0.3667912581195206,
      "learning_rate": 2.5170068027210887e-05,
      "loss": 0.6063,
      "num_tokens": 56909889.0,
      "step": 75
    },
    {
      "epoch": 0.10914051841746249,
      "grad_norm": 0.3730278875443016,
      "learning_rate": 2.687074829931973e-05,
      "loss": 0.5878,
      "num_tokens": 60650570.0,
      "step": 80
    },
    {
      "epoch": 0.11596180081855388,
      "grad_norm": 0.36241315719448003,
      "learning_rate": 2.857142857142857e-05,
      "loss": 0.5797,
      "num_tokens": 64564660.0,
      "step": 85
    },
    {
      "epoch": 0.12278308321964529,
      "grad_norm": 0.36828276580401526,
      "learning_rate": 3.0272108843537418e-05,
      "loss": 0.5973,
      "num_tokens": 68426882.0,
      "step": 90
    },
    {
      "epoch": 0.1296043656207367,
      "grad_norm": 0.34302064089420164,
      "learning_rate": 3.1972789115646265e-05,
      "loss": 0.5922,
      "num_tokens": 72252819.0,
      "step": 95
    },
    {
      "epoch": 0.1364256480218281,
      "grad_norm": 0.4550066149442531,
      "learning_rate": 3.36734693877551e-05,
      "loss": 0.5781,
      "num_tokens": 76160914.0,
      "step": 100
    },
    {
      "epoch": 0.1432469304229195,
      "grad_norm": 0.40436569138766876,
      "learning_rate": 3.5374149659863946e-05,
      "loss": 0.5795,
      "num_tokens": 80152955.0,
      "step": 105
    },
    {
      "epoch": 0.15006821282401092,
      "grad_norm": 0.3562171730848496,
      "learning_rate": 3.707482993197279e-05,
      "loss": 0.5826,
      "num_tokens": 83901702.0,
      "step": 110
    },
    {
      "epoch": 0.15688949522510232,
      "grad_norm": 0.42960332050781525,
      "learning_rate": 3.8775510204081634e-05,
      "loss": 0.5806,
      "num_tokens": 87691018.0,
      "step": 115
    },
    {
      "epoch": 0.16371077762619374,
      "grad_norm": 0.3710888003891392,
      "learning_rate": 4.047619047619048e-05,
      "loss": 0.5695,
      "num_tokens": 91542428.0,
      "step": 120
    },
    {
      "epoch": 0.17053206002728513,
      "grad_norm": 0.5761721385944898,
      "learning_rate": 4.217687074829932e-05,
      "loss": 0.5767,
      "num_tokens": 95438170.0,
      "step": 125
    },
    {
      "epoch": 0.17735334242837653,
      "grad_norm": 0.6241666610197041,
      "learning_rate": 4.387755102040816e-05,
      "loss": 0.5724,
      "num_tokens": 99383692.0,
      "step": 130
    },
    {
      "epoch": 0.18417462482946795,
      "grad_norm": 0.5871695733429951,
      "learning_rate": 4.557823129251701e-05,
      "loss": 0.5721,
      "num_tokens": 103223537.0,
      "step": 135
    },
    {
      "epoch": 0.19099590723055934,
      "grad_norm": 0.49124082309652556,
      "learning_rate": 4.7278911564625856e-05,
      "loss": 0.5849,
      "num_tokens": 106901687.0,
      "step": 140
    },
    {
      "epoch": 0.19781718963165076,
      "grad_norm": 0.4800174598604436,
      "learning_rate": 4.89795918367347e-05,
      "loss": 0.5564,
      "num_tokens": 110840758.0,
      "step": 145
    },
    {
      "epoch": 0.20463847203274216,
      "grad_norm": 0.44927716136774587,
      "learning_rate": 4.9999942738637725e-05,
      "loss": 0.5714,
      "num_tokens": 114521504.0,
      "step": 150
    },
    {
      "epoch": 0.21145975443383355,
      "grad_norm": 0.42337064215442904,
      "learning_rate": 4.999929855165921e-05,
      "loss": 0.5553,
      "num_tokens": 118445759.0,
      "step": 155
    },
    {
      "epoch": 0.21828103683492497,
      "grad_norm": 0.43044313054459166,
      "learning_rate": 4.999793862156041e-05,
      "loss": 0.5637,
      "num_tokens": 122311693.0,
      "step": 160
    },
    {
      "epoch": 0.22510231923601637,
      "grad_norm": 0.4339776202600696,
      "learning_rate": 4.999586299160312e-05,
      "loss": 0.566,
      "num_tokens": 126094524.0,
      "step": 165
    },
    {
      "epoch": 0.23192360163710776,
      "grad_norm": 0.40738667516921956,
      "learning_rate": 4.999307172781686e-05,
      "loss": 0.5592,
      "num_tokens": 129971056.0,
      "step": 170
    },
    {
      "epoch": 0.23874488403819918,
      "grad_norm": 0.40693766273399773,
      "learning_rate": 4.998956491899676e-05,
      "loss": 0.5735,
      "num_tokens": 133765982.0,
      "step": 175
    },
    {
      "epoch": 0.24556616643929058,
      "grad_norm": 0.45034001459143336,
      "learning_rate": 4.9985342676700705e-05,
      "loss": 0.5531,
      "num_tokens": 137522281.0,
      "step": 180
    },
    {
      "epoch": 0.252387448840382,
      "grad_norm": 0.36073152691456045,
      "learning_rate": 4.998040513524581e-05,
      "loss": 0.5474,
      "num_tokens": 141196084.0,
      "step": 185
    },
    {
      "epoch": 0.2592087312414734,
      "grad_norm": 0.3992148561894294,
      "learning_rate": 4.997475245170414e-05,
      "loss": 0.5583,
      "num_tokens": 145136704.0,
      "step": 190
    },
    {
      "epoch": 0.2660300136425648,
      "grad_norm": 0.38393616918462065,
      "learning_rate": 4.996838480589772e-05,
      "loss": 0.546,
      "num_tokens": 148940122.0,
      "step": 195
    },
    {
      "epoch": 0.2728512960436562,
      "grad_norm": 0.34915039592664654,
      "learning_rate": 4.9961302400392804e-05,
      "loss": 0.5555,
      "num_tokens": 152809678.0,
      "step": 200
    },
    {
      "epoch": 0.27967257844474763,
      "grad_norm": 0.4709177431986326,
      "learning_rate": 4.9953505460493435e-05,
      "loss": 0.549,
      "num_tokens": 156683573.0,
      "step": 205
    },
    {
      "epoch": 0.286493860845839,
      "grad_norm": 0.3989622124157632,
      "learning_rate": 4.99449942342343e-05,
      "loss": 0.5502,
      "num_tokens": 160433326.0,
      "step": 210
    },
    {
      "epoch": 0.2933151432469304,
      "grad_norm": 0.2963557816370656,
      "learning_rate": 4.993576899237278e-05,
      "loss": 0.5534,
      "num_tokens": 164374316.0,
      "step": 215
    },
    {
      "epoch": 0.30013642564802184,
      "grad_norm": 0.33146262474141547,
      "learning_rate": 4.992583002838041e-05,
      "loss": 0.5325,
      "num_tokens": 168223299.0,
      "step": 220
    },
    {
      "epoch": 0.3069577080491132,
      "grad_norm": 0.3804270204403636,
      "learning_rate": 4.991517765843349e-05,
      "loss": 0.5454,
      "num_tokens": 172132000.0,
      "step": 225
    },
    {
      "epoch": 0.31377899045020463,
      "grad_norm": 0.4050739455664775,
      "learning_rate": 4.990381222140305e-05,
      "loss": 0.541,
      "num_tokens": 176086331.0,
      "step": 230
    },
    {
      "epoch": 0.32060027285129605,
      "grad_norm": 0.438158122564832,
      "learning_rate": 4.989173407884408e-05,
      "loss": 0.533,
      "num_tokens": 179917640.0,
      "step": 235
    },
    {
      "epoch": 0.3274215552523875,
      "grad_norm": 0.37482443871129906,
      "learning_rate": 4.987894361498399e-05,
      "loss": 0.5457,
      "num_tokens": 183746129.0,
      "step": 240
    },
    {
      "epoch": 0.33424283765347884,
      "grad_norm": 0.40883725910500407,
      "learning_rate": 4.9865441236710415e-05,
      "loss": 0.5542,
      "num_tokens": 187699370.0,
      "step": 245
    },
    {
      "epoch": 0.34106412005457026,
      "grad_norm": 0.36840773537703164,
      "learning_rate": 4.985122737355828e-05,
      "loss": 0.5362,
      "num_tokens": 191512142.0,
      "step": 250
    },
    {
      "epoch": 0.3478854024556617,
      "grad_norm": 0.35196282145686897,
      "learning_rate": 4.983630247769613e-05,
      "loss": 0.5373,
      "num_tokens": 195367749.0,
      "step": 255
    },
    {
      "epoch": 0.35470668485675305,
      "grad_norm": 0.32453218633384984,
      "learning_rate": 4.982066702391169e-05,
      "loss": 0.5259,
      "num_tokens": 199215371.0,
      "step": 260
    },
    {
      "epoch": 0.3615279672578445,
      "grad_norm": 0.3333209805493323,
      "learning_rate": 4.980432150959687e-05,
      "loss": 0.5455,
      "num_tokens": 202903048.0,
      "step": 265
    },
    {
      "epoch": 0.3683492496589359,
      "grad_norm": 0.34300836131361134,
      "learning_rate": 4.978726645473186e-05,
      "loss": 0.5344,
      "num_tokens": 206761213.0,
      "step": 270
    },
    {
      "epoch": 0.37517053206002726,
      "grad_norm": 0.33687880661376,
      "learning_rate": 4.976950240186857e-05,
      "loss": 0.5302,
      "num_tokens": 210585632.0,
      "step": 275
    },
    {
      "epoch": 0.3819918144611187,
      "grad_norm": 0.3062190092375375,
      "learning_rate": 4.975102991611348e-05,
      "loss": 0.536,
      "num_tokens": 214261738.0,
      "step": 280
    },
    {
      "epoch": 0.3888130968622101,
      "grad_norm": 0.3228746457123477,
      "learning_rate": 4.973184958510955e-05,
      "loss": 0.5404,
      "num_tokens": 218144024.0,
      "step": 285
    },
    {
      "epoch": 0.3956343792633015,
      "grad_norm": 0.3198594107258337,
      "learning_rate": 4.971196201901757e-05,
      "loss": 0.5361,
      "num_tokens": 222057295.0,
      "step": 290
    },
    {
      "epoch": 0.4024556616643929,
      "grad_norm": 0.3994626681138991,
      "learning_rate": 4.969136785049676e-05,
      "loss": 0.5275,
      "num_tokens": 225828573.0,
      "step": 295
    },
    {
      "epoch": 0.4092769440654843,
      "grad_norm": 0.36849928732257725,
      "learning_rate": 4.9670067734684625e-05,
      "loss": 0.5272,
      "num_tokens": 229710487.0,
      "step": 300
    },
    {
      "epoch": 0.41609822646657574,
      "grad_norm": 0.3489875276180288,
      "learning_rate": 4.9648062349176145e-05,
      "loss": 0.5422,
      "num_tokens": 233617525.0,
      "step": 305
    },
    {
      "epoch": 0.4229195088676671,
      "grad_norm": 0.36792321807790457,
      "learning_rate": 4.962535239400217e-05,
      "loss": 0.5396,
      "num_tokens": 237394471.0,
      "step": 310
    },
    {
      "epoch": 0.4297407912687585,
      "grad_norm": 0.34720260466539804,
      "learning_rate": 4.9601938591607175e-05,
      "loss": 0.5338,
      "num_tokens": 241114261.0,
      "step": 315
    },
    {
      "epoch": 0.43656207366984995,
      "grad_norm": 0.3329495509148233,
      "learning_rate": 4.9577821686826304e-05,
      "loss": 0.5342,
      "num_tokens": 244967987.0,
      "step": 320
    },
    {
      "epoch": 0.4433833560709413,
      "grad_norm": 0.34562763504560073,
      "learning_rate": 4.9553002446861634e-05,
      "loss": 0.5406,
      "num_tokens": 248751095.0,
      "step": 325
    },
    {
      "epoch": 0.45020463847203274,
      "grad_norm": 0.385098545376304,
      "learning_rate": 4.952748166125779e-05,
      "loss": 0.528,
      "num_tokens": 252607265.0,
      "step": 330
    },
    {
      "epoch": 0.45702592087312416,
      "grad_norm": 0.32550400486163533,
      "learning_rate": 4.950126014187683e-05,
      "loss": 0.5344,
      "num_tokens": 256417909.0,
      "step": 335
    },
    {
      "epoch": 0.4638472032742155,
      "grad_norm": 0.3806963854916901,
      "learning_rate": 4.9474338722872404e-05,
      "loss": 0.5272,
      "num_tokens": 260456429.0,
      "step": 340
    },
    {
      "epoch": 0.47066848567530695,
      "grad_norm": 0.32980302955349,
      "learning_rate": 4.9446718260663234e-05,
      "loss": 0.5203,
      "num_tokens": 264287905.0,
      "step": 345
    },
    {
      "epoch": 0.47748976807639837,
      "grad_norm": 0.3603093816461743,
      "learning_rate": 4.941839963390585e-05,
      "loss": 0.529,
      "num_tokens": 268098790.0,
      "step": 350
    },
    {
      "epoch": 0.4843110504774898,
      "grad_norm": 0.3631235151389752,
      "learning_rate": 4.9389383743466675e-05,
      "loss": 0.5383,
      "num_tokens": 271974065.0,
      "step": 355
    },
    {
      "epoch": 0.49113233287858116,
      "grad_norm": 0.40273291547493417,
      "learning_rate": 4.935967151239331e-05,
      "loss": 0.5422,
      "num_tokens": 275852045.0,
      "step": 360
    },
    {
      "epoch": 0.4979536152796726,
      "grad_norm": 0.36255296270077164,
      "learning_rate": 4.932926388588524e-05,
      "loss": 0.5198,
      "num_tokens": 279504484.0,
      "step": 365
    },
    {
      "epoch": 0.504774897680764,
      "grad_norm": 0.3182608267836354,
      "learning_rate": 4.92981618312637e-05,
      "loss": 0.5217,
      "num_tokens": 283461546.0,
      "step": 370
    },
    {
      "epoch": 0.5115961800818554,
      "grad_norm": 0.36234286989341263,
      "learning_rate": 4.9266366337940945e-05,
      "loss": 0.53,
      "num_tokens": 287371918.0,
      "step": 375
    },
    {
      "epoch": 0.5184174624829468,
      "grad_norm": 0.4092201031412652,
      "learning_rate": 4.923387841738875e-05,
      "loss": 0.5162,
      "num_tokens": 291057738.0,
      "step": 380
    },
    {
      "epoch": 0.5252387448840382,
      "grad_norm": 0.30609433227409455,
      "learning_rate": 4.920069910310625e-05,
      "loss": 0.5255,
      "num_tokens": 294881963.0,
      "step": 385
    },
    {
      "epoch": 0.5320600272851296,
      "grad_norm": 0.3224125224406666,
      "learning_rate": 4.9166829450587075e-05,
      "loss": 0.5282,
      "num_tokens": 298677895.0,
      "step": 390
    },
    {
      "epoch": 0.538881309686221,
      "grad_norm": 0.2959811215364232,
      "learning_rate": 4.9132270537285726e-05,
      "loss": 0.512,
      "num_tokens": 302387985.0,
      "step": 395
    },
    {
      "epoch": 0.5457025920873124,
      "grad_norm": 20.354817642986546,
      "learning_rate": 4.9097023462583345e-05,
      "loss": 0.5424,
      "num_tokens": 306130593.0,
      "step": 400
    },
    {
      "epoch": 0.5525238744884038,
      "grad_norm": 0.42997615431371866,
      "learning_rate": 4.906108934775272e-05,
      "loss": 0.5344,
      "num_tokens": 309952008.0,
      "step": 405
    },
    {
      "epoch": 0.5593451568894953,
      "grad_norm": 0.43537708372363537,
      "learning_rate": 4.902446933592261e-05,
      "loss": 0.5181,
      "num_tokens": 313725489.0,
      "step": 410
    },
    {
      "epoch": 0.5661664392905866,
      "grad_norm": 0.3490812508662286,
      "learning_rate": 4.8987164592041416e-05,
      "loss": 0.5226,
      "num_tokens": 317525148.0,
      "step": 415
    },
    {
      "epoch": 0.572987721691678,
      "grad_norm": 0.32574253533741937,
      "learning_rate": 4.894917630284007e-05,
      "loss": 0.5223,
      "num_tokens": 321410101.0,
      "step": 420
    },
    {
      "epoch": 0.5798090040927695,
      "grad_norm": 0.3086970265131434,
      "learning_rate": 4.891050567679433e-05,
      "loss": 0.5174,
      "num_tokens": 325102278.0,
      "step": 425
    },
    {
      "epoch": 0.5866302864938608,
      "grad_norm": 0.3208378690386464,
      "learning_rate": 4.88711539440863e-05,
      "loss": 0.522,
      "num_tokens": 328831071.0,
      "step": 430
    },
    {
      "epoch": 0.5934515688949522,
      "grad_norm": 0.3106759301738243,
      "learning_rate": 4.8831122356565323e-05,
      "loss": 0.5116,
      "num_tokens": 332767044.0,
      "step": 435
    },
    {
      "epoch": 0.6002728512960437,
      "grad_norm": 0.3108281530425403,
      "learning_rate": 4.8790412187708125e-05,
      "loss": 0.509,
      "num_tokens": 336595638.0,
      "step": 440
    },
    {
      "epoch": 0.607094133697135,
      "grad_norm": 0.3147467882312625,
      "learning_rate": 4.874902473257835e-05,
      "loss": 0.5218,
      "num_tokens": 340358925.0,
      "step": 445
    },
    {
      "epoch": 0.6139154160982264,
      "grad_norm": 0.3366316381524695,
      "learning_rate": 4.870696130778532e-05,
      "loss": 0.5206,
      "num_tokens": 344239058.0,
      "step": 450
    },
    {
      "epoch": 0.6207366984993179,
      "grad_norm": 0.3171021318012896,
      "learning_rate": 4.8664223251442154e-05,
      "loss": 0.524,
      "num_tokens": 348080585.0,
      "step": 455
    },
    {
      "epoch": 0.6275579809004093,
      "grad_norm": 0.3178449068910906,
      "learning_rate": 4.862081192312326e-05,
      "loss": 0.5176,
      "num_tokens": 351817695.0,
      "step": 460
    },
    {
      "epoch": 0.6343792633015006,
      "grad_norm": 0.363976784318056,
      "learning_rate": 4.8576728703820976e-05,
      "loss": 0.5122,
      "num_tokens": 355639183.0,
      "step": 465
    },
    {
      "epoch": 0.6412005457025921,
      "grad_norm": 0.33984465023934035,
      "learning_rate": 4.853197499590174e-05,
      "loss": 0.5308,
      "num_tokens": 359437581.0,
      "step": 470
    },
    {
      "epoch": 0.6480218281036835,
      "grad_norm": 0.32388712017688437,
      "learning_rate": 4.848655222306144e-05,
      "loss": 0.5066,
      "num_tokens": 363189983.0,
      "step": 475
    },
    {
      "epoch": 0.654843110504775,
      "grad_norm": 0.29612685969086366,
      "learning_rate": 4.844046183028009e-05,
      "loss": 0.509,
      "num_tokens": 366863209.0,
      "step": 480
    },
    {
      "epoch": 0.6616643929058663,
      "grad_norm": 0.34029178906832475,
      "learning_rate": 4.839370528377592e-05,
      "loss": 0.5231,
      "num_tokens": 370725860.0,
      "step": 485
    },
    {
      "epoch": 0.6684856753069577,
      "grad_norm": 0.3103169185956682,
      "learning_rate": 4.834628407095871e-05,
      "loss": 0.5085,
      "num_tokens": 374566515.0,
      "step": 490
    },
    {
      "epoch": 0.6753069577080492,
      "grad_norm": 0.2800430805166771,
      "learning_rate": 4.829819970038245e-05,
      "loss": 0.5012,
      "num_tokens": 378330489.0,
      "step": 495
    },
    {
      "epoch": 0.6821282401091405,
      "grad_norm": 0.34884629801915895,
      "learning_rate": 4.8249453701697385e-05,
      "loss": 0.5059,
      "num_tokens": 382103468.0,
      "step": 500
    },
    {
      "epoch": 0.6889495225102319,
      "grad_norm": 0.3197637736232303,
      "learning_rate": 4.820004762560134e-05,
      "loss": 0.5144,
      "num_tokens": 385837297.0,
      "step": 505
    },
    {
      "epoch": 0.6957708049113234,
      "grad_norm": 0.3936823154590106,
      "learning_rate": 4.814998304379036e-05,
      "loss": 0.5117,
      "num_tokens": 389586680.0,
      "step": 510
    },
    {
      "epoch": 0.7025920873124147,
      "grad_norm": 0.3172798834226314,
      "learning_rate": 4.8099261548908773e-05,
      "loss": 0.5162,
      "num_tokens": 393428760.0,
      "step": 515
    },
    {
      "epoch": 0.7094133697135061,
      "grad_norm": 0.3160858881655075,
      "learning_rate": 4.8047884754498495e-05,
      "loss": 0.5279,
      "num_tokens": 397158700.0,
      "step": 520
    },
    {
      "epoch": 0.7162346521145976,
      "grad_norm": 0.37082219207657685,
      "learning_rate": 4.799585429494768e-05,
      "loss": 0.5123,
      "num_tokens": 400923938.0,
      "step": 525
    },
    {
      "epoch": 0.723055934515689,
      "grad_norm": 0.31992077884834463,
      "learning_rate": 4.794317182543875e-05,
      "loss": 0.506,
      "num_tokens": 404685655.0,
      "step": 530
    },
    {
      "epoch": 0.7298772169167803,
      "grad_norm": 0.29913070231582056,
      "learning_rate": 4.7889839021895724e-05,
      "loss": 0.5117,
      "num_tokens": 408582799.0,
      "step": 535
    },
    {
      "epoch": 0.7366984993178718,
      "grad_norm": 0.28227205669712635,
      "learning_rate": 4.783585758093095e-05,
      "loss": 0.517,
      "num_tokens": 412386009.0,
      "step": 540
    },
    {
      "epoch": 0.7435197817189632,
      "grad_norm": 0.3067668406718358,
      "learning_rate": 4.778122921979104e-05,
      "loss": 0.5201,
      "num_tokens": 416372039.0,
      "step": 545
    },
    {
      "epoch": 0.7503410641200545,
      "grad_norm": 0.3186838734464595,
      "learning_rate": 4.772595567630237e-05,
      "loss": 0.5046,
      "num_tokens": 420051975.0,
      "step": 550
    },
    {
      "epoch": 0.757162346521146,
      "grad_norm": 0.34804937870860536,
      "learning_rate": 4.7670038708815676e-05,
      "loss": 0.5051,
      "num_tokens": 423685709.0,
      "step": 555
    },
    {
      "epoch": 0.7639836289222374,
      "grad_norm": 0.33506793399336343,
      "learning_rate": 4.761348009615018e-05,
      "loss": 0.5011,
      "num_tokens": 427405904.0,
      "step": 560
    },
    {
      "epoch": 0.7708049113233287,
      "grad_norm": 0.34443735099415185,
      "learning_rate": 4.7556281637536985e-05,
      "loss": 0.5011,
      "num_tokens": 431288378.0,
      "step": 565
    },
    {
      "epoch": 0.7776261937244202,
      "grad_norm": 0.32414846184121815,
      "learning_rate": 4.7498445152561864e-05,
      "loss": 0.5042,
      "num_tokens": 435077995.0,
      "step": 570
    },
    {
      "epoch": 0.7844474761255116,
      "grad_norm": 0.3176009794001488,
      "learning_rate": 4.743997248110733e-05,
      "loss": 0.5041,
      "num_tokens": 439006864.0,
      "step": 575
    },
    {
      "epoch": 0.791268758526603,
      "grad_norm": 0.3149518360403247,
      "learning_rate": 4.738086548329416e-05,
      "loss": 0.5192,
      "num_tokens": 443045688.0,
      "step": 580
    },
    {
      "epoch": 0.7980900409276944,
      "grad_norm": 0.3207146432446857,
      "learning_rate": 4.732112603942216e-05,
      "loss": 0.5065,
      "num_tokens": 446852018.0,
      "step": 585
    },
    {
      "epoch": 0.8049113233287858,
      "grad_norm": 0.2888771649434731,
      "learning_rate": 4.7260756049910406e-05,
      "loss": 0.5203,
      "num_tokens": 450687287.0,
      "step": 590
    },
    {
      "epoch": 0.8117326057298773,
      "grad_norm": 0.3057129065721774,
      "learning_rate": 4.7199757435236744e-05,
      "loss": 0.5057,
      "num_tokens": 454483896.0,
      "step": 595
    },
    {
      "epoch": 0.8185538881309686,
      "grad_norm": 0.28756788730889826,
      "learning_rate": 4.713813213587674e-05,
      "loss": 0.5036,
      "num_tokens": 458325861.0,
      "step": 600
    },
    {
      "epoch": 0.82537517053206,
      "grad_norm": 0.30014832786980916,
      "learning_rate": 4.70758821122419e-05,
      "loss": 0.5092,
      "num_tokens": 462055868.0,
      "step": 605
    },
    {
      "epoch": 0.8321964529331515,
      "grad_norm": 0.3111436835827166,
      "learning_rate": 4.701300934461736e-05,
      "loss": 0.5032,
      "num_tokens": 465973876.0,
      "step": 610
    },
    {
      "epoch": 0.8390177353342428,
      "grad_norm": 0.32758808149750535,
      "learning_rate": 4.6949515833098824e-05,
      "loss": 0.5004,
      "num_tokens": 469791498.0,
      "step": 615
    },
    {
      "epoch": 0.8458390177353342,
      "grad_norm": 0.27029086656552975,
      "learning_rate": 4.688540359752902e-05,
      "loss": 0.4979,
      "num_tokens": 473570581.0,
      "step": 620
    },
    {
      "epoch": 0.8526603001364257,
      "grad_norm": 0.29349068259319094,
      "learning_rate": 4.6820674677433376e-05,
      "loss": 0.5139,
      "num_tokens": 477401353.0,
      "step": 625
    },
    {
      "epoch": 0.859481582537517,
      "grad_norm": 0.2866715279919294,
      "learning_rate": 4.675533113195515e-05,
      "loss": 0.5129,
      "num_tokens": 481348892.0,
      "step": 630
    },
    {
      "epoch": 0.8663028649386084,
      "grad_norm": 0.2864693588210585,
      "learning_rate": 4.6689375039789954e-05,
      "loss": 0.5108,
      "num_tokens": 485171910.0,
      "step": 635
    },
    {
      "epoch": 0.8731241473396999,
      "grad_norm": 0.27626775639483725,
      "learning_rate": 4.6622808499119625e-05,
      "loss": 0.4956,
      "num_tokens": 489059548.0,
      "step": 640
    },
    {
      "epoch": 0.8799454297407913,
      "grad_norm": 0.2946930273512276,
      "learning_rate": 4.655563362754543e-05,
      "loss": 0.4989,
      "num_tokens": 492893029.0,
      "step": 645
    },
    {
      "epoch": 0.8867667121418826,
      "grad_norm": 0.3238619095461243,
      "learning_rate": 4.648785256202076e-05,
      "loss": 0.5064,
      "num_tokens": 496905674.0,
      "step": 650
    },
    {
      "epoch": 0.8935879945429741,
      "grad_norm": 0.27166442143334074,
      "learning_rate": 4.6419467458783125e-05,
      "loss": 0.5012,
      "num_tokens": 500864542.0,
      "step": 655
    },
    {
      "epoch": 0.9004092769440655,
      "grad_norm": 0.35133448756025343,
      "learning_rate": 4.635048049328555e-05,
      "loss": 0.505,
      "num_tokens": 504810366.0,
      "step": 660
    },
    {
      "epoch": 0.9072305593451568,
      "grad_norm": 0.35555108957660303,
      "learning_rate": 4.628089386012737e-05,
      "loss": 0.5011,
      "num_tokens": 508607232.0,
      "step": 665
    },
    {
      "epoch": 0.9140518417462483,
      "grad_norm": 0.32064397314940585,
      "learning_rate": 4.621070977298446e-05,
      "loss": 0.508,
      "num_tokens": 512474084.0,
      "step": 670
    },
    {
      "epoch": 0.9208731241473397,
      "grad_norm": 0.28781771682272567,
      "learning_rate": 4.613993046453875e-05,
      "loss": 0.4986,
      "num_tokens": 516216104.0,
      "step": 675
    },
    {
      "epoch": 0.927694406548431,
      "grad_norm": 0.278681463872458,
      "learning_rate": 4.606855818640724e-05,
      "loss": 0.5079,
      "num_tokens": 519966345.0,
      "step": 680
    },
    {
      "epoch": 0.9345156889495225,
      "grad_norm": 0.2777129164143007,
      "learning_rate": 4.5996595209070356e-05,
      "loss": 0.4934,
      "num_tokens": 523793837.0,
      "step": 685
    },
    {
      "epoch": 0.9413369713506139,
      "grad_norm": 0.29049878841827675,
      "learning_rate": 4.5924043821799734e-05,
      "loss": 0.5069,
      "num_tokens": 527666864.0,
      "step": 690
    },
    {
      "epoch": 0.9481582537517054,
      "grad_norm": 0.30785475125106526,
      "learning_rate": 4.585090633258539e-05,
      "loss": 0.5042,
      "num_tokens": 531466359.0,
      "step": 695
    },
    {
      "epoch": 0.9549795361527967,
      "grad_norm": 0.29390951682226474,
      "learning_rate": 4.577718506806228e-05,
      "loss": 0.5066,
      "num_tokens": 535253217.0,
      "step": 700
    },
    {
      "epoch": 0.9618008185538881,
      "grad_norm": 0.33095337694320254,
      "learning_rate": 4.570288237343632e-05,
      "loss": 0.4904,
      "num_tokens": 539137436.0,
      "step": 705
    },
    {
      "epoch": 0.9686221009549796,
      "grad_norm": 0.2850378778466875,
      "learning_rate": 4.562800061240975e-05,
      "loss": 0.5009,
      "num_tokens": 542824098.0,
      "step": 710
    },
    {
      "epoch": 0.975443383356071,
      "grad_norm": 0.30760761583081375,
      "learning_rate": 4.555254216710597e-05,
      "loss": 0.5119,
      "num_tokens": 546578047.0,
      "step": 715
    },
    {
      "epoch": 0.9822646657571623,
      "grad_norm": 0.27149624139361506,
      "learning_rate": 4.5476509437993726e-05,
      "loss": 0.4976,
      "num_tokens": 550377811.0,
      "step": 720
    },
    {
      "epoch": 0.9890859481582538,
      "grad_norm": 0.30828520368635326,
      "learning_rate": 4.5399904843810756e-05,
      "loss": 0.4941,
      "num_tokens": 554224024.0,
      "step": 725
    },
    {
      "epoch": 0.9959072305593452,
      "grad_norm": 0.3464146216808222,
      "learning_rate": 4.532273082148689e-05,
      "loss": 0.5015,
      "num_tokens": 558001366.0,
      "step": 730
    },
    {
      "epoch": 1.0027285129604366,
      "grad_norm": 0.3490549767071155,
      "learning_rate": 4.5244989826066444e-05,
      "loss": 0.4889,
      "num_tokens": 561842493.0,
      "step": 735
    },
    {
      "epoch": 1.009549795361528,
      "grad_norm": 0.3148778232856733,
      "learning_rate": 4.51666843306302e-05,
      "loss": 0.4679,
      "num_tokens": 565569218.0,
      "step": 740
    },
    {
      "epoch": 1.0163710777626194,
      "grad_norm": 0.3731178499173407,
      "learning_rate": 4.5087816826216695e-05,
      "loss": 0.4764,
      "num_tokens": 569401220.0,
      "step": 745
    },
    {
      "epoch": 1.0231923601637107,
      "grad_norm": 0.34226324439808514,
      "learning_rate": 4.500838982174297e-05,
      "loss": 0.4766,
      "num_tokens": 573382420.0,
      "step": 750
    },
    {
      "epoch": 1.030013642564802,
      "grad_norm": 0.2805013542076892,
      "learning_rate": 4.492840584392478e-05,
      "loss": 0.4671,
      "num_tokens": 577211964.0,
      "step": 755
    },
    {
      "epoch": 1.0368349249658937,
      "grad_norm": 0.3065721441536721,
      "learning_rate": 4.484786743719619e-05,
      "loss": 0.4689,
      "num_tokens": 581185526.0,
      "step": 760
    },
    {
      "epoch": 1.043656207366985,
      "grad_norm": 0.2936712359839026,
      "learning_rate": 4.4766777163628656e-05,
      "loss": 0.4712,
      "num_tokens": 584961788.0,
      "step": 765
    },
    {
      "epoch": 1.0504774897680764,
      "grad_norm": 0.2764462935192823,
      "learning_rate": 4.468513760284952e-05,
      "loss": 0.4797,
      "num_tokens": 588808530.0,
      "step": 770
    },
    {
      "epoch": 1.0572987721691678,
      "grad_norm": 0.28592666412762696,
      "learning_rate": 4.460295135195991e-05,
      "loss": 0.4599,
      "num_tokens": 592600556.0,
      "step": 775
    },
    {
      "epoch": 1.0641200545702592,
      "grad_norm": 0.29631998771015455,
      "learning_rate": 4.452022102545217e-05,
      "loss": 0.4673,
      "num_tokens": 596480393.0,
      "step": 780
    },
    {
      "epoch": 1.0709413369713505,
      "grad_norm": 0.26505805501945173,
      "learning_rate": 4.443694925512665e-05,
      "loss": 0.4717,
      "num_tokens": 600450795.0,
      "step": 785
    },
    {
      "epoch": 1.077762619372442,
      "grad_norm": 0.28531533545747306,
      "learning_rate": 4.4353138690008026e-05,
      "loss": 0.474,
      "num_tokens": 604180876.0,
      "step": 790
    },
    {
      "epoch": 1.0845839017735335,
      "grad_norm": 0.2780327870404011,
      "learning_rate": 4.426879199626098e-05,
      "loss": 0.467,
      "num_tokens": 607934610.0,
      "step": 795
    },
    {
      "epoch": 1.0914051841746248,
      "grad_norm": 0.27715466802943606,
      "learning_rate": 4.418391185710543e-05,
      "loss": 0.4695,
      "num_tokens": 611651241.0,
      "step": 800
    },
    {
      "epoch": 1.0982264665757162,
      "grad_norm": 0.31591464715547607,
      "learning_rate": 4.409850097273113e-05,
      "loss": 0.4675,
      "num_tokens": 615434308.0,
      "step": 805
    },
    {
      "epoch": 1.1050477489768076,
      "grad_norm": 0.26233476106629194,
      "learning_rate": 4.401256206021181e-05,
      "loss": 0.4657,
      "num_tokens": 619368833.0,
      "step": 810
    },
    {
      "epoch": 1.111869031377899,
      "grad_norm": 0.27576262263266726,
      "learning_rate": 4.39260978534187e-05,
      "loss": 0.4639,
      "num_tokens": 623126366.0,
      "step": 815
    },
    {
      "epoch": 1.1186903137789905,
      "grad_norm": 0.2746654826395091,
      "learning_rate": 4.383911110293363e-05,
      "loss": 0.4765,
      "num_tokens": 627107529.0,
      "step": 820
    },
    {
      "epoch": 1.125511596180082,
      "grad_norm": 0.24775871464017238,
      "learning_rate": 4.375160457596144e-05,
      "loss": 0.4651,
      "num_tokens": 630903442.0,
      "step": 825
    },
    {
      "epoch": 1.1323328785811733,
      "grad_norm": 0.24655144689998953,
      "learning_rate": 4.3663581056242e-05,
      "loss": 0.4606,
      "num_tokens": 634517705.0,
      "step": 830
    },
    {
      "epoch": 1.1391541609822646,
      "grad_norm": 0.27895983869139385,
      "learning_rate": 4.357504334396168e-05,
      "loss": 0.4685,
      "num_tokens": 638096091.0,
      "step": 835
    },
    {
      "epoch": 1.145975443383356,
      "grad_norm": 0.31816753414625654,
      "learning_rate": 4.348599425566422e-05,
      "loss": 0.4684,
      "num_tokens": 642002612.0,
      "step": 840
    },
    {
      "epoch": 1.1527967257844476,
      "grad_norm": 0.26724451611429256,
      "learning_rate": 4.3396436624161125e-05,
      "loss": 0.4735,
      "num_tokens": 645977691.0,
      "step": 845
    },
    {
      "epoch": 1.159618008185539,
      "grad_norm": 0.32927796404239074,
      "learning_rate": 4.330637329844162e-05,
      "loss": 0.4667,
      "num_tokens": 649827816.0,
      "step": 850
    },
    {
      "epoch": 1.1664392905866303,
      "grad_norm": 0.34404133032544415,
      "learning_rate": 4.321580714358193e-05,
      "loss": 0.4765,
      "num_tokens": 653510445.0,
      "step": 855
    },
    {
      "epoch": 1.1732605729877217,
      "grad_norm": 0.27768133752278,
      "learning_rate": 4.3124741040654217e-05,
      "loss": 0.4715,
      "num_tokens": 657384998.0,
      "step": 860
    },
    {
      "epoch": 1.180081855388813,
      "grad_norm": 0.25025345440677405,
      "learning_rate": 4.3033177886634845e-05,
      "loss": 0.4665,
      "num_tokens": 661293472.0,
      "step": 865
    },
    {
      "epoch": 1.1869031377899044,
      "grad_norm": 0.30457046427971657,
      "learning_rate": 4.2941120594312315e-05,
      "loss": 0.4748,
      "num_tokens": 665091673.0,
      "step": 870
    },
    {
      "epoch": 1.1937244201909958,
      "grad_norm": 0.23877952147739753,
      "learning_rate": 4.2848572092194513e-05,
      "loss": 0.4728,
      "num_tokens": 668922968.0,
      "step": 875
    },
    {
      "epoch": 1.2005457025920874,
      "grad_norm": 0.2520844593923701,
      "learning_rate": 4.275553532441562e-05,
      "loss": 0.4644,
      "num_tokens": 672846634.0,
      "step": 880
    },
    {
      "epoch": 1.2073669849931787,
      "grad_norm": 0.3147581673447297,
      "learning_rate": 4.266201325064242e-05,
      "loss": 0.4627,
      "num_tokens": 676655826.0,
      "step": 885
    },
    {
      "epoch": 1.21418826739427,
      "grad_norm": 0.24800806438269005,
      "learning_rate": 4.256800884598013e-05,
      "loss": 0.4786,
      "num_tokens": 680322801.0,
      "step": 890
    },
    {
      "epoch": 1.2210095497953615,
      "grad_norm": 2.4907630647282,
      "learning_rate": 4.2473525100877823e-05,
      "loss": 0.4739,
      "num_tokens": 684119272.0,
      "step": 895
    },
    {
      "epoch": 1.2278308321964528,
      "grad_norm": 0.30229873985443184,
      "learning_rate": 4.23785650210332e-05,
      "loss": 0.467,
      "num_tokens": 687991406.0,
      "step": 900
    },
    {
      "epoch": 1.2346521145975444,
      "grad_norm": 0.26793479462106135,
      "learning_rate": 4.228313162729706e-05,
      "loss": 0.4768,
      "num_tokens": 691695054.0,
      "step": 905
    },
    {
      "epoch": 1.2414733969986358,
      "grad_norm": 0.2849637202449397,
      "learning_rate": 4.218722795557717e-05,
      "loss": 0.4681,
      "num_tokens": 695621817.0,
      "step": 910
    },
    {
      "epoch": 1.2482946793997272,
      "grad_norm": 0.25879554486736855,
      "learning_rate": 4.2090857056741676e-05,
      "loss": 0.4553,
      "num_tokens": 699480008.0,
      "step": 915
    },
    {
      "epoch": 1.2551159618008185,
      "grad_norm": 0.2878790201694137,
      "learning_rate": 4.199402199652205e-05,
      "loss": 0.4502,
      "num_tokens": 703195130.0,
      "step": 920
    },
    {
      "epoch": 1.26193724420191,
      "grad_norm": 0.30195551712906077,
      "learning_rate": 4.189672585541558e-05,
      "loss": 0.4686,
      "num_tokens": 707113602.0,
      "step": 925
    },
    {
      "epoch": 1.2687585266030013,
      "grad_norm": 0.2604967174312856,
      "learning_rate": 4.1798971728587375e-05,
      "loss": 0.4659,
      "num_tokens": 710925367.0,
      "step": 930
    },
    {
      "epoch": 1.2755798090040928,
      "grad_norm": 0.2781371955226694,
      "learning_rate": 4.170076272577186e-05,
      "loss": 0.464,
      "num_tokens": 714722146.0,
      "step": 935
    },
    {
      "epoch": 1.2824010914051842,
      "grad_norm": 0.27482737768170556,
      "learning_rate": 4.160210197117392e-05,
      "loss": 0.4608,
      "num_tokens": 718555460.0,
      "step": 940
    },
    {
      "epoch": 1.2892223738062756,
      "grad_norm": 0.31633385102337147,
      "learning_rate": 4.150299260336947e-05,
      "loss": 0.4638,
      "num_tokens": 722379411.0,
      "step": 945
    },
    {
      "epoch": 1.296043656207367,
      "grad_norm": 0.2982673873153893,
      "learning_rate": 4.14034377752056e-05,
      "loss": 0.4732,
      "num_tokens": 726285777.0,
      "step": 950
    },
    {
      "epoch": 1.3028649386084583,
      "grad_norm": 0.2770667418801685,
      "learning_rate": 4.130344065370031e-05,
      "loss": 0.4786,
      "num_tokens": 730064545.0,
      "step": 955
    },
    {
      "epoch": 1.30968622100955,
      "grad_norm": 0.26149171089033296,
      "learning_rate": 4.120300441994172e-05,
      "loss": 0.462,
      "num_tokens": 733841294.0,
      "step": 960
    },
    {
      "epoch": 1.3165075034106413,
      "grad_norm": 0.31050190782640036,
      "learning_rate": 4.110213226898695e-05,
      "loss": 0.4732,
      "num_tokens": 737760285.0,
      "step": 965
    },
    {
      "epoch": 1.3233287858117326,
      "grad_norm": 0.27836619089478065,
      "learning_rate": 4.100082740976036e-05,
      "loss": 0.4727,
      "num_tokens": 741596226.0,
      "step": 970
    },
    {
      "epoch": 1.330150068212824,
      "grad_norm": 0.2579696461553874,
      "learning_rate": 4.08990930649516e-05,
      "loss": 0.4649,
      "num_tokens": 745437763.0,
      "step": 975
    },
    {
      "epoch": 1.3369713506139154,
      "grad_norm": 0.2740627527065771,
      "learning_rate": 4.079693247091302e-05,
      "loss": 0.4645,
      "num_tokens": 749302808.0,
      "step": 980
    },
    {
      "epoch": 1.3437926330150067,
      "grad_norm": 0.2790716204058442,
      "learning_rate": 4.069434887755667e-05,
      "loss": 0.4689,
      "num_tokens": 753127588.0,
      "step": 985
    },
    {
      "epoch": 1.350613915416098,
      "grad_norm": 0.264519833216715,
      "learning_rate": 4.059134554825104e-05,
      "loss": 0.4686,
      "num_tokens": 756971687.0,
      "step": 990
    },
    {
      "epoch": 1.3574351978171897,
      "grad_norm": 0.28990937969651126,
      "learning_rate": 4.048792575971713e-05,
      "loss": 0.4598,
      "num_tokens": 760697602.0,
      "step": 995
    },
    {
      "epoch": 1.364256480218281,
      "grad_norm": 0.24357301496138234,
      "learning_rate": 4.038409280192427e-05,
      "loss": 0.4658,
      "num_tokens": 764490647.0,
      "step": 1000
    },
    {
      "epoch": 1.3710777626193724,
      "grad_norm": 0.26005750254814686,
      "learning_rate": 4.0279849977985434e-05,
      "loss": 0.4653,
      "num_tokens": 768330897.0,
      "step": 1005
    },
    {
      "epoch": 1.3778990450204638,
      "grad_norm": 0.2593892684840051,
      "learning_rate": 4.01752006040522e-05,
      "loss": 0.4665,
      "num_tokens": 772161013.0,
      "step": 1010
    },
    {
      "epoch": 1.3847203274215554,
      "grad_norm": 0.2554846973907493,
      "learning_rate": 4.007014800920921e-05,
      "loss": 0.4733,
      "num_tokens": 776138853.0,
      "step": 1015
    },
    {
      "epoch": 1.3915416098226467,
      "grad_norm": 0.2841717548251602,
      "learning_rate": 3.9964695535368306e-05,
      "loss": 0.4608,
      "num_tokens": 779984056.0,
      "step": 1020
    },
    {
      "epoch": 1.398362892223738,
      "grad_norm": 0.25032452540370875,
      "learning_rate": 3.985884653716218e-05,
      "loss": 0.4627,
      "num_tokens": 783898134.0,
      "step": 1025
    },
    {
      "epoch": 1.4051841746248295,
      "grad_norm": 0.29802875985167504,
      "learning_rate": 3.9752604381837676e-05,
      "loss": 0.4645,
      "num_tokens": 787897711.0,
      "step": 1030
    },
    {
      "epoch": 1.4120054570259208,
      "grad_norm": 0.2706705220606769,
      "learning_rate": 3.96459724491487e-05,
      "loss": 0.4704,
      "num_tokens": 791719537.0,
      "step": 1035
    },
    {
      "epoch": 1.4188267394270122,
      "grad_norm": 0.2840824587208515,
      "learning_rate": 3.953895413124866e-05,
      "loss": 0.465,
      "num_tokens": 795317290.0,
      "step": 1040
    },
    {
      "epoch": 1.4256480218281036,
      "grad_norm": 0.28504683741140885,
      "learning_rate": 3.9431552832582544e-05,
      "loss": 0.4645,
      "num_tokens": 799156629.0,
      "step": 1045
    },
    {
      "epoch": 1.4324693042291952,
      "grad_norm": 0.23464120441174716,
      "learning_rate": 3.932377196977871e-05,
      "loss": 0.4552,
      "num_tokens": 803069636.0,
      "step": 1050
    },
    {
      "epoch": 1.4392905866302865,
      "grad_norm": 0.27338514780251155,
      "learning_rate": 3.9215614971540064e-05,
      "loss": 0.4655,
      "num_tokens": 807046765.0,
      "step": 1055
    },
    {
      "epoch": 1.446111869031378,
      "grad_norm": 0.26309603739381,
      "learning_rate": 3.9107085278535105e-05,
      "loss": 0.4643,
      "num_tokens": 810878242.0,
      "step": 1060
    },
    {
      "epoch": 1.4529331514324693,
      "grad_norm": 0.28263057841312744,
      "learning_rate": 3.8998186343288403e-05,
      "loss": 0.464,
      "num_tokens": 814684093.0,
      "step": 1065
    },
    {
      "epoch": 1.4597544338335606,
      "grad_norm": 0.27658909194801223,
      "learning_rate": 3.888892163007079e-05,
      "loss": 0.4612,
      "num_tokens": 818487158.0,
      "step": 1070
    },
    {
      "epoch": 1.4665757162346522,
      "grad_norm": 0.31309736931041143,
      "learning_rate": 3.877929461478915e-05,
      "loss": 0.4612,
      "num_tokens": 822359745.0,
      "step": 1075
    },
    {
      "epoch": 1.4733969986357436,
      "grad_norm": 0.27357218874453054,
      "learning_rate": 3.8669308784875855e-05,
      "loss": 0.4593,
      "num_tokens": 826031854.0,
      "step": 1080
    },
    {
      "epoch": 1.480218281036835,
      "grad_norm": 0.2702103817079497,
      "learning_rate": 3.8558967639177795e-05,
      "loss": 0.4652,
      "num_tokens": 829831729.0,
      "step": 1085
    },
    {
      "epoch": 1.4870395634379263,
      "grad_norm": 0.28660991886409803,
      "learning_rate": 3.844827468784513e-05,
      "loss": 0.4635,
      "num_tokens": 833559495.0,
      "step": 1090
    },
    {
      "epoch": 1.4938608458390177,
      "grad_norm": 0.2831089788366953,
      "learning_rate": 3.8337233452219554e-05,
      "loss": 0.4742,
      "num_tokens": 837452717.0,
      "step": 1095
    },
    {
      "epoch": 1.500682128240109,
      "grad_norm": 0.2726925683985379,
      "learning_rate": 3.822584746472232e-05,
      "loss": 0.4685,
      "num_tokens": 841494894.0,
      "step": 1100
    },
    {
      "epoch": 1.5075034106412004,
      "grad_norm": 0.2945202746465597,
      "learning_rate": 3.811412026874187e-05,
      "loss": 0.4588,
      "num_tokens": 845244637.0,
      "step": 1105
    },
    {
      "epoch": 1.514324693042292,
      "grad_norm": 0.2947105252840782,
      "learning_rate": 3.800205541852109e-05,
      "loss": 0.4648,
      "num_tokens": 849040114.0,
      "step": 1110
    },
    {
      "epoch": 1.5211459754433834,
      "grad_norm": 0.26767448377121167,
      "learning_rate": 3.788965647904426e-05,
      "loss": 0.4639,
      "num_tokens": 852855908.0,
      "step": 1115
    },
    {
      "epoch": 1.5279672578444747,
      "grad_norm": 0.2728941868971984,
      "learning_rate": 3.777692702592363e-05,
      "loss": 0.4633,
      "num_tokens": 856606710.0,
      "step": 1120
    },
    {
      "epoch": 1.5347885402455663,
      "grad_norm": 0.3161592951745081,
      "learning_rate": 3.76638706452857e-05,
      "loss": 0.4717,
      "num_tokens": 860530719.0,
      "step": 1125
    },
    {
      "epoch": 1.5416098226466577,
      "grad_norm": 0.30392644830663357,
      "learning_rate": 3.755049093365709e-05,
      "loss": 0.4624,
      "num_tokens": 864403759.0,
      "step": 1130
    },
    {
      "epoch": 1.548431105047749,
      "grad_norm": 0.27628995163838427,
      "learning_rate": 3.74367914978502e-05,
      "loss": 0.4668,
      "num_tokens": 868277615.0,
      "step": 1135
    },
    {
      "epoch": 1.5552523874488404,
      "grad_norm": 0.24659333313063722,
      "learning_rate": 3.73227759548484e-05,
      "loss": 0.4762,
      "num_tokens": 872188105.0,
      "step": 1140
    },
    {
      "epoch": 1.5620736698499318,
      "grad_norm": 0.2541498419510143,
      "learning_rate": 3.7208447931691034e-05,
      "loss": 0.4656,
      "num_tokens": 876046118.0,
      "step": 1145
    },
    {
      "epoch": 1.5688949522510232,
      "grad_norm": 0.2483276804595362,
      "learning_rate": 3.7093811065357934e-05,
      "loss": 0.4646,
      "num_tokens": 879952254.0,
      "step": 1150
    },
    {
      "epoch": 1.5757162346521145,
      "grad_norm": 0.2736551066476727,
      "learning_rate": 3.6978869002653884e-05,
      "loss": 0.4684,
      "num_tokens": 883721474.0,
      "step": 1155
    },
    {
      "epoch": 1.5825375170532059,
      "grad_norm": 0.2606556505747233,
      "learning_rate": 3.6863625400092407e-05,
      "loss": 0.4849,
      "num_tokens": 887512011.0,
      "step": 1160
    },
    {
      "epoch": 1.5893587994542973,
      "grad_norm": 0.26523699927966526,
      "learning_rate": 3.674808392377964e-05,
      "loss": 0.456,
      "num_tokens": 891332828.0,
      "step": 1165
    },
    {
      "epoch": 1.5961800818553888,
      "grad_norm": 0.2413639875443443,
      "learning_rate": 3.663224824929758e-05,
      "loss": 0.4602,
      "num_tokens": 895212128.0,
      "step": 1170
    },
    {
      "epoch": 1.6030013642564802,
      "grad_norm": 0.22966581710695846,
      "learning_rate": 3.6516122061587184e-05,
      "loss": 0.461,
      "num_tokens": 899092598.0,
      "step": 1175
    },
    {
      "epoch": 1.6098226466575716,
      "grad_norm": 0.23035517437667388,
      "learning_rate": 3.639970905483119e-05,
      "loss": 0.4652,
      "num_tokens": 902821411.0,
      "step": 1180
    },
    {
      "epoch": 1.6166439290586632,
      "grad_norm": 0.25105961460570336,
      "learning_rate": 3.628301293233653e-05,
      "loss": 0.4631,
      "num_tokens": 906650186.0,
      "step": 1185
    },
    {
      "epoch": 1.6234652114597545,
      "grad_norm": 0.2671204882477136,
      "learning_rate": 3.6166037406416586e-05,
      "loss": 0.4667,
      "num_tokens": 910561700.0,
      "step": 1190
    },
    {
      "epoch": 1.630286493860846,
      "grad_norm": 0.24391887686783595,
      "learning_rate": 3.604878619827306e-05,
      "loss": 0.46,
      "num_tokens": 914471979.0,
      "step": 1195
    },
    {
      "epoch": 1.6371077762619373,
      "grad_norm": 0.24606602365286445,
      "learning_rate": 3.593126303787758e-05,
      "loss": 0.4628,
      "num_tokens": 918407451.0,
      "step": 1200
    },
    {
      "epoch": 1.6439290586630286,
      "grad_norm": 0.24488025231455127,
      "learning_rate": 3.5813471663853086e-05,
      "loss": 0.4527,
      "num_tokens": 922149664.0,
      "step": 1205
    },
    {
      "epoch": 1.65075034106412,
      "grad_norm": 0.29502679494434914,
      "learning_rate": 3.569541582335487e-05,
      "loss": 0.4651,
      "num_tokens": 925906310.0,
      "step": 1210
    },
    {
      "epoch": 1.6575716234652114,
      "grad_norm": 0.29080410988093525,
      "learning_rate": 3.557709927195137e-05,
      "loss": 0.4579,
      "num_tokens": 929886517.0,
      "step": 1215
    },
    {
      "epoch": 1.6643929058663027,
      "grad_norm": 0.27995909033633837,
      "learning_rate": 3.545852577350472e-05,
      "loss": 0.4504,
      "num_tokens": 933572845.0,
      "step": 1220
    },
    {
      "epoch": 1.6712141882673943,
      "grad_norm": 0.24168573385650868,
      "learning_rate": 3.5339699100051e-05,
      "loss": 0.4545,
      "num_tokens": 937133457.0,
      "step": 1225
    },
    {
      "epoch": 1.6780354706684857,
      "grad_norm": 0.25669243958426285,
      "learning_rate": 3.522062303168021e-05,
      "loss": 0.4575,
      "num_tokens": 941038842.0,
      "step": 1230
    },
    {
      "epoch": 1.684856753069577,
      "grad_norm": 0.2992047414518027,
      "learning_rate": 3.510130135641608e-05,
      "loss": 0.4656,
      "num_tokens": 944824346.0,
      "step": 1235
    },
    {
      "epoch": 1.6916780354706686,
      "grad_norm": 0.25421993454486885,
      "learning_rate": 3.498173787009555e-05,
      "loss": 0.4554,
      "num_tokens": 948787821.0,
      "step": 1240
    },
    {
      "epoch": 1.69849931787176,
      "grad_norm": 0.28639702078451146,
      "learning_rate": 3.4861936376247994e-05,
      "loss": 0.4624,
      "num_tokens": 952546196.0,
      "step": 1245
    },
    {
      "epoch": 1.7053206002728514,
      "grad_norm": 0.2858597251246765,
      "learning_rate": 3.474190068597419e-05,
      "loss": 0.4622,
      "num_tokens": 956413312.0,
      "step": 1250
    },
    {
      "epoch": 1.7121418826739427,
      "grad_norm": 0.25432985447412165,
      "learning_rate": 3.4621634617825195e-05,
      "loss": 0.4563,
      "num_tokens": 960097484.0,
      "step": 1255
    },
    {
      "epoch": 1.718963165075034,
      "grad_norm": 0.24911852049104072,
      "learning_rate": 3.450114199768076e-05,
      "loss": 0.4529,
      "num_tokens": 963903477.0,
      "step": 1260
    },
    {
      "epoch": 1.7257844474761255,
      "grad_norm": 0.2604051041082986,
      "learning_rate": 3.4380426658627644e-05,
      "loss": 0.4583,
      "num_tokens": 967716958.0,
      "step": 1265
    },
    {
      "epoch": 1.7326057298772168,
      "grad_norm": 0.25401591712991045,
      "learning_rate": 3.425949244083775e-05,
      "loss": 0.4423,
      "num_tokens": 971436672.0,
      "step": 1270
    },
    {
      "epoch": 1.7394270122783082,
      "grad_norm": 0.2400128552050296,
      "learning_rate": 3.413834319144587e-05,
      "loss": 0.4689,
      "num_tokens": 975282872.0,
      "step": 1275
    },
    {
      "epoch": 1.7462482946793996,
      "grad_norm": 0.24326085857094784,
      "learning_rate": 3.401698276442732e-05,
      "loss": 0.4616,
      "num_tokens": 979113101.0,
      "step": 1280
    },
    {
      "epoch": 1.7530695770804912,
      "grad_norm": 0.23363258646620005,
      "learning_rate": 3.389541502047541e-05,
      "loss": 0.4563,
      "num_tokens": 983117582.0,
      "step": 1285
    },
    {
      "epoch": 1.7598908594815825,
      "grad_norm": 0.25465374186874845,
      "learning_rate": 3.377364382687852e-05,
      "loss": 0.4673,
      "num_tokens": 986965249.0,
      "step": 1290
    },
    {
      "epoch": 1.766712141882674,
      "grad_norm": 0.24340135543067581,
      "learning_rate": 3.365167305739717e-05,
      "loss": 0.4603,
      "num_tokens": 990643231.0,
      "step": 1295
    },
    {
      "epoch": 1.7735334242837655,
      "grad_norm": 0.26565317786588566,
      "learning_rate": 3.3529506592140724e-05,
      "loss": 0.4518,
      "num_tokens": 994547720.0,
      "step": 1300
    },
    {
      "epoch": 1.7803547066848568,
      "grad_norm": 0.2430267355793424,
      "learning_rate": 3.3407148317443986e-05,
      "loss": 0.4542,
      "num_tokens": 998141405.0,
      "step": 1305
    },
    {
      "epoch": 1.7871759890859482,
      "grad_norm": 0.2500900864567087,
      "learning_rate": 3.328460212574356e-05,
      "loss": 0.4517,
      "num_tokens": 1001828735.0,
      "step": 1310
    },
    {
      "epoch": 1.7939972714870396,
      "grad_norm": 0.23416160552607826,
      "learning_rate": 3.3161871915454045e-05,
      "loss": 0.4649,
      "num_tokens": 1005562335.0,
      "step": 1315
    },
    {
      "epoch": 1.800818553888131,
      "grad_norm": 0.26708075086752775,
      "learning_rate": 3.303896159084397e-05,
      "loss": 0.4578,
      "num_tokens": 1009141104.0,
      "step": 1320
    },
    {
      "epoch": 1.8076398362892223,
      "grad_norm": 0.26198497586871977,
      "learning_rate": 3.291587506191166e-05,
      "loss": 0.4632,
      "num_tokens": 1012723565.0,
      "step": 1325
    },
    {
      "epoch": 1.8144611186903137,
      "grad_norm": 0.24433603104500154,
      "learning_rate": 3.2792616244260774e-05,
      "loss": 0.4545,
      "num_tokens": 1016477651.0,
      "step": 1330
    },
    {
      "epoch": 1.821282401091405,
      "grad_norm": 0.22663713003772984,
      "learning_rate": 3.266918905897583e-05,
      "loss": 0.4569,
      "num_tokens": 1020301965.0,
      "step": 1335
    },
    {
      "epoch": 1.8281036834924966,
      "grad_norm": 0.21349710127379007,
      "learning_rate": 3.254559743249741e-05,
      "loss": 0.4475,
      "num_tokens": 1024157287.0,
      "step": 1340
    },
    {
      "epoch": 1.834924965893588,
      "grad_norm": 0.2300837335568015,
      "learning_rate": 3.2421845296497234e-05,
      "loss": 0.4666,
      "num_tokens": 1027955809.0,
      "step": 1345
    },
    {
      "epoch": 1.8417462482946794,
      "grad_norm": 0.23573100830820137,
      "learning_rate": 3.229793658775316e-05,
      "loss": 0.4583,
      "num_tokens": 1031820433.0,
      "step": 1350
    },
    {
      "epoch": 1.848567530695771,
      "grad_norm": 0.2595808132513396,
      "learning_rate": 3.217387524802387e-05,
      "loss": 0.4596,
      "num_tokens": 1035628174.0,
      "step": 1355
    },
    {
      "epoch": 1.8553888130968623,
      "grad_norm": 0.25623846534093714,
      "learning_rate": 3.204966522392355e-05,
      "loss": 0.4684,
      "num_tokens": 1039538259.0,
      "step": 1360
    },
    {
      "epoch": 1.8622100954979537,
      "grad_norm": 0.2508726901197259,
      "learning_rate": 3.1925310466796284e-05,
      "loss": 0.457,
      "num_tokens": 1043323479.0,
      "step": 1365
    },
    {
      "epoch": 1.869031377899045,
      "grad_norm": 0.2421604869066339,
      "learning_rate": 3.180081493259036e-05,
      "loss": 0.4627,
      "num_tokens": 1047143725.0,
      "step": 1370
    },
    {
      "epoch": 1.8758526603001364,
      "grad_norm": 0.23737080041525266,
      "learning_rate": 3.1676182581732454e-05,
      "loss": 0.4578,
      "num_tokens": 1051058729.0,
      "step": 1375
    },
    {
      "epoch": 1.8826739427012278,
      "grad_norm": 0.27755810889548954,
      "learning_rate": 3.155141737900162e-05,
      "loss": 0.4529,
      "num_tokens": 1054707187.0,
      "step": 1380
    },
    {
      "epoch": 1.8894952251023192,
      "grad_norm": 0.24769759366411626,
      "learning_rate": 3.142652329340319e-05,
      "loss": 0.4525,
      "num_tokens": 1058505830.0,
      "step": 1385
    },
    {
      "epoch": 1.8963165075034105,
      "grad_norm": 0.24993371360135405,
      "learning_rate": 3.1301504298042464e-05,
      "loss": 0.4463,
      "num_tokens": 1062382498.0,
      "step": 1390
    },
    {
      "epoch": 1.9031377899045019,
      "grad_norm": 0.26275979680991063,
      "learning_rate": 3.117636436999835e-05,
      "loss": 0.4641,
      "num_tokens": 1066230098.0,
      "step": 1395
    },
    {
      "epoch": 1.9099590723055935,
      "grad_norm": 0.2432200698117419,
      "learning_rate": 3.105110749019684e-05,
      "loss": 0.4549,
      "num_tokens": 1070100345.0,
      "step": 1400
    },
    {
      "epoch": 1.9167803547066848,
      "grad_norm": 0.26061462825889503,
      "learning_rate": 3.0925737643284405e-05,
      "loss": 0.4542,
      "num_tokens": 1073919676.0,
      "step": 1405
    },
    {
      "epoch": 1.9236016371077762,
      "grad_norm": 0.23086737216549277,
      "learning_rate": 3.080025881750116e-05,
      "loss": 0.4606,
      "num_tokens": 1077765957.0,
      "step": 1410
    },
    {
      "epoch": 1.9304229195088678,
      "grad_norm": 0.24203650893105913,
      "learning_rate": 3.067467500455404e-05,
      "loss": 0.4505,
      "num_tokens": 1081540834.0,
      "step": 1415
    },
    {
      "epoch": 1.9372442019099592,
      "grad_norm": 0.23619147847327127,
      "learning_rate": 3.054899019948984e-05,
      "loss": 0.4654,
      "num_tokens": 1085254168.0,
      "step": 1420
    },
    {
      "epoch": 1.9440654843110505,
      "grad_norm": 0.22649221738372532,
      "learning_rate": 3.042320840056807e-05,
      "loss": 0.4583,
      "num_tokens": 1089204273.0,
      "step": 1425
    },
    {
      "epoch": 1.950886766712142,
      "grad_norm": 0.23354218616389152,
      "learning_rate": 3.0297333609133806e-05,
      "loss": 0.4571,
      "num_tokens": 1093059842.0,
      "step": 1430
    },
    {
      "epoch": 1.9577080491132333,
      "grad_norm": 0.25797343469814343,
      "learning_rate": 3.017136982949035e-05,
      "loss": 0.4627,
      "num_tokens": 1096876504.0,
      "step": 1435
    },
    {
      "epoch": 1.9645293315143246,
      "grad_norm": 0.26072636154582024,
      "learning_rate": 3.004532106877191e-05,
      "loss": 0.4508,
      "num_tokens": 1100833806.0,
      "step": 1440
    },
    {
      "epoch": 1.971350613915416,
      "grad_norm": 0.24072481782703328,
      "learning_rate": 2.9919191336816094e-05,
      "loss": 0.4516,
      "num_tokens": 1104630572.0,
      "step": 1445
    },
    {
      "epoch": 1.9781718963165074,
      "grad_norm": 0.2571530398132939,
      "learning_rate": 2.9792984646036336e-05,
      "loss": 0.4466,
      "num_tokens": 1108388900.0,
      "step": 1450
    },
    {
      "epoch": 1.984993178717599,
      "grad_norm": 0.2526050017737328,
      "learning_rate": 2.966670501129427e-05,
      "loss": 0.4617,
      "num_tokens": 1112326009.0,
      "step": 1455
    },
    {
      "epoch": 1.9918144611186903,
      "grad_norm": 0.25326046223431775,
      "learning_rate": 2.9540356449772034e-05,
      "loss": 0.4584,
      "num_tokens": 1116108995.0,
      "step": 1460
    },
    {
      "epoch": 1.9986357435197817,
      "grad_norm": 0.2564772607600422,
      "learning_rate": 2.941394298084441e-05,
      "loss": 0.4484,
      "num_tokens": 1119855365.0,
      "step": 1465
    },
    {
      "epoch": 2.0054570259208733,
      "grad_norm": 0.3147035418636063,
      "learning_rate": 2.9287468625951025e-05,
      "loss": 0.4223,
      "num_tokens": 1123684625.0,
      "step": 1470
    },
    {
      "epoch": 2.0122783083219646,
      "grad_norm": 0.2565619666869903,
      "learning_rate": 2.9160937408468396e-05,
      "loss": 0.416,
      "num_tokens": 1127572305.0,
      "step": 1475
    },
    {
      "epoch": 2.019099590723056,
      "grad_norm": 0.32184315313361184,
      "learning_rate": 2.9034353353581956e-05,
      "loss": 0.4247,
      "num_tokens": 1131358945.0,
      "step": 1480
    },
    {
      "epoch": 2.0259208731241474,
      "grad_norm": 0.27261892484724387,
      "learning_rate": 2.8907720488157948e-05,
      "loss": 0.4194,
      "num_tokens": 1135163182.0,
      "step": 1485
    },
    {
      "epoch": 2.0327421555252387,
      "grad_norm": 0.23968605532571555,
      "learning_rate": 2.87810428406154e-05,
      "loss": 0.4321,
      "num_tokens": 1139156691.0,
      "step": 1490
    },
    {
      "epoch": 2.03956343792633,
      "grad_norm": 0.23673185470909455,
      "learning_rate": 2.8654324440797948e-05,
      "loss": 0.4215,
      "num_tokens": 1142963803.0,
      "step": 1495
    },
    {
      "epoch": 2.0463847203274215,
      "grad_norm": 0.2491861063584158,
      "learning_rate": 2.8527569319845597e-05,
      "loss": 0.4089,
      "num_tokens": 1146661529.0,
      "step": 1500
    },
    {
      "epoch": 2.053206002728513,
      "grad_norm": 0.23530867040961626,
      "learning_rate": 2.8400781510066536e-05,
      "loss": 0.4281,
      "num_tokens": 1150631571.0,
      "step": 1505
    },
    {
      "epoch": 2.060027285129604,
      "grad_norm": 0.2631434011973304,
      "learning_rate": 2.8273965044808864e-05,
      "loss": 0.4176,
      "num_tokens": 1154372520.0,
      "step": 1510
    },
    {
      "epoch": 2.0668485675306956,
      "grad_norm": 0.2284833410164006,
      "learning_rate": 2.8147123958332216e-05,
      "loss": 0.4229,
      "num_tokens": 1158377799.0,
      "step": 1515
    },
    {
      "epoch": 2.0736698499317874,
      "grad_norm": 0.2678035655026217,
      "learning_rate": 2.8020262285679523e-05,
      "loss": 0.4258,
      "num_tokens": 1162137345.0,
      "step": 1520
    },
    {
      "epoch": 2.0804911323328787,
      "grad_norm": 0.2712552795411336,
      "learning_rate": 2.7893384062548554e-05,
      "loss": 0.4254,
      "num_tokens": 1165931384.0,
      "step": 1525
    },
    {
      "epoch": 2.08731241473397,
      "grad_norm": 0.22877487888785064,
      "learning_rate": 2.7766493325163606e-05,
      "loss": 0.418,
      "num_tokens": 1169758424.0,
      "step": 1530
    },
    {
      "epoch": 2.0941336971350615,
      "grad_norm": 0.23596015312602944,
      "learning_rate": 2.7639594110147073e-05,
      "loss": 0.4233,
      "num_tokens": 1173501069.0,
      "step": 1535
    },
    {
      "epoch": 2.100954979536153,
      "grad_norm": 0.24515986360845682,
      "learning_rate": 2.7512690454391032e-05,
      "loss": 0.4303,
      "num_tokens": 1177335233.0,
      "step": 1540
    },
    {
      "epoch": 2.107776261937244,
      "grad_norm": 0.22909429894313327,
      "learning_rate": 2.7385786394928827e-05,
|
"loss": 0.4194, |
|
"num_tokens": 1181125439.0, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 2.1145975443383356, |
|
"grad_norm": 0.2336846178361885, |
|
"learning_rate": 2.725888596880666e-05, |
|
"loss": 0.4286, |
|
"num_tokens": 1185004310.0, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.121418826739427, |
|
"grad_norm": 0.24571070856958216, |
|
"learning_rate": 2.7131993212955126e-05, |
|
"loss": 0.4342, |
|
"num_tokens": 1188730220.0, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 2.1282401091405183, |
|
"grad_norm": 0.22792129868881894, |
|
"learning_rate": 2.7005112164060832e-05, |
|
"loss": 0.4132, |
|
"num_tokens": 1192452885.0, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.1350613915416097, |
|
"grad_norm": 0.2410539139234191, |
|
"learning_rate": 2.6878246858437957e-05, |
|
"loss": 0.42, |
|
"num_tokens": 1196235271.0, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 2.141882673942701, |
|
"grad_norm": 0.25078836647709346, |
|
"learning_rate": 2.675140133189986e-05, |
|
"loss": 0.4163, |
|
"num_tokens": 1200064083.0, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.148703956343793, |
|
"grad_norm": 0.22906696585845734, |
|
"learning_rate": 2.66245796196307e-05, |
|
"loss": 0.4242, |
|
"num_tokens": 1203975750.0, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 2.155525238744884, |
|
"grad_norm": 0.24781657545420474, |
|
"learning_rate": 2.649778575605706e-05, |
|
"loss": 0.4281, |
|
"num_tokens": 1207941601.0, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.1623465211459756, |
|
"grad_norm": 0.24090982192799237, |
|
"learning_rate": 2.6371023774719595e-05, |
|
"loss": 0.4182, |
|
"num_tokens": 1211722253.0, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 2.169167803547067, |
|
"grad_norm": 0.25749659183735263, |
|
"learning_rate": 2.624429770814473e-05, |
|
"loss": 0.425, |
|
"num_tokens": 1215472991.0, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.1759890859481583, |
|
"grad_norm": 0.2472677398883936, |
|
"learning_rate": 2.6117611587716384e-05, |
|
"loss": 0.4288, |
|
"num_tokens": 1219314367.0, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 2.1828103683492497, |
|
"grad_norm": 0.25092458443075377, |
|
"learning_rate": 2.599096944354772e-05, |
|
"loss": 0.4301, |
|
"num_tokens": 1223192538.0, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.189631650750341, |
|
"grad_norm": 0.247907240580044, |
|
"learning_rate": 2.5864375304352918e-05, |
|
"loss": 0.4074, |
|
"num_tokens": 1226911643.0, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 2.1964529331514324, |
|
"grad_norm": 0.23295476928581843, |
|
"learning_rate": 2.5737833197319062e-05, |
|
"loss": 0.4225, |
|
"num_tokens": 1230700907.0, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.203274215552524, |
|
"grad_norm": 0.24438835145296037, |
|
"learning_rate": 2.5611347147977982e-05, |
|
"loss": 0.4263, |
|
"num_tokens": 1234481818.0, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 2.210095497953615, |
|
"grad_norm": 0.24089894593772532, |
|
"learning_rate": 2.5484921180078213e-05, |
|
"loss": 0.4254, |
|
"num_tokens": 1238353778.0, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.2169167803547065, |
|
"grad_norm": 0.21464574119985996, |
|
"learning_rate": 2.5358559315456993e-05, |
|
"loss": 0.4227, |
|
"num_tokens": 1242159504.0, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 2.223738062755798, |
|
"grad_norm": 0.22839035226091423, |
|
"learning_rate": 2.5232265573912327e-05, |
|
"loss": 0.4244, |
|
"num_tokens": 1246049292.0, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.2305593451568897, |
|
"grad_norm": 0.22402310267370926, |
|
"learning_rate": 2.5106043973075076e-05, |
|
"loss": 0.4328, |
|
"num_tokens": 1249978353.0, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 2.237380627557981, |
|
"grad_norm": 0.23206367405395442, |
|
"learning_rate": 2.4979898528281214e-05, |
|
"loss": 0.4211, |
|
"num_tokens": 1253553694.0, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.2442019099590724, |
|
"grad_norm": 0.2333554639072696, |
|
"learning_rate": 2.485383325244403e-05, |
|
"loss": 0.4246, |
|
"num_tokens": 1257413581.0, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 2.251023192360164, |
|
"grad_norm": 0.23454156642634885, |
|
"learning_rate": 2.4727852155926497e-05, |
|
"loss": 0.4279, |
|
"num_tokens": 1261372211.0, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.257844474761255, |
|
"grad_norm": 0.21421952276925463, |
|
"learning_rate": 2.4601959246413696e-05, |
|
"loss": 0.4232, |
|
"num_tokens": 1265194682.0, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 2.2646657571623465, |
|
"grad_norm": 0.22105746329030013, |
|
"learning_rate": 2.447615852878533e-05, |
|
"loss": 0.421, |
|
"num_tokens": 1268958243.0, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.271487039563438, |
|
"grad_norm": 0.23675555949897692, |
|
"learning_rate": 2.4350454004988283e-05, |
|
"loss": 0.4132, |
|
"num_tokens": 1272721552.0, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 2.2783083219645293, |
|
"grad_norm": 0.23313280614805948, |
|
"learning_rate": 2.4224849673909374e-05, |
|
"loss": 0.4201, |
|
"num_tokens": 1276637460.0, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.2851296043656206, |
|
"grad_norm": 0.24879594808091673, |
|
"learning_rate": 2.409934953124809e-05, |
|
"loss": 0.4277, |
|
"num_tokens": 1280437716.0, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 2.291950886766712, |
|
"grad_norm": 0.23629762626791306, |
|
"learning_rate": 2.3973957569389503e-05, |
|
"loss": 0.4225, |
|
"num_tokens": 1284303404.0, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.2987721691678034, |
|
"grad_norm": 0.23426015282967028, |
|
"learning_rate": 2.3848677777277278e-05, |
|
"loss": 0.4207, |
|
"num_tokens": 1288266060.0, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 2.305593451568895, |
|
"grad_norm": 0.2474806470725563, |
|
"learning_rate": 2.3723514140286734e-05, |
|
"loss": 0.4263, |
|
"num_tokens": 1292088844.0, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.3124147339699865, |
|
"grad_norm": 0.26321888944320493, |
|
"learning_rate": 2.359847064009808e-05, |
|
"loss": 0.419, |
|
"num_tokens": 1295907279.0, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 2.319236016371078, |
|
"grad_norm": 0.2330696578889981, |
|
"learning_rate": 2.3473551254569794e-05, |
|
"loss": 0.4132, |
|
"num_tokens": 1299636811.0, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.3260572987721693, |
|
"grad_norm": 0.23243524196557758, |
|
"learning_rate": 2.3348759957611998e-05, |
|
"loss": 0.4282, |
|
"num_tokens": 1303503660.0, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 2.3328785811732606, |
|
"grad_norm": 0.2353602509324825, |
|
"learning_rate": 2.3224100719060127e-05, |
|
"loss": 0.4286, |
|
"num_tokens": 1307218513.0, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.339699863574352, |
|
"grad_norm": 0.2233695529503808, |
|
"learning_rate": 2.309957750454858e-05, |
|
"loss": 0.4157, |
|
"num_tokens": 1311018055.0, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 2.3465211459754434, |
|
"grad_norm": 0.22408309313922292, |
|
"learning_rate": 2.2975194275384594e-05, |
|
"loss": 0.4175, |
|
"num_tokens": 1314633285.0, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.3533424283765347, |
|
"grad_norm": 0.22428384354773337, |
|
"learning_rate": 2.2850954988422207e-05, |
|
"loss": 0.4171, |
|
"num_tokens": 1318426856.0, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 2.360163710777626, |
|
"grad_norm": 0.22843262783520524, |
|
"learning_rate": 2.272686359593642e-05, |
|
"loss": 0.417, |
|
"num_tokens": 1322084417.0, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.3669849931787175, |
|
"grad_norm": 0.21581659433548242, |
|
"learning_rate": 2.2602924045497425e-05, |
|
"loss": 0.4214, |
|
"num_tokens": 1325960721.0, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 2.373806275579809, |
|
"grad_norm": 0.22722846051005088, |
|
"learning_rate": 2.247914027984505e-05, |
|
"loss": 0.41, |
|
"num_tokens": 1329717684.0, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.3806275579809, |
|
"grad_norm": 0.22477800449961008, |
|
"learning_rate": 2.2355516236763324e-05, |
|
"loss": 0.4138, |
|
"num_tokens": 1333545659.0, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 2.3874488403819916, |
|
"grad_norm": 0.22050953995659528, |
|
"learning_rate": 2.2232055848955248e-05, |
|
"loss": 0.4198, |
|
"num_tokens": 1337260747.0, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.3942701227830834, |
|
"grad_norm": 0.22917354851814678, |
|
"learning_rate": 2.2108763043917608e-05, |
|
"loss": 0.4211, |
|
"num_tokens": 1341043858.0, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 2.4010914051841747, |
|
"grad_norm": 0.2284572798651079, |
|
"learning_rate": 2.1985641743816105e-05, |
|
"loss": 0.4319, |
|
"num_tokens": 1344938218.0, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.407912687585266, |
|
"grad_norm": 0.21929109499301586, |
|
"learning_rate": 2.1862695865360554e-05, |
|
"loss": 0.4303, |
|
"num_tokens": 1348788919.0, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 2.4147339699863575, |
|
"grad_norm": 0.2256346107569212, |
|
"learning_rate": 2.17399293196803e-05, |
|
"loss": 0.4214, |
|
"num_tokens": 1352691877.0, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.421555252387449, |
|
"grad_norm": 0.23019155066996747, |
|
"learning_rate": 2.1617346012199778e-05, |
|
"loss": 0.4235, |
|
"num_tokens": 1356448019.0, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 2.42837653478854, |
|
"grad_norm": 0.24309913308062492, |
|
"learning_rate": 2.1494949842514288e-05, |
|
"loss": 0.4272, |
|
"num_tokens": 1360291674.0, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.4351978171896316, |
|
"grad_norm": 0.2173000207873259, |
|
"learning_rate": 2.137274470426596e-05, |
|
"loss": 0.4182, |
|
"num_tokens": 1364175739.0, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 2.442019099590723, |
|
"grad_norm": 0.21860172592053967, |
|
"learning_rate": 2.125073448501985e-05, |
|
"loss": 0.4264, |
|
"num_tokens": 1368149466.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.4488403819918143, |
|
"grad_norm": 0.2176204658866239, |
|
"learning_rate": 2.11289230661403e-05, |
|
"loss": 0.4198, |
|
"num_tokens": 1371952154.0, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 2.4556616643929057, |
|
"grad_norm": 0.2354590603024919, |
|
"learning_rate": 2.1007314322667436e-05, |
|
"loss": 0.4232, |
|
"num_tokens": 1375728846.0, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.4624829467939975, |
|
"grad_norm": 0.21712276199851965, |
|
"learning_rate": 2.0885912123193945e-05, |
|
"loss": 0.4157, |
|
"num_tokens": 1379383893.0, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 2.469304229195089, |
|
"grad_norm": 0.22832909360309003, |
|
"learning_rate": 2.0764720329741953e-05, |
|
"loss": 0.4229, |
|
"num_tokens": 1383235087.0, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.47612551159618, |
|
"grad_norm": 0.22951251208399123, |
|
"learning_rate": 2.064374279764022e-05, |
|
"loss": 0.4132, |
|
"num_tokens": 1387022452.0, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 2.4829467939972716, |
|
"grad_norm": 0.22838348706069087, |
|
"learning_rate": 2.052298337540142e-05, |
|
"loss": 0.4199, |
|
"num_tokens": 1390822247.0, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.489768076398363, |
|
"grad_norm": 0.21887085907417755, |
|
"learning_rate": 2.0402445904599827e-05, |
|
"loss": 0.4191, |
|
"num_tokens": 1394564010.0, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 2.4965893587994543, |
|
"grad_norm": 0.23482857590479367, |
|
"learning_rate": 2.0282134219748983e-05, |
|
"loss": 0.4149, |
|
"num_tokens": 1398382007.0, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.5034106412005457, |
|
"grad_norm": 0.24606812348910376, |
|
"learning_rate": 2.0162052148179798e-05, |
|
"loss": 0.4205, |
|
"num_tokens": 1402127581.0, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 2.510231923601637, |
|
"grad_norm": 0.21575272826753958, |
|
"learning_rate": 2.0042203509918768e-05, |
|
"loss": 0.4267, |
|
"num_tokens": 1406014675.0, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.5170532060027284, |
|
"grad_norm": 0.21192778882004046, |
|
"learning_rate": 1.992259211756645e-05, |
|
"loss": 0.4175, |
|
"num_tokens": 1409722672.0, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 2.52387448840382, |
|
"grad_norm": 0.22811496251814561, |
|
"learning_rate": 1.98032217761762e-05, |
|
"loss": 0.4261, |
|
"num_tokens": 1413666158.0, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.530695770804911, |
|
"grad_norm": 0.22908632960325515, |
|
"learning_rate": 1.9684096283133084e-05, |
|
"loss": 0.4269, |
|
"num_tokens": 1417402571.0, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 2.5375170532060025, |
|
"grad_norm": 0.22178872945976558, |
|
"learning_rate": 1.9565219428033127e-05, |
|
"loss": 0.4163, |
|
"num_tokens": 1421340412.0, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.544338335607094, |
|
"grad_norm": 0.22694151179861227, |
|
"learning_rate": 1.9446594992562716e-05, |
|
"loss": 0.4249, |
|
"num_tokens": 1425199953.0, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 2.5511596180081857, |
|
"grad_norm": 0.236572889052039, |
|
"learning_rate": 1.932822675037833e-05, |
|
"loss": 0.4181, |
|
"num_tokens": 1429272848.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.557980900409277, |
|
"grad_norm": 0.23388899031526594, |
|
"learning_rate": 1.921011846698646e-05, |
|
"loss": 0.4147, |
|
"num_tokens": 1433091944.0, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.5648021828103684, |
|
"grad_norm": 0.23974990168673324, |
|
"learning_rate": 1.9092273899623864e-05, |
|
"loss": 0.42, |
|
"num_tokens": 1436974840.0, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.57162346521146, |
|
"grad_norm": 0.2309497009298231, |
|
"learning_rate": 1.8974696797137996e-05, |
|
"loss": 0.4254, |
|
"num_tokens": 1440774914.0, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 2.578444747612551, |
|
"grad_norm": 0.23368473252531424, |
|
"learning_rate": 1.885739089986779e-05, |
|
"loss": 0.418, |
|
"num_tokens": 1444556279.0, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.5852660300136425, |
|
"grad_norm": 0.21692135999687384, |
|
"learning_rate": 1.8740359939524655e-05, |
|
"loss": 0.4231, |
|
"num_tokens": 1448438583.0, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 2.592087312414734, |
|
"grad_norm": 0.21687707639488846, |
|
"learning_rate": 1.8623607639073743e-05, |
|
"loss": 0.4274, |
|
"num_tokens": 1452265791.0, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.5989085948158253, |
|
"grad_norm": 0.25141521675980416, |
|
"learning_rate": 1.8507137712615553e-05, |
|
"loss": 0.4257, |
|
"num_tokens": 1455960099.0, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 2.6057298772169166, |
|
"grad_norm": 0.2487430740484725, |
|
"learning_rate": 1.8390953865267756e-05, |
|
"loss": 0.4223, |
|
"num_tokens": 1459864902.0, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.6125511596180084, |
|
"grad_norm": 0.2348067182488432, |
|
"learning_rate": 1.8275059793047318e-05, |
|
"loss": 0.4113, |
|
"num_tokens": 1463560724.0, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 2.6193724420191, |
|
"grad_norm": 0.20831466486828573, |
|
"learning_rate": 1.8159459182752958e-05, |
|
"loss": 0.4153, |
|
"num_tokens": 1467318533.0, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.626193724420191, |
|
"grad_norm": 0.221088737686775, |
|
"learning_rate": 1.8044155711847833e-05, |
|
"loss": 0.4189, |
|
"num_tokens": 1471225951.0, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.6330150068212825, |
|
"grad_norm": 0.23573336270806225, |
|
"learning_rate": 1.792915304834256e-05, |
|
"loss": 0.414, |
|
"num_tokens": 1474976948.0, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.639836289222374, |
|
"grad_norm": 0.23412991747166595, |
|
"learning_rate": 1.781445485067854e-05, |
|
"loss": 0.4181, |
|
"num_tokens": 1478830865.0, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 2.6466575716234653, |
|
"grad_norm": 0.22999200373661702, |
|
"learning_rate": 1.770006476761157e-05, |
|
"loss": 0.4309, |
|
"num_tokens": 1482679267.0, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.6534788540245566, |
|
"grad_norm": 0.2311723989723882, |
|
"learning_rate": 1.7585986438095763e-05, |
|
"loss": 0.4237, |
|
"num_tokens": 1486426058.0, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 2.660300136425648, |
|
"grad_norm": 0.22020293262737, |
|
"learning_rate": 1.7472223491167767e-05, |
|
"loss": 0.4101, |
|
"num_tokens": 1490166733.0, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.6671214188267394, |
|
"grad_norm": 0.21795987471310851, |
|
"learning_rate": 1.735877954583139e-05, |
|
"loss": 0.4245, |
|
"num_tokens": 1493992930.0, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 2.6739427012278307, |
|
"grad_norm": 0.23745451187143324, |
|
"learning_rate": 1.724565821094239e-05, |
|
"loss": 0.4098, |
|
"num_tokens": 1497730531.0, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.680763983628922, |
|
"grad_norm": 0.226178899379112, |
|
"learning_rate": 1.7132863085093728e-05, |
|
"loss": 0.425, |
|
"num_tokens": 1501593876.0, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 2.6875852660300135, |
|
"grad_norm": 0.231976304295542, |
|
"learning_rate": 1.7020397756501062e-05, |
|
"loss": 0.4215, |
|
"num_tokens": 1505411865.0, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.694406548431105, |
|
"grad_norm": 0.2081553021114587, |
|
"learning_rate": 1.6908265802888605e-05, |
|
"loss": 0.4144, |
|
"num_tokens": 1509039077.0, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.701227830832196, |
|
"grad_norm": 0.23286471006088036, |
|
"learning_rate": 1.6796470791375302e-05, |
|
"loss": 0.4158, |
|
"num_tokens": 1512763400.0, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.708049113233288, |
|
"grad_norm": 0.21963420285235397, |
|
"learning_rate": 1.668501627836138e-05, |
|
"loss": 0.4221, |
|
"num_tokens": 1516724445.0, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 2.7148703956343794, |
|
"grad_norm": 0.2102372089422378, |
|
"learning_rate": 1.657390580941521e-05, |
|
"loss": 0.4149, |
|
"num_tokens": 1520614981.0, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.7216916780354707, |
|
"grad_norm": 0.2103099247946458, |
|
"learning_rate": 1.646314291916045e-05, |
|
"loss": 0.4225, |
|
"num_tokens": 1524397796.0, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.728512960436562, |
|
"grad_norm": 0.20651298615731473, |
|
"learning_rate": 1.6352731131163724e-05, |
|
"loss": 0.4176, |
|
"num_tokens": 1528158624.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.7353342428376535, |
|
"grad_norm": 0.22256304912813424, |
|
"learning_rate": 1.624267395782242e-05, |
|
"loss": 0.4413, |
|
"num_tokens": 1532032964.0, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.742155525238745, |
|
"grad_norm": 0.2141320219657911, |
|
"learning_rate": 1.6132974900252988e-05, |
|
"loss": 0.42, |
|
"num_tokens": 1535931108.0, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.748976807639836, |
|
"grad_norm": 0.21399803252659633, |
|
"learning_rate": 1.6023637448179608e-05, |
|
"loss": 0.4079, |
|
"num_tokens": 1539611481.0, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.7557980900409276, |
|
"grad_norm": 0.21363759608230212, |
|
"learning_rate": 1.591466507982312e-05, |
|
"loss": 0.4285, |
|
"num_tokens": 1543769258.0, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.762619372442019, |
|
"grad_norm": 0.2085416000025761, |
|
"learning_rate": 1.580606126179038e-05, |
|
"loss": 0.4202, |
|
"num_tokens": 1547704714.0, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.7694406548431107, |
|
"grad_norm": 0.21003686502862656, |
|
"learning_rate": 1.569782944896402e-05, |
|
"loss": 0.4189, |
|
"num_tokens": 1551607979.0, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.776261937244202, |
|
"grad_norm": 0.22506606570671495, |
|
"learning_rate": 1.5589973084392513e-05, |
|
"loss": 0.4233, |
|
"num_tokens": 1555548130.0, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.7830832196452935, |
|
"grad_norm": 0.24072762404877676, |
|
"learning_rate": 1.5482495599180637e-05, |
|
"loss": 0.4094, |
|
"num_tokens": 1559374280.0, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.789904502046385, |
|
"grad_norm": 0.20730508474415332, |
|
"learning_rate": 1.5375400412380347e-05, |
|
"loss": 0.421, |
|
"num_tokens": 1563231974.0, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.796725784447476, |
|
"grad_norm": 0.23906963103039622, |
|
"learning_rate": 1.5268690930882e-05, |
|
"loss": 0.4261, |
|
"num_tokens": 1567090495.0, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.8035470668485676, |
|
"grad_norm": 0.22062602067558976, |
|
"learning_rate": 1.5162370549305962e-05, |
|
"loss": 0.4308, |
|
"num_tokens": 1570887007.0, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.810368349249659, |
|
"grad_norm": 0.20431590447856307, |
|
"learning_rate": 1.505644264989464e-05, |
|
"loss": 0.4136, |
|
"num_tokens": 1574806787.0, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.8171896316507503, |
|
"grad_norm": 0.24109171460042456, |
|
"learning_rate": 1.4950910602404886e-05, |
|
"loss": 0.4191, |
|
"num_tokens": 1578732597.0, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.8240109140518417, |
|
"grad_norm": 0.21187950729408436, |
|
"learning_rate": 1.4845777764000757e-05, |
|
"loss": 0.423, |
|
"num_tokens": 1582527990.0, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.830832196452933, |
|
"grad_norm": 0.21453562191523357, |
|
"learning_rate": 1.4741047479146803e-05, |
|
"loss": 0.4108, |
|
"num_tokens": 1586163936.0, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.8376534788540244, |
|
"grad_norm": 0.22588422097050712, |
|
"learning_rate": 1.463672307950159e-05, |
|
"loss": 0.4087, |
|
"num_tokens": 1589894282.0, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.844474761255116, |
|
"grad_norm": 0.2261754562696023, |
|
"learning_rate": 1.4532807883811745e-05, |
|
"loss": 0.4241, |
|
"num_tokens": 1593669447.0, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.851296043656207, |
|
"grad_norm": 0.20337839289851203, |
|
"learning_rate": 1.4429305197806386e-05, |
|
"loss": 0.4164, |
|
"num_tokens": 1597406094.0, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.8581173260572985, |
|
"grad_norm": 0.19221798653016375, |
|
"learning_rate": 1.4326218314091971e-05, |
|
"loss": 0.4197, |
|
"num_tokens": 1601353966.0, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.8649386084583903, |
|
"grad_norm": 0.21350869825951038, |
|
"learning_rate": 1.4223550512047517e-05, |
|
"loss": 0.4238, |
|
"num_tokens": 1605354453.0, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.8717598908594817, |
|
"grad_norm": 0.21516182131932057, |
|
"learning_rate": 1.4121305057720305e-05, |
|
"loss": 0.4277, |
|
"num_tokens": 1609346678.0, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.878581173260573, |
|
"grad_norm": 0.20487482373295057, |
|
"learning_rate": 1.4019485203722004e-05, |
|
"loss": 0.4233, |
|
"num_tokens": 1613213477.0, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.8854024556616644, |
|
"grad_norm": 0.19971661497573895, |
|
"learning_rate": 1.391809418912513e-05, |
|
"loss": 0.4134, |
|
"num_tokens": 1616990063.0, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.892223738062756, |
|
"grad_norm": 0.21479965480668722, |
|
"learning_rate": 1.3817135239360079e-05, |
|
"loss": 0.4122, |
|
"num_tokens": 1620738563.0, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.899045020463847, |
|
"grad_norm": 0.21149002401342695, |
|
"learning_rate": 1.371661156611247e-05, |
|
"loss": 0.4254, |
|
"num_tokens": 1624503206.0, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.9058663028649385, |
|
"grad_norm": 0.2064220920477961, |
|
"learning_rate": 1.3616526367220999e-05, |
|
"loss": 0.4218, |
|
"num_tokens": 1628389551.0, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.91268758526603, |
|
"grad_norm": 0.224336611095809, |
|
"learning_rate": 1.3516882826575699e-05, |
|
"loss": 0.4168, |
|
"num_tokens": 1632099673.0, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.9195088676671213, |
|
"grad_norm": 0.20231394272285003, |
|
"learning_rate": 1.3417684114016682e-05, |
|
"loss": 0.4208, |
|
"num_tokens": 1636056127.0, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.926330150068213, |
|
"grad_norm": 0.2342068060643603, |
|
"learning_rate": 1.3318933385233252e-05, |
|
"loss": 0.4134, |
|
"num_tokens": 1639823774.0, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.9331514324693044, |
|
"grad_norm": 0.22071791865366464, |
|
"learning_rate": 1.3220633781663561e-05, |
|
"loss": 0.4205, |
|
"num_tokens": 1643643243.0, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.939972714870396, |
|
"grad_norm": 0.20280836857510123, |
|
"learning_rate": 1.3122788430394659e-05, |
|
"loss": 0.4137, |
|
"num_tokens": 1647326744.0, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.946793997271487, |
|
"grad_norm": 0.21111204058607796, |
|
"learning_rate": 1.3025400444062991e-05, |
|
"loss": 0.4128, |
|
"num_tokens": 1651209275.0, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.9536152796725785, |
|
"grad_norm": 0.22752544012779208, |
|
"learning_rate": 1.2928472920755427e-05, |
|
"loss": 0.4197, |
|
"num_tokens": 1654945680.0, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.96043656207367, |
|
"grad_norm": 0.20457377739676993, |
|
"learning_rate": 1.2832008943910679e-05, |
|
"loss": 0.4126, |
|
"num_tokens": 1658670255.0, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.9672578444747613, |
|
"grad_norm": 0.215676001216208, |
|
"learning_rate": 1.273601158222118e-05, |
|
"loss": 0.421, |
|
"num_tokens": 1662421467.0, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.9740791268758526, |
|
"grad_norm": 0.20052662912803418, |
|
"learning_rate": 1.2640483889535548e-05, |
|
"loss": 0.4155, |
|
"num_tokens": 1666233128.0, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.980900409276944, |
|
"grad_norm": 0.22018742684050363, |
|
"learning_rate": 1.2545428904761358e-05, |
|
"loss": 0.4206, |
|
"num_tokens": 1670143796.0, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.9877216916780354, |
|
"grad_norm": 0.20843817841523296, |
|
"learning_rate": 1.2450849651768482e-05, |
|
"loss": 0.4166, |
|
"num_tokens": 1674122886.0, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.9945429740791267, |
|
"grad_norm": 0.21312098205498078, |
|
"learning_rate": 1.2356749139292936e-05, |
|
"loss": 0.4191, |
|
"num_tokens": 1677987646.0, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 3.001364256480218, |
|
"grad_norm": 0.25313048875354344, |
|
"learning_rate": 1.2263130360841133e-05, |
|
"loss": 0.4077, |
|
"num_tokens": 1681710780.0, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.00818553888131, |
|
"grad_norm": 0.2518733265600338, |
|
"learning_rate": 1.2169996294594647e-05, |
|
"loss": 0.3943, |
|
"num_tokens": 1685570373.0, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 3.0150068212824013, |
|
"grad_norm": 0.21608953499384176, |
|
"learning_rate": 1.2077349903315494e-05, |
|
"loss": 0.3942, |
|
"num_tokens": 1689499335.0, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 3.0218281036834926, |
|
"grad_norm": 0.21094067692735943, |
|
"learning_rate": 1.1985194134251893e-05, |
|
"loss": 0.3885, |
|
"num_tokens": 1693267116.0, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 3.028649386084584, |
|
"grad_norm": 0.22097130594934408, |
|
"learning_rate": 1.1893531919044455e-05, |
|
"loss": 0.392, |
|
"num_tokens": 1696911966.0, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 3.0354706684856754, |
|
"grad_norm": 0.23081007114913413, |
|
"learning_rate": 1.1802366173632978e-05, |
|
"loss": 0.3911, |
|
"num_tokens": 1700717213.0, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 3.0422919508867667, |
|
"grad_norm": 0.22034774494074957, |
|
"learning_rate": 1.1711699798163662e-05, |
|
"loss": 0.3914, |
|
"num_tokens": 1704583362.0, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 3.049113233287858, |
|
"grad_norm": 0.20795664906099545, |
|
"learning_rate": 1.1621535676896832e-05, |
|
"loss": 0.3913, |
|
"num_tokens": 1708328029.0, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 3.0559345156889495, |
|
"grad_norm": 0.20846407848313514, |
|
"learning_rate": 1.153187667811523e-05, |
|
"loss": 0.3981, |
|
"num_tokens": 1712138483.0, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 3.062755798090041, |
|
"grad_norm": 0.20753109208216994, |
|
"learning_rate": 1.1442725654032726e-05, |
|
"loss": 0.3941, |
|
"num_tokens": 1715891615.0, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 3.069577080491132, |
|
"grad_norm": 0.20038731471295618, |
|
"learning_rate": 1.1354085440703613e-05, |
|
"loss": 0.4021, |
|
"num_tokens": 1719889011.0, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.0763983628922236, |
|
"grad_norm": 0.21209283757585357, |
|
"learning_rate": 1.1265958857932374e-05, |
|
"loss": 0.3896, |
|
"num_tokens": 1723608607.0, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 3.083219645293315, |
|
"grad_norm": 0.23047949629485057, |
|
"learning_rate": 1.1178348709183984e-05, |
|
"loss": 0.4023, |
|
"num_tokens": 1727372956.0, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 3.0900409276944067, |
|
"grad_norm": 0.20869366897756297, |
|
"learning_rate": 1.1091257781494702e-05, |
|
"loss": 0.3921, |
|
"num_tokens": 1731315261.0, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 3.096862210095498, |
|
"grad_norm": 0.21058145322360297, |
|
"learning_rate": 1.1004688845383456e-05, |
|
"loss": 0.3963, |
|
"num_tokens": 1735219718.0, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 3.1036834924965895, |
|
"grad_norm": 0.2074326101986663, |
|
"learning_rate": 1.0918644654763688e-05, |
|
"loss": 0.3896, |
|
"num_tokens": 1739047080.0, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 3.110504774897681, |
|
"grad_norm": 0.20481355721124894, |
|
"learning_rate": 1.0833127946855707e-05, |
|
"loss": 0.3826, |
|
"num_tokens": 1742971837.0, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 3.117326057298772, |
|
"grad_norm": 0.19813330790330563, |
|
"learning_rate": 1.0748141442099694e-05, |
|
"loss": 0.3878, |
|
"num_tokens": 1746927074.0, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 3.1241473396998636, |
|
"grad_norm": 0.2116417089035285, |
|
"learning_rate": 1.0663687844069093e-05, |
|
"loss": 0.3878, |
|
"num_tokens": 1750733072.0, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 3.130968622100955, |
|
"grad_norm": 0.22256327065708215, |
|
"learning_rate": 1.0579769839384614e-05, |
|
"loss": 0.3946, |
|
"num_tokens": 1754529527.0, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 3.1377899045020463, |
|
"grad_norm": 0.2041840929582227, |
|
"learning_rate": 1.0496390097628808e-05, |
|
"loss": 0.3935, |
|
"num_tokens": 1758441793.0, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.1446111869031377, |
|
"grad_norm": 0.19564312042697185, |
|
"learning_rate": 1.0413551271261101e-05, |
|
"loss": 0.3836, |
|
"num_tokens": 1762275922.0, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 3.151432469304229, |
|
"grad_norm": 0.2068432147143155, |
|
"learning_rate": 1.0331255995533418e-05, |
|
"loss": 0.3906, |
|
"num_tokens": 1766113412.0, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 3.1582537517053204, |
|
"grad_norm": 0.22243916899267124, |
|
"learning_rate": 1.0249506888406379e-05, |
|
"loss": 0.3948, |
|
"num_tokens": 1769906086.0, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 3.1650750341064118, |
|
"grad_norm": 0.2048461202536205, |
|
"learning_rate": 1.0168306550465994e-05, |
|
"loss": 0.3966, |
|
"num_tokens": 1773669464.0, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 3.1718963165075036, |
|
"grad_norm": 0.21617012487326628, |
|
"learning_rate": 1.0087657564840935e-05, |
|
"loss": 0.3938, |
|
"num_tokens": 1777499688.0, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 3.178717598908595, |
|
"grad_norm": 0.20554623338455671, |
|
"learning_rate": 1.000756249712037e-05, |
|
"loss": 0.399, |
|
"num_tokens": 1781484075.0, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 3.1855388813096863, |
|
"grad_norm": 0.19715452218076857, |
|
"learning_rate": 9.928023895272351e-06, |
|
"loss": 0.3949, |
|
"num_tokens": 1785359652.0, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 3.1923601637107777, |
|
"grad_norm": 0.2213446767833458, |
|
"learning_rate": 9.849044289562725e-06, |
|
"loss": 0.3933, |
|
"num_tokens": 1789122591.0, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 3.199181446111869, |
|
"grad_norm": 0.20417351278770474, |
|
"learning_rate": 9.770626192474689e-06, |
|
"loss": 0.402, |
|
"num_tokens": 1793107659.0, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 3.2060027285129604, |
|
"grad_norm": 0.20314880208976857, |
|
"learning_rate": 9.692772098628843e-06, |
|
"loss": 0.391, |
|
"num_tokens": 1796804821.0, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 3.212824010914052, |
|
"grad_norm": 0.19634303877612064, |
|
"learning_rate": 9.615484484703807e-06, |
|
"loss": 0.3875, |
|
"num_tokens": 1800706093.0, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 3.219645293315143, |
|
"grad_norm": 0.20356531552961224, |
|
"learning_rate": 9.53876580935749e-06, |
|
"loss": 0.3892, |
|
"num_tokens": 1804621910.0, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 3.2264665757162345, |
|
"grad_norm": 0.20498462055547575, |
|
"learning_rate": 9.462618513148825e-06, |
|
"loss": 0.3898, |
|
"num_tokens": 1808407310.0, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 3.233287858117326, |
|
"grad_norm": 0.21014352436728398, |
|
"learning_rate": 9.387045018460136e-06, |
|
"loss": 0.3808, |
|
"num_tokens": 1812224911.0, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 3.2401091405184177, |
|
"grad_norm": 0.21372227802314, |
|
"learning_rate": 9.312047729420112e-06, |
|
"loss": 0.389, |
|
"num_tokens": 1816033009.0, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 3.246930422919509, |
|
"grad_norm": 0.21668432624109216, |
|
"learning_rate": 9.237629031827294e-06, |
|
"loss": 0.3909, |
|
"num_tokens": 1819832724.0, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 3.2537517053206004, |
|
"grad_norm": 0.2119446997252824, |
|
"learning_rate": 9.163791293074183e-06, |
|
"loss": 0.3951, |
|
"num_tokens": 1823734105.0, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 3.260572987721692, |
|
"grad_norm": 0.20740892554253706, |
|
"learning_rate": 9.09053686207194e-06, |
|
"loss": 0.3974, |
|
"num_tokens": 1827553738.0, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 3.267394270122783, |
|
"grad_norm": 0.2059967519948797, |
|
"learning_rate": 9.017868069175678e-06, |
|
"loss": 0.3914, |
|
"num_tokens": 1831290342.0, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 3.2742155525238745, |
|
"grad_norm": 0.21234381046184714, |
|
"learning_rate": 8.945787226110273e-06, |
|
"loss": 0.3965, |
|
"num_tokens": 1835037993.0, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 3.281036834924966, |
|
"grad_norm": 0.20237389246899862, |
|
"learning_rate": 8.874296625896888e-06, |
|
"loss": 0.3861, |
|
"num_tokens": 1838807356.0, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 3.2878581173260573, |
|
"grad_norm": 0.19098693481860202, |
|
"learning_rate": 8.803398542779994e-06, |
|
"loss": 0.4008, |
|
"num_tokens": 1842745596.0, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 3.2946793997271486, |
|
"grad_norm": 0.21338378176043743, |
|
"learning_rate": 8.73309523215502e-06, |
|
"loss": 0.3911, |
|
"num_tokens": 1846517419.0, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 3.30150068212824, |
|
"grad_norm": 0.20688191017456503, |
|
"learning_rate": 8.663388930496616e-06, |
|
"loss": 0.397, |
|
"num_tokens": 1850310616.0, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 3.3083219645293314, |
|
"grad_norm": 0.19608884555809325, |
|
"learning_rate": 8.594281855287512e-06, |
|
"loss": 0.3896, |
|
"num_tokens": 1854227804.0, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 3.3151432469304227, |
|
"grad_norm": 0.20671461791807003, |
|
"learning_rate": 8.525776204947961e-06, |
|
"loss": 0.3844, |
|
"num_tokens": 1858022696.0, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 3.321964529331514, |
|
"grad_norm": 0.20319736048116888, |
|
"learning_rate": 8.45787415876581e-06, |
|
"loss": 0.3831, |
|
"num_tokens": 1861807749.0, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 3.328785811732606, |
|
"grad_norm": 0.2050817882021739, |
|
"learning_rate": 8.390577876827183e-06, |
|
"loss": 0.4052, |
|
"num_tokens": 1865749161.0, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 3.3356070941336973, |
|
"grad_norm": 0.21735850171481977, |
|
"learning_rate": 8.323889499947733e-06, |
|
"loss": 0.3865, |
|
"num_tokens": 1869507562.0, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 3.3424283765347886, |
|
"grad_norm": 0.20478772687116364, |
|
"learning_rate": 8.257811149604578e-06, |
|
"loss": 0.3903, |
|
"num_tokens": 1873354894.0, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 3.34924965893588, |
|
"grad_norm": 0.22023262268774746, |
|
"learning_rate": 8.1923449278688e-06, |
|
"loss": 0.3926, |
|
"num_tokens": 1877168182.0, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 3.3560709413369714, |
|
"grad_norm": 0.2253966050871356, |
|
"learning_rate": 8.127492917338545e-06, |
|
"loss": 0.3969, |
|
"num_tokens": 1881013690.0, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 3.3628922237380627, |
|
"grad_norm": 0.20017364506269658, |
|
"learning_rate": 8.063257181072827e-06, |
|
"loss": 0.3949, |
|
"num_tokens": 1884975840.0, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 3.369713506139154, |
|
"grad_norm": 0.2043509066053362, |
|
"learning_rate": 7.999639762525855e-06, |
|
"loss": 0.3902, |
|
"num_tokens": 1888839607.0, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 3.3765347885402455, |
|
"grad_norm": 0.20210724992772502, |
|
"learning_rate": 7.936642685482029e-06, |
|
"loss": 0.3924, |
|
"num_tokens": 1892784948.0, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 3.383356070941337, |
|
"grad_norm": 0.21968340995681965, |
|
"learning_rate": 7.874267953991589e-06, |
|
"loss": 0.3933, |
|
"num_tokens": 1896452542.0, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.390177353342428, |
|
"grad_norm": 0.2094714463856819, |
|
"learning_rate": 7.812517552306842e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 1900341954.0, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 3.39699863574352, |
|
"grad_norm": 0.2091973568099767, |
|
"learning_rate": 7.751393444819021e-06, |
|
"loss": 0.3964, |
|
"num_tokens": 1904200124.0, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 3.4038199181446114, |
|
"grad_norm": 0.20170653729583482, |
|
"learning_rate": 7.690897575995838e-06, |
|
"loss": 0.3843, |
|
"num_tokens": 1908080037.0, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 3.4106412005457027, |
|
"grad_norm": 0.23178079669342305, |
|
"learning_rate": 7.63103187031961e-06, |
|
"loss": 0.3838, |
|
"num_tokens": 1911799657.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 3.417462482946794, |
|
"grad_norm": 0.23412806397059693, |
|
"learning_rate": 7.571798232226003e-06, |
|
"loss": 0.3951, |
|
"num_tokens": 1915633789.0, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 3.4242837653478855, |
|
"grad_norm": 0.2262063695861437, |
|
"learning_rate": 7.5131985460434985e-06, |
|
"loss": 0.3917, |
|
"num_tokens": 1919464952.0, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 3.431105047748977, |
|
"grad_norm": 0.21906232852290922, |
|
"learning_rate": 7.4552346759334285e-06, |
|
"loss": 0.3893, |
|
"num_tokens": 1923208468.0, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 3.437926330150068, |
|
"grad_norm": 0.19736609233735516, |
|
"learning_rate": 7.3979084658306535e-06, |
|
"loss": 0.3944, |
|
"num_tokens": 1927046873.0, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 3.4447476125511596, |
|
"grad_norm": 0.20436848581764777, |
|
"learning_rate": 7.34122173938495e-06, |
|
"loss": 0.3842, |
|
"num_tokens": 1930903387.0, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 3.451568894952251, |
|
"grad_norm": 0.20859725244629027, |
|
"learning_rate": 7.285176299902956e-06, |
|
"loss": 0.3849, |
|
"num_tokens": 1934670669.0, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 3.4583901773533423, |
|
"grad_norm": 0.20852983402642666, |
|
"learning_rate": 7.229773930290816e-06, |
|
"loss": 0.3904, |
|
"num_tokens": 1938387280.0, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 3.4652114597544337, |
|
"grad_norm": 0.19522927512353125, |
|
"learning_rate": 7.175016392997473e-06, |
|
"loss": 0.379, |
|
"num_tokens": 1942343131.0, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 3.472032742155525, |
|
"grad_norm": 0.20050884496880403, |
|
"learning_rate": 7.1209054299585965e-06, |
|
"loss": 0.3876, |
|
"num_tokens": 1945923779.0, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 3.4788540245566164, |
|
"grad_norm": 0.2057634574949913, |
|
"learning_rate": 7.0674427625411585e-06, |
|
"loss": 0.3923, |
|
"num_tokens": 1949676825.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 3.485675306957708, |
|
"grad_norm": 0.1972141823138041, |
|
"learning_rate": 7.014630091488686e-06, |
|
"loss": 0.3816, |
|
"num_tokens": 1953537310.0, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 3.4924965893587996, |
|
"grad_norm": 0.20211921503834646, |
|
"learning_rate": 6.962469096867162e-06, |
|
"loss": 0.3885, |
|
"num_tokens": 1957305222.0, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 3.499317871759891, |
|
"grad_norm": 0.21038414370930605, |
|
"learning_rate": 6.910961438011552e-06, |
|
"loss": 0.385, |
|
"num_tokens": 1960958556.0, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 3.5061391541609823, |
|
"grad_norm": 0.21665071977354775, |
|
"learning_rate": 6.860108753473055e-06, |
|
"loss": 0.3941, |
|
"num_tokens": 1964999544.0, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 3.5129604365620737, |
|
"grad_norm": 0.19728776121424704, |
|
"learning_rate": 6.809912660966959e-06, |
|
"loss": 0.3882, |
|
"num_tokens": 1968883238.0, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 3.519781718963165, |
|
"grad_norm": 0.22398814515162693, |
|
"learning_rate": 6.760374757321162e-06, |
|
"loss": 0.3949, |
|
"num_tokens": 1972701391.0, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 3.5266030013642564, |
|
"grad_norm": 0.2002701521202298, |
|
"learning_rate": 6.711496618425414e-06, |
|
"loss": 0.3921, |
|
"num_tokens": 1976508939.0, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 3.533424283765348, |
|
"grad_norm": 0.19910839515627227, |
|
"learning_rate": 6.663279799181149e-06, |
|
"loss": 0.3956, |
|
"num_tokens": 1980362572.0, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 3.540245566166439, |
|
"grad_norm": 0.2041823149699508, |
|
"learning_rate": 6.6157258334520285e-06, |
|
"loss": 0.3973, |
|
"num_tokens": 1984261238.0, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 3.547066848567531, |
|
"grad_norm": 0.2000905521095183, |
|
"learning_rate": 6.568836234015172e-06, |
|
"loss": 0.3897, |
|
"num_tokens": 1988149913.0, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 3.5538881309686223, |
|
"grad_norm": 0.18563333401941437, |
|
"learning_rate": 6.522612492512997e-06, |
|
"loss": 0.3869, |
|
"num_tokens": 1992084412.0, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 3.5607094133697137, |
|
"grad_norm": 0.20277027008106682, |
|
"learning_rate": 6.477056079405794e-06, |
|
"loss": 0.3886, |
|
"num_tokens": 1995921814.0, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 3.567530695770805, |
|
"grad_norm": 0.20852766092177452, |
|
"learning_rate": 6.432168443924929e-06, |
|
"loss": 0.3883, |
|
"num_tokens": 1999809637.0, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 3.5743519781718964, |
|
"grad_norm": 0.21203977784508607, |
|
"learning_rate": 6.387951014026755e-06, |
|
"loss": 0.384, |
|
"num_tokens": 2003526409.0, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 3.581173260572988, |
|
"grad_norm": 0.2069840864292817, |
|
"learning_rate": 6.3444051963471806e-06, |
|
"loss": 0.3979, |
|
"num_tokens": 2007455906.0, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 3.587994542974079, |
|
"grad_norm": 0.20048934543362684, |
|
"learning_rate": 6.301532376156921e-06, |
|
"loss": 0.3934, |
|
"num_tokens": 2011337751.0, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 3.5948158253751705, |
|
"grad_norm": 0.19834381113774482, |
|
"learning_rate": 6.259333917317436e-06, |
|
"loss": 0.3992, |
|
"num_tokens": 2015319415.0, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 3.601637107776262, |
|
"grad_norm": 0.21282358337605378, |
|
"learning_rate": 6.21781116223753e-06, |
|
"loss": 0.4006, |
|
"num_tokens": 2019219253.0, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 3.6084583901773533, |
|
"grad_norm": 0.2029670555965298, |
|
"learning_rate": 6.176965431830666e-06, |
|
"loss": 0.3984, |
|
"num_tokens": 2023102339.0, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 3.6152796725784446, |
|
"grad_norm": 0.21004596795864844, |
|
"learning_rate": 6.136798025472937e-06, |
|
"loss": 0.3904, |
|
"num_tokens": 2026777557.0, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 3.622100954979536, |
|
"grad_norm": 0.20450488637042483, |
|
"learning_rate": 6.097310220961715e-06, |
|
"loss": 0.3926, |
|
"num_tokens": 2030544635.0, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 3.6289222373806274, |
|
"grad_norm": 0.20780498338670383, |
|
"learning_rate": 6.058503274475029e-06, |
|
"loss": 0.3959, |
|
"num_tokens": 2034457342.0, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 3.6357435197817187, |
|
"grad_norm": 0.20299175264729932, |
|
"learning_rate": 6.020378420531589e-06, |
|
"loss": 0.3935, |
|
"num_tokens": 2038160685.0, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 3.64256480218281, |
|
"grad_norm": 0.203714903198161, |
|
"learning_rate": 5.982936871951507e-06, |
|
"loss": 0.3893, |
|
"num_tokens": 2042057043.0, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 3.649386084583902, |
|
"grad_norm": 0.18615329351658916, |
|
"learning_rate": 5.946179819817731e-06, |
|
"loss": 0.381, |
|
"num_tokens": 2045931129.0, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 3.6562073669849933, |
|
"grad_norm": 0.2045589475870847, |
|
"learning_rate": 5.910108433438151e-06, |
|
"loss": 0.3942, |
|
"num_tokens": 2049650512.0, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 3.6630286493860846, |
|
"grad_norm": 0.20203376952951235, |
|
"learning_rate": 5.874723860308384e-06, |
|
"loss": 0.3822, |
|
"num_tokens": 2053328762.0, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 3.669849931787176, |
|
"grad_norm": 0.19587913638609752, |
|
"learning_rate": 5.840027226075295e-06, |
|
"loss": 0.3862, |
|
"num_tokens": 2057162099.0, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 3.6766712141882674, |
|
"grad_norm": 0.20886702731734277, |
|
"learning_rate": 5.806019634501175e-06, |
|
"loss": 0.3961, |
|
"num_tokens": 2060972297.0, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 3.6834924965893587, |
|
"grad_norm": 0.21067356253291641, |
|
"learning_rate": 5.772702167428618e-06, |
|
"loss": 0.3802, |
|
"num_tokens": 2064586221.0, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.69031377899045, |
|
"grad_norm": 0.20718394506562157, |
|
"learning_rate": 5.7400758847461315e-06, |
|
"loss": 0.3983, |
|
"num_tokens": 2068459718.0, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 3.6971350613915415, |
|
"grad_norm": 0.19555241045212834, |
|
"learning_rate": 5.7081418243544e-06, |
|
"loss": 0.3895, |
|
"num_tokens": 2072325609.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 3.7039563437926333, |
|
"grad_norm": 0.1963361038784244, |
|
"learning_rate": 5.676901002133273e-06, |
|
"loss": 0.3929, |
|
"num_tokens": 2076390764.0, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 3.7107776261937246, |
|
"grad_norm": 0.20786774861006216, |
|
"learning_rate": 5.646354411909446e-06, |
|
"loss": 0.4022, |
|
"num_tokens": 2080252567.0, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 3.717598908594816, |
|
"grad_norm": 0.20572707253355813, |
|
"learning_rate": 5.616503025424856e-06, |
|
"loss": 0.3877, |
|
"num_tokens": 2084020477.0, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 3.7244201909959074, |
|
"grad_norm": 0.20508204818855866, |
|
"learning_rate": 5.587347792305745e-06, |
|
"loss": 0.3832, |
|
"num_tokens": 2087723825.0, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 3.7312414733969987, |
|
"grad_norm": 0.19829256497358866, |
|
"learning_rate": 5.558889640032476e-06, |
|
"loss": 0.3959, |
|
"num_tokens": 2091548707.0, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 3.73806275579809, |
|
"grad_norm": 0.20585067559570674, |
|
"learning_rate": 5.531129473910013e-06, |
|
"loss": 0.395, |
|
"num_tokens": 2095277696.0, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 3.7448840381991815, |
|
"grad_norm": 0.20584618251918524, |
|
"learning_rate": 5.504068177039132e-06, |
|
"loss": 0.3843, |
|
"num_tokens": 2099065289.0, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 3.751705320600273, |
|
"grad_norm": 0.1961381624274159, |
|
"learning_rate": 5.477706610288317e-06, |
|
"loss": 0.3914, |
|
"num_tokens": 2102889019.0, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 3.758526603001364, |
|
"grad_norm": 0.20088944941627254, |
|
"learning_rate": 5.45204561226638e-06, |
|
"loss": 0.3895, |
|
"num_tokens": 2106610423.0, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 3.7653478854024556, |
|
"grad_norm": 0.19125391439188097, |
|
"learning_rate": 5.42708599929578e-06, |
|
"loss": 0.396, |
|
"num_tokens": 2110417885.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 3.772169167803547, |
|
"grad_norm": 0.21608047273020625, |
|
"learning_rate": 5.402828565386665e-06, |
|
"loss": 0.3968, |
|
"num_tokens": 2114204416.0, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 3.7789904502046383, |
|
"grad_norm": 0.1942534200680033, |
|
"learning_rate": 5.3792740822116025e-06, |
|
"loss": 0.3886, |
|
"num_tokens": 2118096713.0, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 3.7858117326057297, |
|
"grad_norm": 0.19802316385817678, |
|
"learning_rate": 5.356423299081025e-06, |
|
"loss": 0.4007, |
|
"num_tokens": 2121992916.0, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 3.792633015006821, |
|
"grad_norm": 0.20270167705867925, |
|
"learning_rate": 5.33427694291941e-06, |
|
"loss": 0.393, |
|
"num_tokens": 2125730480.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 3.799454297407913, |
|
"grad_norm": 0.20021094011750865, |
|
"learning_rate": 5.31283571824215e-06, |
|
"loss": 0.3879, |
|
"num_tokens": 2129639321.0, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 3.806275579809004, |
|
"grad_norm": 0.20135100813170187, |
|
"learning_rate": 5.292100307133135e-06, |
|
"loss": 0.3876, |
|
"num_tokens": 2133458245.0, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.8130968622100956, |
|
"grad_norm": 0.20308852278461587, |
|
"learning_rate": 5.27207136922305e-06, |
|
"loss": 0.3924, |
|
"num_tokens": 2137312410.0, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 3.819918144611187, |
|
"grad_norm": 0.19442660123469885, |
|
"learning_rate": 5.25274954166841e-06, |
|
"loss": 0.3905, |
|
"num_tokens": 2141197422.0, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.8267394270122783, |
|
"grad_norm": 0.2038756065211056, |
|
"learning_rate": 5.234135439131267e-06, |
|
"loss": 0.3891, |
|
"num_tokens": 2145011937.0, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 3.8335607094133697, |
|
"grad_norm": 0.20340134857081124, |
|
"learning_rate": 5.2162296537596785e-06, |
|
"loss": 0.3868, |
|
"num_tokens": 2148857263.0, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.840381991814461, |
|
"grad_norm": 0.19451669508345293, |
|
"learning_rate": 5.199032755168853e-06, |
|
"loss": 0.3906, |
|
"num_tokens": 2152616579.0, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 3.8472032742155524, |
|
"grad_norm": 0.19947015975252247, |
|
"learning_rate": 5.1825452904230384e-06, |
|
"loss": 0.3878, |
|
"num_tokens": 2156280215.0, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 3.854024556616644, |
|
"grad_norm": 0.19242022707343331, |
|
"learning_rate": 5.166767784018122e-06, |
|
"loss": 0.3908, |
|
"num_tokens": 2160088451.0, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 3.8608458390177356, |
|
"grad_norm": 0.20797387240019166, |
|
"learning_rate": 5.151700737864934e-06, |
|
"loss": 0.3894, |
|
"num_tokens": 2163915194.0, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 3.867667121418827, |
|
"grad_norm": 0.2017238446572756, |
|
"learning_rate": 5.137344631273288e-06, |
|
"loss": 0.3969, |
|
"num_tokens": 2167621168.0, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 3.8744884038199183, |
|
"grad_norm": 0.20071816929517722, |
|
"learning_rate": 5.123699920936733e-06, |
|
"loss": 0.3951, |
|
"num_tokens": 2171481655.0, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 3.8813096862210097, |
|
"grad_norm": 0.19141590926200375, |
|
"learning_rate": 5.110767040918028e-06, |
|
"loss": 0.3956, |
|
"num_tokens": 2175244240.0, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 3.888130968622101, |
|
"grad_norm": 0.1998247337143825, |
|
"learning_rate": 5.0985464026353306e-06, |
|
"loss": 0.3921, |
|
"num_tokens": 2178959600.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.8949522510231924, |
|
"grad_norm": 0.1866653423880014, |
|
"learning_rate": 5.0870383948491004e-06, |
|
"loss": 0.3894, |
|
"num_tokens": 2182711952.0, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 3.901773533424284, |
|
"grad_norm": 0.19710468274556872, |
|
"learning_rate": 5.07624338364975e-06, |
|
"loss": 0.3885, |
|
"num_tokens": 2186594283.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 3.908594815825375, |
|
"grad_norm": 0.21264165496615145, |
|
"learning_rate": 5.066161712445985e-06, |
|
"loss": 0.3865, |
|
"num_tokens": 2190266795.0, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 3.9154160982264665, |
|
"grad_norm": 0.2101090061168048, |
|
"learning_rate": 5.0567937019538814e-06, |
|
"loss": 0.397, |
|
"num_tokens": 2194018553.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 3.922237380627558, |
|
"grad_norm": 0.20800078607789743, |
|
"learning_rate": 5.0481396501866925e-06, |
|
"loss": 0.3948, |
|
"num_tokens": 2197673799.0, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 3.9290586630286493, |
|
"grad_norm": 0.19793740700955814, |
|
"learning_rate": 5.040199832445351e-06, |
|
"loss": 0.3923, |
|
"num_tokens": 2201714258.0, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 3.9358799454297406, |
|
"grad_norm": 0.20386936192836522, |
|
"learning_rate": 5.032974501309735e-06, |
|
"loss": 0.3862, |
|
"num_tokens": 2205348220.0, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 3.942701227830832, |
|
"grad_norm": 0.2091053406878887, |
|
"learning_rate": 5.026463886630607e-06, |
|
"loss": 0.4021, |
|
"num_tokens": 2209250198.0, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 3.9495225102319234, |
|
"grad_norm": 0.19787040691237082, |
|
"learning_rate": 5.020668195522323e-06, |
|
"loss": 0.3926, |
|
"num_tokens": 2213251612.0, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 3.956343792633015, |
|
"grad_norm": 0.20667483997814673, |
|
"learning_rate": 5.015587612356232e-06, |
|
"loss": 0.3832, |
|
"num_tokens": 2217075610.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.9631650750341065, |
|
"grad_norm": 0.1986326433013966, |
|
"learning_rate": 5.011222298754814e-06, |
|
"loss": 0.3889, |
|
"num_tokens": 2220901736.0, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 3.969986357435198, |
|
"grad_norm": 0.218645478760147, |
|
"learning_rate": 5.007572393586543e-06, |
|
"loss": 0.3784, |
|
"num_tokens": 2224698938.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 3.9768076398362893, |
|
"grad_norm": 0.20263059309127532, |
|
"learning_rate": 5.004638012961454e-06, |
|
"loss": 0.3852, |
|
"num_tokens": 2228505486.0, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 3.9836289222373806, |
|
"grad_norm": 0.21296036316974268, |
|
"learning_rate": 5.002419250227476e-06, |
|
"loss": 0.3879, |
|
"num_tokens": 2232055237.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 3.990450204638472, |
|
"grad_norm": 0.20148472020199598, |
|
"learning_rate": 5.000916175967434e-06, |
|
"loss": 0.3939, |
|
"num_tokens": 2235867011.0, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 3.9972714870395634, |
|
"grad_norm": 0.21024070606119777, |
|
"learning_rate": 5.000128837996827e-06, |
|
"loss": 0.3915, |
|
"num_tokens": 2239685763.0, |
|
"step": 2930 |
{
"epoch": 4.0,
"step": 2932,
"total_flos": 4583117197672448.0,
"train_loss": 0.0,
"train_runtime": 0.0145,
"train_samples_per_second": 25827825.108,
"train_steps_per_second": 201975.78
}
],
"logging_steps": 5,
"max_steps": 2932,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4583117197672448.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}