{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06756756756756757, "grad_norm": 4.363862700534323, "learning_rate": 6.7567567567567575e-06, "loss": 1.4367, "step": 5 }, { "epoch": 0.13513513513513514, "grad_norm": 2.705123580523887, "learning_rate": 1.3513513513513515e-05, "loss": 1.2876, "step": 10 }, { "epoch": 0.20270270270270271, "grad_norm": 1.9103615735976536, "learning_rate": 2.0270270270270273e-05, "loss": 1.1519, "step": 15 }, { "epoch": 0.2702702702702703, "grad_norm": 1.4290192342216463, "learning_rate": 2.702702702702703e-05, "loss": 1.0869, "step": 20 }, { "epoch": 0.33783783783783783, "grad_norm": 1.6272402064271356, "learning_rate": 3.3783783783783784e-05, "loss": 1.0231, "step": 25 }, { "epoch": 0.40540540540540543, "grad_norm": 1.5586320099924926, "learning_rate": 4.0540540540540545e-05, "loss": 1.0081, "step": 30 }, { "epoch": 0.47297297297297297, "grad_norm": 1.495486561607122, "learning_rate": 4.72972972972973e-05, "loss": 0.9979, "step": 35 }, { "epoch": 0.5405405405405406, "grad_norm": 1.5561925026806689, "learning_rate": 4.9997978016429605e-05, "loss": 1.0, "step": 40 }, { "epoch": 0.6081081081081081, "grad_norm": 1.2919640366032352, "learning_rate": 4.9985622766211935e-05, "loss": 0.9753, "step": 45 }, { "epoch": 0.6756756756756757, "grad_norm": 1.1989455363385786, "learning_rate": 4.996204175076325e-05, "loss": 0.962, "step": 50 }, { "epoch": 0.7432432432432432, "grad_norm": 1.3974962823237607, "learning_rate": 4.99272467427147e-05, "loss": 0.9677, "step": 55 }, { "epoch": 0.8108108108108109, "grad_norm": 1.0651268932789728, "learning_rate": 4.9881255113192526e-05, "loss": 0.9513, "step": 60 }, { "epoch": 0.8783783783783784, "grad_norm": 1.155576062356755, "learning_rate": 4.982408982314565e-05, "loss": 0.9619, "step": 65 }, { "epoch": 0.9459459459459459, "grad_norm": 1.153407826787955, "learning_rate": 4.975577941188258e-05, "loss": 0.9535, "step": 70 }, { "epoch": 1.0135135135135136, "grad_norm": 1.3775017230764597, "learning_rate": 4.967635798282344e-05, "loss": 0.9128, "step": 75 }, { "epoch": 1.0810810810810811, "grad_norm": 1.1598834626986259, "learning_rate": 4.958586518647407e-05, "loss": 0.8023, "step": 80 }, { "epoch": 1.1486486486486487, "grad_norm": 1.1199196718082898, "learning_rate": 4.9484346200630855e-05, "loss": 0.7744, "step": 85 }, { "epoch": 1.2162162162162162, "grad_norm": 1.1102270321512686, "learning_rate": 4.937185170782607e-05, "loss": 0.7707, "step": 90 }, { "epoch": 1.2837837837837838, "grad_norm": 1.0203324363707762, "learning_rate": 4.9248437870025035e-05, "loss": 0.7768, "step": 95 }, { "epoch": 1.3513513513513513, "grad_norm": 0.9984006736146105, "learning_rate": 4.911416630058772e-05, "loss": 0.7818, "step": 100 }, { "epoch": 1.4189189189189189, "grad_norm": 1.0288042499970187, "learning_rate": 4.896910403350873e-05, "loss": 0.7864, "step": 105 }, { "epoch": 1.4864864864864864, "grad_norm": 1.0840779872937458, "learning_rate": 4.88133234899512e-05, "loss": 0.7836, "step": 110 }, { "epoch": 1.554054054054054, "grad_norm": 1.0695079470515958, "learning_rate": 4.864690244209105e-05, "loss": 0.7957, "step": 115 }, { "epoch": 1.6216216216216215, "grad_norm": 1.103923214171777, "learning_rate": 4.8469923974289874e-05, "loss": 0.7943, "step": 120 }, { "epoch": 1.689189189189189, "grad_norm": 0.9711733919756039, "learning_rate": 4.828247644161577e-05, "loss": 0.7776, "step": 125 }, { "epoch": 1.7567567567567568, "grad_norm": 1.064368713968981, "learning_rate": 4.808465342573274e-05, "loss": 0.7932, "step": 130 }, { "epoch": 1.8243243243243243, "grad_norm": 1.0013597653110469, "learning_rate": 4.787655368818087e-05, "loss": 0.7923, "step": 135 }, { "epoch": 1.8918918918918919, "grad_norm": 0.9961148156869974, "learning_rate": 4.765828112107034e-05, "loss": 0.7802, "step": 140 }, { "epoch": 1.9594594594594594, "grad_norm": 1.037903883462848, "learning_rate": 4.742994469521421e-05, "loss": 0.7854, "step": 145 }, { "epoch": 2.027027027027027, "grad_norm": 1.2622580047357654, "learning_rate": 4.719165840572557e-05, "loss": 0.7063, "step": 150 }, { "epoch": 2.0945945945945947, "grad_norm": 1.5388907462936379, "learning_rate": 4.694354121510644e-05, "loss": 0.5487, "step": 155 }, { "epoch": 2.1621621621621623, "grad_norm": 1.0947103494672035, "learning_rate": 4.668571699385668e-05, "loss": 0.5219, "step": 160 }, { "epoch": 2.22972972972973, "grad_norm": 1.1651788766706233, "learning_rate": 4.641831445863265e-05, "loss": 0.5231, "step": 165 }, { "epoch": 2.2972972972972974, "grad_norm": 1.1521096427634065, "learning_rate": 4.614146710798645e-05, "loss": 0.5234, "step": 170 }, { "epoch": 2.364864864864865, "grad_norm": 1.1421748306651989, "learning_rate": 4.585531315571788e-05, "loss": 0.5294, "step": 175 }, { "epoch": 2.4324324324324325, "grad_norm": 1.1514841731143928, "learning_rate": 4.555999546187229e-05, "loss": 0.5381, "step": 180 }, { "epoch": 2.5, "grad_norm": 1.0852945108986, "learning_rate": 4.5255661461418854e-05, "loss": 0.5493, "step": 185 }, { "epoch": 2.5675675675675675, "grad_norm": 1.1684382683077765, "learning_rate": 4.4942463090644896e-05, "loss": 0.5525, "step": 190 }, { "epoch": 2.635135135135135, "grad_norm": 1.0788294127649594, "learning_rate": 4.462055671130289e-05, "loss": 0.5546, "step": 195 }, { "epoch": 2.7027027027027026, "grad_norm": 1.0795463247201529, "learning_rate": 4.4290103032548094e-05, "loss": 0.5619, "step": 200 }, { "epoch": 2.77027027027027, "grad_norm": 1.016128366175836, "learning_rate": 4.395126703070589e-05, "loss": 0.5614, "step": 205 }, { "epoch": 2.8378378378378377, "grad_norm": 1.0844580663640953, "learning_rate": 4.360421786690862e-05, "loss": 0.562, "step": 210 }, { "epoch": 2.9054054054054053, "grad_norm": 1.1432810387672725, "learning_rate": 4.324912880264326e-05, "loss": 0.576, "step": 215 }, { "epoch": 2.972972972972973, "grad_norm": 1.0380987391797107, "learning_rate": 4.288617711325207e-05, "loss": 0.5697, "step": 220 }, { "epoch": 3.0405405405405403, "grad_norm": 1.2503070135517047, "learning_rate": 4.251554399942928e-05, "loss": 0.4353, "step": 225 }, { "epoch": 3.108108108108108, "grad_norm": 1.6777536935067219, "learning_rate": 4.21374144967581e-05, "loss": 0.3142, "step": 230 }, { "epoch": 3.175675675675676, "grad_norm": 1.1024202684426092, "learning_rate": 4.1751977383333224e-05, "loss": 0.3107, "step": 235 }, { "epoch": 3.2432432432432434, "grad_norm": 1.1910388632740423, "learning_rate": 4.1359425085514906e-05, "loss": 0.3036, "step": 240 }, { "epoch": 3.310810810810811, "grad_norm": 1.231739603690704, "learning_rate": 4.095995358186162e-05, "loss": 0.3016, "step": 245 }, { "epoch": 3.3783783783783785, "grad_norm": 1.1140736912933251, "learning_rate": 4.055376230528936e-05, "loss": 0.3056, "step": 250 }, { "epoch": 3.445945945945946, "grad_norm": 1.1025657791763896, "learning_rate": 4.0141054043506406e-05, "loss": 0.3185, "step": 255 }, { "epoch": 3.5135135135135136, "grad_norm": 1.1791795190101504, "learning_rate": 3.972203483777315e-05, "loss": 0.3223, "step": 260 }, { "epoch": 3.581081081081081, "grad_norm": 1.1814268159199415, "learning_rate": 3.929691388003772e-05, "loss": 0.3282, "step": 265 }, { "epoch": 3.6486486486486487, "grad_norm": 1.2187792717576609, "learning_rate": 3.886590340849852e-05, "loss": 0.3282, "step": 270 }, { "epoch": 3.7162162162162162, "grad_norm": 1.0539496180965828, "learning_rate": 3.842921860164607e-05, "loss": 0.3317, "step": 275 }, { "epoch": 3.7837837837837838, "grad_norm": 1.0934561034346522, "learning_rate": 3.798707747083694e-05, "loss": 0.3284, "step": 280 }, { "epoch": 3.8513513513513513, "grad_norm": 1.1471678068019302, "learning_rate": 3.753970075145322e-05, "loss": 0.3371, "step": 285 }, { "epoch": 3.918918918918919, "grad_norm": 6.758529885837581, "learning_rate": 3.7087311792702265e-05, "loss": 0.3967, "step": 290 }, { "epoch": 3.9864864864864864, "grad_norm": 1.179358411267412, "learning_rate": 3.663013644611139e-05, "loss": 0.3428, "step": 295 }, { "epoch": 4.054054054054054, "grad_norm": 1.0255720909850174, "learning_rate": 3.616840295277328e-05, "loss": 0.2063, "step": 300 }, { "epoch": 4.121621621621622, "grad_norm": 1.1681577021267184, "learning_rate": 3.5702341829398525e-05, "loss": 0.1656, "step": 305 }, { "epoch": 4.1891891891891895, "grad_norm": 1.016291058117435, "learning_rate": 3.523218575323198e-05, "loss": 0.1664, "step": 310 }, { "epoch": 4.256756756756757, "grad_norm": 0.9723982730571757, "learning_rate": 3.475816944589058e-05, "loss": 0.162, "step": 315 }, { "epoch": 4.324324324324325, "grad_norm": 0.990019839692624, "learning_rate": 3.4280529556180404e-05, "loss": 0.1607, "step": 320 }, { "epoch": 4.391891891891892, "grad_norm": 1.067612756627841, "learning_rate": 3.379950454195172e-05, "loss": 0.163, "step": 325 }, { "epoch": 4.45945945945946, "grad_norm": 1.0726343998852998, "learning_rate": 3.331533455105084e-05, "loss": 0.1709, "step": 330 }, { "epoch": 4.527027027027027, "grad_norm": 1.1256065554143042, "learning_rate": 3.2828261301428206e-05, "loss": 0.1717, "step": 335 }, { "epoch": 4.594594594594595, "grad_norm": 1.1551746250455754, "learning_rate": 3.23385279604627e-05, "loss": 0.1728, "step": 340 }, { "epoch": 4.662162162162162, "grad_norm": 1.1309776267452107, "learning_rate": 3.18463790235623e-05, "loss": 0.1698, "step": 345 }, { "epoch": 4.72972972972973, "grad_norm": 1.0126763046166452, "learning_rate": 3.135206019210167e-05, "loss": 0.1794, "step": 350 }, { "epoch": 4.797297297297297, "grad_norm": 1.0529263553265555, "learning_rate": 3.085581825075782e-05, "loss": 0.1756, "step": 355 }, { "epoch": 4.864864864864865, "grad_norm": 1.0824352188484838, "learning_rate": 3.0357900944304774e-05, "loss": 0.1746, "step": 360 }, { "epoch": 4.9324324324324325, "grad_norm": 1.0284704574724253, "learning_rate": 2.9858556853929048e-05, "loss": 0.1794, "step": 365 }, { "epoch": 5.0, "grad_norm": 1.0663445785184333, "learning_rate": 2.9358035273127483e-05, "loss": 0.1758, "step": 370 }, { "epoch": 5.0675675675675675, "grad_norm": 0.8110449745410689, "learning_rate": 2.8856586083249487e-05, "loss": 0.0799, "step": 375 }, { "epoch": 5.135135135135135, "grad_norm": 0.8986402275236202, "learning_rate": 2.83544596287458e-05, "loss": 0.0808, "step": 380 }, { "epoch": 5.202702702702703, "grad_norm": 0.9006038868524785, "learning_rate": 2.785190659218604e-05, "loss": 0.0766, "step": 385 }, { "epoch": 5.27027027027027, "grad_norm": 0.8349550048269416, "learning_rate": 2.7349177869107462e-05, "loss": 0.0771, "step": 390 }, { "epoch": 5.337837837837838, "grad_norm": 0.7752306018868771, "learning_rate": 2.684652444275741e-05, "loss": 0.0782, "step": 395 }, { "epoch": 5.405405405405405, "grad_norm": 0.7897117962987327, "learning_rate": 2.634419725879193e-05, "loss": 0.0775, "step": 400 }, { "epoch": 5.472972972972973, "grad_norm": 0.8055715103673516, "learning_rate": 2.58424470999932e-05, "loss": 0.0783, "step": 405 }, { "epoch": 5.54054054054054, "grad_norm": 0.9037399195717022, "learning_rate": 2.534152446106825e-05, "loss": 0.0762, "step": 410 }, { "epoch": 5.608108108108108, "grad_norm": 0.8301708524422988, "learning_rate": 2.4841679423591523e-05, "loss": 0.0762, "step": 415 }, { "epoch": 5.675675675675675, "grad_norm": 0.8143273116216752, "learning_rate": 2.4343161531153647e-05, "loss": 0.0781, "step": 420 }, { "epoch": 5.743243243243243, "grad_norm": 0.8439330014425397, "learning_rate": 2.3846219664778824e-05, "loss": 0.0793, "step": 425 }, { "epoch": 5.8108108108108105, "grad_norm": 0.8142776194662777, "learning_rate": 2.3351101918672985e-05, "loss": 0.0795, "step": 430 }, { "epoch": 5.878378378378378, "grad_norm": 0.8039988882173097, "learning_rate": 2.2858055476364822e-05, "loss": 0.0784, "step": 435 }, { "epoch": 5.945945945945946, "grad_norm": 0.7425653621209469, "learning_rate": 2.2367326487301317e-05, "loss": 0.0775, "step": 440 }, { "epoch": 6.013513513513513, "grad_norm": 0.5266747729292991, "learning_rate": 2.1879159943959686e-05, "loss": 0.0698, "step": 445 }, { "epoch": 6.081081081081081, "grad_norm": 0.5903636908264376, "learning_rate": 2.139379955953686e-05, "loss": 0.037, "step": 450 }, { "epoch": 6.148648648648648, "grad_norm": 0.7057760060859144, "learning_rate": 2.0911487646277623e-05, "loss": 0.0354, "step": 455 }, { "epoch": 6.216216216216216, "grad_norm": 0.6615031551157701, "learning_rate": 2.0432464994502203e-05, "loss": 0.0338, "step": 460 }, { "epoch": 6.283783783783784, "grad_norm": 0.6697026743057446, "learning_rate": 1.995697075239365e-05, "loss": 0.0333, "step": 465 }, { "epoch": 6.351351351351352, "grad_norm": 0.5843277936844908, "learning_rate": 1.9485242306605028e-05, "loss": 0.0344, "step": 470 }, { "epoch": 6.418918918918919, "grad_norm": 0.6767666813757078, "learning_rate": 1.9017515163746058e-05, "loss": 0.0333, "step": 475 }, { "epoch": 6.486486486486487, "grad_norm": 0.612009036541925, "learning_rate": 1.855402283280836e-05, "loss": 0.0335, "step": 480 }, { "epoch": 6.554054054054054, "grad_norm": 0.6242119830720047, "learning_rate": 1.8094996708587958e-05, "loss": 0.0312, "step": 485 }, { "epoch": 6.621621621621622, "grad_norm": 0.6635428778841779, "learning_rate": 1.7640665956163306e-05, "loss": 0.0343, "step": 490 }, { "epoch": 6.6891891891891895, "grad_norm": 0.5640405200429803, "learning_rate": 1.719125739648648e-05, "loss": 0.0331, "step": 495 }, { "epoch": 6.756756756756757, "grad_norm": 0.5504509159363488, "learning_rate": 1.6746995393144668e-05, "loss": 0.0302, "step": 500 }, { "epoch": 6.824324324324325, "grad_norm": 0.6458168054375821, "learning_rate": 1.6308101740348433e-05, "loss": 0.0328, "step": 505 }, { "epoch": 6.891891891891892, "grad_norm": 0.5736755972086186, "learning_rate": 1.5874795552202773e-05, "loss": 0.0297, "step": 510 }, { "epoch": 6.95945945945946, "grad_norm": 0.5847553965309075, "learning_rate": 1.5447293153316163e-05, "loss": 0.0322, "step": 515 }, { "epoch": 7.027027027027027, "grad_norm": 0.3234292566878911, "learning_rate": 1.5025807970802252e-05, "loss": 0.0237, "step": 520 }, { "epoch": 7.094594594594595, "grad_norm": 0.3229946666578016, "learning_rate": 1.4610550427728103e-05, "loss": 0.0127, "step": 525 }, { "epoch": 7.162162162162162, "grad_norm": 0.39666829499344397, "learning_rate": 1.4201727838062181e-05, "loss": 0.0136, "step": 530 }, { "epoch": 7.22972972972973, "grad_norm": 0.4146267850301746, "learning_rate": 1.3799544303174514e-05, "loss": 0.0129, "step": 535 }, { "epoch": 7.297297297297297, "grad_norm": 0.4006318813346767, "learning_rate": 1.3404200609940754e-05, "loss": 0.0125, "step": 540 }, { "epoch": 7.364864864864865, "grad_norm": 0.32483290169015805, "learning_rate": 1.3015894130500977e-05, "loss": 0.0121, "step": 545 }, { "epoch": 7.4324324324324325, "grad_norm": 0.38263371508283517, "learning_rate": 1.2634818723723174e-05, "loss": 0.0123, "step": 550 }, { "epoch": 7.5, "grad_norm": 0.3584723506304905, "learning_rate": 1.2261164638420832e-05, "loss": 0.0117, "step": 555 }, { "epoch": 7.5675675675675675, "grad_norm": 0.32948849737072256, "learning_rate": 1.1895118418372734e-05, "loss": 0.012, "step": 560 }, { "epoch": 7.635135135135135, "grad_norm": 0.3554679755053725, "learning_rate": 1.1536862809192518e-05, "loss": 0.0123, "step": 565 }, { "epoch": 7.702702702702703, "grad_norm": 0.38755695040762, "learning_rate": 1.1186576667094342e-05, "loss": 0.0126, "step": 570 }, { "epoch": 7.77027027027027, "grad_norm": 0.3596675709446449, "learning_rate": 1.0844434869600428e-05, "loss": 0.0119, "step": 575 }, { "epoch": 7.837837837837838, "grad_norm": 0.368664013370026, "learning_rate": 1.0510608228234848e-05, "loss": 0.0121, "step": 580 }, { "epoch": 7.905405405405405, "grad_norm": 0.3689497639593192, "learning_rate": 1.0185263403247256e-05, "loss": 0.0121, "step": 585 }, { "epoch": 7.972972972972973, "grad_norm": 0.37329998652806745, "learning_rate": 9.868562820409103e-06, "loss": 0.0116, "step": 590 }, { "epoch": 8.04054054054054, "grad_norm": 0.17516646949984094, "learning_rate": 9.560664589923895e-06, "loss": 0.0081, "step": 595 }, { "epoch": 8.108108108108109, "grad_norm": 0.19701860916567984, "learning_rate": 9.261722427491953e-06, "loss": 0.0053, "step": 600 }, { "epoch": 8.175675675675675, "grad_norm": 0.16761504423339055, "learning_rate": 8.971885577569058e-06, "loss": 0.0047, "step": 605 }, { "epoch": 8.243243243243244, "grad_norm": 0.23172610604646424, "learning_rate": 8.691298738857432e-06, "loss": 0.0044, "step": 610 }, { "epoch": 8.31081081081081, "grad_norm": 0.1734154562778805, "learning_rate": 8.420101992066028e-06, "loss": 0.004, "step": 615 }, { "epoch": 8.378378378378379, "grad_norm": 0.18641414092933023, "learning_rate": 8.158430729976372e-06, "loss": 0.0043, "step": 620 }, { "epoch": 8.445945945945946, "grad_norm": 0.2824274655739082, "learning_rate": 7.906415589848834e-06, "loss": 0.0045, "step": 625 }, { "epoch": 8.513513513513514, "grad_norm": 0.16976044334904716, "learning_rate": 7.664182388203037e-06, "loss": 0.0046, "step": 630 }, { "epoch": 8.58108108108108, "grad_norm": 0.24818454585022467, "learning_rate": 7.4318520580049444e-06, "loss": 0.004, "step": 635 }, { "epoch": 8.64864864864865, "grad_norm": 0.18073174531439173, "learning_rate": 7.209540588292083e-06, "loss": 0.0039, "step": 640 }, { "epoch": 8.716216216216216, "grad_norm": 0.23113449169489975, "learning_rate": 6.9973589662669455e-06, "loss": 0.0043, "step": 645 }, { "epoch": 8.783783783783784, "grad_norm": 0.2779198237595732, "learning_rate": 6.7954131218875404e-06, "loss": 0.0046, "step": 650 }, { "epoch": 8.85135135135135, "grad_norm": 0.21502404938318828, "learning_rate": 6.603803874982687e-06, "loss": 0.0039, "step": 655 }, { "epoch": 8.91891891891892, "grad_norm": 0.2706988607378338, "learning_rate": 6.422626884918559e-06, "loss": 0.0043, "step": 660 }, { "epoch": 8.986486486486486, "grad_norm": 0.19070682071375314, "learning_rate": 6.2519726028415145e-06, "loss": 0.0041, "step": 665 }, { "epoch": 9.054054054054054, "grad_norm": 0.13080557115625388, "learning_rate": 6.091926226521089e-06, "loss": 0.0025, "step": 670 }, { "epoch": 9.121621621621621, "grad_norm": 0.1076753182673924, "learning_rate": 5.942567657815696e-06, "loss": 0.0021, "step": 675 }, { "epoch": 9.18918918918919, "grad_norm": 0.1002126317760094, "learning_rate": 5.8039714627822754e-06, "loss": 0.0018, "step": 680 }, { "epoch": 9.256756756756756, "grad_norm": 0.12775201749030246, "learning_rate": 5.676206834449797e-06, "loss": 0.0021, "step": 685 }, { "epoch": 9.324324324324325, "grad_norm": 0.07863769364063682, "learning_rate": 5.55933755827518e-06, "loss": 0.0016, "step": 690 }, { "epoch": 9.391891891891891, "grad_norm": 0.12254710433245468, "learning_rate": 5.453421980298957e-06, "loss": 0.0019, "step": 695 }, { "epoch": 9.45945945945946, "grad_norm": 0.13332972976618485, "learning_rate": 5.358512978016445e-06, "loss": 0.0018, "step": 700 }, { "epoch": 9.527027027027026, "grad_norm": 0.12204772827962151, "learning_rate": 5.27465793397911e-06, "loss": 0.0019, "step": 705 }, { "epoch": 9.594594594594595, "grad_norm": 0.25418976259049003, "learning_rate": 5.201898712139201e-06, "loss": 0.0026, "step": 710 }, { "epoch": 9.662162162162161, "grad_norm": 0.12072355515085566, "learning_rate": 5.1402716369495194e-06, "loss": 0.0018, "step": 715 }, { "epoch": 9.72972972972973, "grad_norm": 0.1155679459376018, "learning_rate": 5.089807475228711e-06, "loss": 0.0021, "step": 720 }, { "epoch": 9.797297297297296, "grad_norm": 0.17751733060453512, "learning_rate": 5.050531420801205e-06, "loss": 0.002, "step": 725 }, { "epoch": 9.864864864864865, "grad_norm": 0.08558684167277145, "learning_rate": 5.022463081919386e-06, "loss": 0.002, "step": 730 }, { "epoch": 9.932432432432432, "grad_norm": 0.09909189210286934, "learning_rate": 5.005616471474332e-06, "loss": 0.0017, "step": 735 }, { "epoch": 10.0, "grad_norm": 0.09536830165909363, "learning_rate": 5e-06, "loss": 0.0018, "step": 740 }, { "epoch": 10.0, "step": 740, "total_flos": 1440572837462016.0, "train_loss": 0.301153065278069, "train_runtime": 18722.0332, "train_samples_per_second": 5.053, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 740, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1440572837462016.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }