{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.09423752273112121, "eval_steps": 100000, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000235593806827803, "grad_norm": 103.0, "learning_rate": 1e-05, "loss": 0.7800231, "memory(GiB)": 63.62, "step": 1, "train_speed(iter/s)": 0.015931 }, { "epoch": 0.001177969034139015, "grad_norm": 5.8125, "learning_rate": 9.99997807127629e-06, "loss": 0.41946995, "memory(GiB)": 75.24, "step": 5, "train_speed(iter/s)": 0.017972 }, { "epoch": 0.00235593806827803, "grad_norm": 2.703125, "learning_rate": 9.999888986165874e-06, "loss": 0.0869894, "memory(GiB)": 75.24, "step": 10, "train_speed(iter/s)": 0.018238 }, { "epoch": 0.003533907102417045, "grad_norm": 2.140625, "learning_rate": 9.99973137534353e-06, "loss": 0.06987351, "memory(GiB)": 75.24, "step": 15, "train_speed(iter/s)": 0.018317 }, { "epoch": 0.00471187613655606, "grad_norm": 2.515625, "learning_rate": 9.999505240969388e-06, "loss": 0.0606461, "memory(GiB)": 75.24, "step": 20, "train_speed(iter/s)": 0.01837 }, { "epoch": 0.005889845170695076, "grad_norm": 2.4375, "learning_rate": 9.999210586142718e-06, "loss": 0.06591458, "memory(GiB)": 75.24, "step": 25, "train_speed(iter/s)": 0.018407 }, { "epoch": 0.00706781420483409, "grad_norm": 2.8125, "learning_rate": 9.998847414901898e-06, "loss": 0.06059705, "memory(GiB)": 75.24, "step": 30, "train_speed(iter/s)": 0.018432 }, { "epoch": 0.008245783238973105, "grad_norm": 1.9921875, "learning_rate": 9.998415732224352e-06, "loss": 0.06047676, "memory(GiB)": 75.24, "step": 35, "train_speed(iter/s)": 0.018453 }, { "epoch": 0.00942375227311212, "grad_norm": 1.921875, "learning_rate": 9.997915544026483e-06, "loss": 0.06190881, "memory(GiB)": 75.24, "step": 40, "train_speed(iter/s)": 0.018469 }, { "epoch": 0.010601721307251136, "grad_norm": 1.859375, "learning_rate": 9.997346857163591e-06, "loss": 0.05765554, "memory(GiB)": 75.24, "step": 45, "train_speed(iter/s)": 0.018482 }, { "epoch": 0.011779690341390151, "grad_norm": 2.5625, "learning_rate": 9.99670967942979e-06, "loss": 0.0662235, "memory(GiB)": 75.24, "step": 50, "train_speed(iter/s)": 0.01849 }, { "epoch": 0.012957659375529167, "grad_norm": 2.390625, "learning_rate": 9.996004019557879e-06, "loss": 0.06362078, "memory(GiB)": 75.24, "step": 55, "train_speed(iter/s)": 0.0185 }, { "epoch": 0.01413562840966818, "grad_norm": 2.875, "learning_rate": 9.995229887219246e-06, "loss": 0.06171583, "memory(GiB)": 75.24, "step": 60, "train_speed(iter/s)": 0.018512 }, { "epoch": 0.015313597443807196, "grad_norm": 2.109375, "learning_rate": 9.99438729302372e-06, "loss": 0.06211852, "memory(GiB)": 75.24, "step": 65, "train_speed(iter/s)": 0.018519 }, { "epoch": 0.01649156647794621, "grad_norm": 1.828125, "learning_rate": 9.993476248519429e-06, "loss": 0.06484153, "memory(GiB)": 75.24, "step": 70, "train_speed(iter/s)": 0.01852 }, { "epoch": 0.017669535512085225, "grad_norm": 1.90625, "learning_rate": 9.992496766192645e-06, "loss": 0.06099743, "memory(GiB)": 75.24, "step": 75, "train_speed(iter/s)": 0.018526 }, { "epoch": 0.01884750454622424, "grad_norm": 1.796875, "learning_rate": 9.991448859467611e-06, "loss": 0.05843818, "memory(GiB)": 75.24, "step": 80, "train_speed(iter/s)": 0.018543 }, { "epoch": 0.020025473580363256, "grad_norm": 1.8203125, "learning_rate": 9.99033254270636e-06, "loss": 0.05953899, "memory(GiB)": 75.24, "step": 85, "train_speed(iter/s)": 0.018546 }, { "epoch": 0.02120344261450227, "grad_norm": 1.9609375, "learning_rate": 9.989147831208508e-06, "loss": 0.06501681, "memory(GiB)": 75.24, "step": 90, "train_speed(iter/s)": 0.018554 }, { "epoch": 0.022381411648641287, "grad_norm": 2.609375, "learning_rate": 9.987894741211056e-06, "loss": 0.06521546, "memory(GiB)": 75.24, "step": 95, "train_speed(iter/s)": 0.01856 }, { "epoch": 0.023559380682780302, "grad_norm": 2.046875, "learning_rate": 9.986573289888164e-06, "loss": 0.06153967, "memory(GiB)": 75.24, "step": 100, "train_speed(iter/s)": 0.018562 }, { "epoch": 0.024737349716919318, "grad_norm": 2.109375, "learning_rate": 9.98518349535091e-06, "loss": 0.07089446, "memory(GiB)": 75.24, "step": 105, "train_speed(iter/s)": 0.018452 }, { "epoch": 0.025915318751058333, "grad_norm": 1.7578125, "learning_rate": 9.98372537664705e-06, "loss": 0.05478874, "memory(GiB)": 75.24, "step": 110, "train_speed(iter/s)": 0.018463 }, { "epoch": 0.027093287785197345, "grad_norm": 2.9375, "learning_rate": 9.982198953760752e-06, "loss": 0.06532571, "memory(GiB)": 75.24, "step": 115, "train_speed(iter/s)": 0.018473 }, { "epoch": 0.02827125681933636, "grad_norm": 2.234375, "learning_rate": 9.980604247612325e-06, "loss": 0.06488043, "memory(GiB)": 75.24, "step": 120, "train_speed(iter/s)": 0.018478 }, { "epoch": 0.029449225853475376, "grad_norm": 2.28125, "learning_rate": 9.978941280057928e-06, "loss": 0.06263313, "memory(GiB)": 75.24, "step": 125, "train_speed(iter/s)": 0.018482 }, { "epoch": 0.03062719488761439, "grad_norm": 2.21875, "learning_rate": 9.977210073889273e-06, "loss": 0.0654664, "memory(GiB)": 75.24, "step": 130, "train_speed(iter/s)": 0.018487 }, { "epoch": 0.03180516392175341, "grad_norm": 2.171875, "learning_rate": 9.975410652833316e-06, "loss": 0.06672717, "memory(GiB)": 75.24, "step": 135, "train_speed(iter/s)": 0.018489 }, { "epoch": 0.03298313295589242, "grad_norm": 2.875, "learning_rate": 9.973543041551924e-06, "loss": 0.06413687, "memory(GiB)": 75.24, "step": 140, "train_speed(iter/s)": 0.01849 }, { "epoch": 0.03416110199003144, "grad_norm": 1.9453125, "learning_rate": 9.971607265641547e-06, "loss": 0.0582508, "memory(GiB)": 75.24, "step": 145, "train_speed(iter/s)": 0.018495 }, { "epoch": 0.03533907102417045, "grad_norm": 1.9375, "learning_rate": 9.969603351632855e-06, "loss": 0.06022533, "memory(GiB)": 75.24, "step": 150, "train_speed(iter/s)": 0.0185 }, { "epoch": 0.03651704005830947, "grad_norm": 2.109375, "learning_rate": 9.967531326990387e-06, "loss": 0.06132371, "memory(GiB)": 75.24, "step": 155, "train_speed(iter/s)": 0.018504 }, { "epoch": 0.03769500909244848, "grad_norm": 2.078125, "learning_rate": 9.965391220112165e-06, "loss": 0.07101279, "memory(GiB)": 75.24, "step": 160, "train_speed(iter/s)": 0.018506 }, { "epoch": 0.0388729781265875, "grad_norm": 2.140625, "learning_rate": 9.96318306032931e-06, "loss": 0.0588982, "memory(GiB)": 75.24, "step": 165, "train_speed(iter/s)": 0.018505 }, { "epoch": 0.04005094716072651, "grad_norm": 2.125, "learning_rate": 9.96090687790564e-06, "loss": 0.06118761, "memory(GiB)": 75.24, "step": 170, "train_speed(iter/s)": 0.018511 }, { "epoch": 0.04122891619486553, "grad_norm": 1.8671875, "learning_rate": 9.95856270403725e-06, "loss": 0.06012461, "memory(GiB)": 75.24, "step": 175, "train_speed(iter/s)": 0.018517 }, { "epoch": 0.04240688522900454, "grad_norm": 2.234375, "learning_rate": 9.956150570852088e-06, "loss": 0.0591939, "memory(GiB)": 75.24, "step": 180, "train_speed(iter/s)": 0.01852 }, { "epoch": 0.043584854263143555, "grad_norm": 2.234375, "learning_rate": 9.95367051140952e-06, "loss": 0.06429687, "memory(GiB)": 75.24, "step": 185, "train_speed(iter/s)": 0.018524 }, { "epoch": 0.044762823297282574, "grad_norm": 1.59375, "learning_rate": 9.951122559699868e-06, "loss": 0.05647093, "memory(GiB)": 75.24, "step": 190, "train_speed(iter/s)": 0.018525 }, { "epoch": 0.045940792331421586, "grad_norm": 1.9140625, "learning_rate": 9.948506750643946e-06, "loss": 0.05816346, "memory(GiB)": 75.24, "step": 195, "train_speed(iter/s)": 0.018525 }, { "epoch": 0.047118761365560605, "grad_norm": 2.546875, "learning_rate": 9.94582312009259e-06, "loss": 0.05947306, "memory(GiB)": 75.24, "step": 200, "train_speed(iter/s)": 0.018527 }, { "epoch": 0.04829673039969962, "grad_norm": 2.359375, "learning_rate": 9.943071704826153e-06, "loss": 0.06321282, "memory(GiB)": 75.24, "step": 205, "train_speed(iter/s)": 0.018454 }, { "epoch": 0.049474699433838636, "grad_norm": 2.203125, "learning_rate": 9.940252542554007e-06, "loss": 0.06456767, "memory(GiB)": 75.24, "step": 210, "train_speed(iter/s)": 0.018455 }, { "epoch": 0.05065266846797765, "grad_norm": 2.15625, "learning_rate": 9.937365671914037e-06, "loss": 0.06057892, "memory(GiB)": 75.24, "step": 215, "train_speed(iter/s)": 0.018456 }, { "epoch": 0.05183063750211667, "grad_norm": 2.0, "learning_rate": 9.934411132472088e-06, "loss": 0.05920454, "memory(GiB)": 75.24, "step": 220, "train_speed(iter/s)": 0.018458 }, { "epoch": 0.05300860653625568, "grad_norm": 2.015625, "learning_rate": 9.931388964721446e-06, "loss": 0.05975649, "memory(GiB)": 75.24, "step": 225, "train_speed(iter/s)": 0.018461 }, { "epoch": 0.05418657557039469, "grad_norm": 2.0, "learning_rate": 9.92829921008227e-06, "loss": 0.06393375, "memory(GiB)": 75.24, "step": 230, "train_speed(iter/s)": 0.018462 }, { "epoch": 0.05536454460453371, "grad_norm": 2.28125, "learning_rate": 9.925141910901029e-06, "loss": 0.06334119, "memory(GiB)": 75.24, "step": 235, "train_speed(iter/s)": 0.018466 }, { "epoch": 0.05654251363867272, "grad_norm": 2.09375, "learning_rate": 9.921917110449914e-06, "loss": 0.06911048, "memory(GiB)": 75.24, "step": 240, "train_speed(iter/s)": 0.018468 }, { "epoch": 0.05772048267281174, "grad_norm": 1.984375, "learning_rate": 9.918624852926258e-06, "loss": 0.05916922, "memory(GiB)": 75.24, "step": 245, "train_speed(iter/s)": 0.01847 }, { "epoch": 0.05889845170695075, "grad_norm": 1.859375, "learning_rate": 9.915265183451923e-06, "loss": 0.06251335, "memory(GiB)": 75.24, "step": 250, "train_speed(iter/s)": 0.018471 }, { "epoch": 0.06007642074108977, "grad_norm": 1.8515625, "learning_rate": 9.911838148072678e-06, "loss": 0.06203491, "memory(GiB)": 75.24, "step": 255, "train_speed(iter/s)": 0.018477 }, { "epoch": 0.06125438977522878, "grad_norm": 2.265625, "learning_rate": 9.908343793757574e-06, "loss": 0.06085759, "memory(GiB)": 75.24, "step": 260, "train_speed(iter/s)": 0.01848 }, { "epoch": 0.062432358809367795, "grad_norm": 2.375, "learning_rate": 9.904782168398296e-06, "loss": 0.06250409, "memory(GiB)": 75.24, "step": 265, "train_speed(iter/s)": 0.018484 }, { "epoch": 0.06361032784350681, "grad_norm": 1.9609375, "learning_rate": 9.901153320808514e-06, "loss": 0.05536562, "memory(GiB)": 75.24, "step": 270, "train_speed(iter/s)": 0.018489 }, { "epoch": 0.06478829687764583, "grad_norm": 1.8359375, "learning_rate": 9.897457300723202e-06, "loss": 0.05569639, "memory(GiB)": 75.24, "step": 275, "train_speed(iter/s)": 0.018491 }, { "epoch": 0.06596626591178484, "grad_norm": 2.40625, "learning_rate": 9.893694158797968e-06, "loss": 0.05840618, "memory(GiB)": 75.24, "step": 280, "train_speed(iter/s)": 0.018494 }, { "epoch": 0.06714423494592386, "grad_norm": 2.265625, "learning_rate": 9.889863946608352e-06, "loss": 0.05661937, "memory(GiB)": 75.24, "step": 285, "train_speed(iter/s)": 0.018496 }, { "epoch": 0.06832220398006288, "grad_norm": 2.140625, "learning_rate": 9.885966716649125e-06, "loss": 0.06150655, "memory(GiB)": 75.24, "step": 290, "train_speed(iter/s)": 0.018497 }, { "epoch": 0.06950017301420189, "grad_norm": 2.09375, "learning_rate": 9.88200252233356e-06, "loss": 0.06209329, "memory(GiB)": 75.24, "step": 295, "train_speed(iter/s)": 0.018497 }, { "epoch": 0.0706781420483409, "grad_norm": 3.375, "learning_rate": 9.877971417992716e-06, "loss": 0.05904433, "memory(GiB)": 75.24, "step": 300, "train_speed(iter/s)": 0.018499 }, { "epoch": 0.07185611108247993, "grad_norm": 1.796875, "learning_rate": 9.873873458874676e-06, "loss": 0.05126434, "memory(GiB)": 75.24, "step": 305, "train_speed(iter/s)": 0.018458 }, { "epoch": 0.07303408011661894, "grad_norm": 2.0, "learning_rate": 9.8697087011438e-06, "loss": 0.05796698, "memory(GiB)": 75.24, "step": 310, "train_speed(iter/s)": 0.018459 }, { "epoch": 0.07421204915075795, "grad_norm": 1.875, "learning_rate": 9.865477201879953e-06, "loss": 0.05630487, "memory(GiB)": 75.24, "step": 315, "train_speed(iter/s)": 0.01846 }, { "epoch": 0.07539001818489696, "grad_norm": 2.515625, "learning_rate": 9.861179019077725e-06, "loss": 0.0567848, "memory(GiB)": 75.24, "step": 320, "train_speed(iter/s)": 0.018461 }, { "epoch": 0.07656798721903597, "grad_norm": 2.109375, "learning_rate": 9.856814211645627e-06, "loss": 0.05985626, "memory(GiB)": 75.24, "step": 325, "train_speed(iter/s)": 0.018463 }, { "epoch": 0.077745956253175, "grad_norm": 2.09375, "learning_rate": 9.852382839405298e-06, "loss": 0.05782009, "memory(GiB)": 75.24, "step": 330, "train_speed(iter/s)": 0.018466 }, { "epoch": 0.07892392528731401, "grad_norm": 2.28125, "learning_rate": 9.847884963090675e-06, "loss": 0.06585214, "memory(GiB)": 75.24, "step": 335, "train_speed(iter/s)": 0.018468 }, { "epoch": 0.08010189432145302, "grad_norm": 2.234375, "learning_rate": 9.843320644347156e-06, "loss": 0.06263242, "memory(GiB)": 75.24, "step": 340, "train_speed(iter/s)": 0.01847 }, { "epoch": 0.08127986335559204, "grad_norm": 2.203125, "learning_rate": 9.838689945730776e-06, "loss": 0.05163463, "memory(GiB)": 75.24, "step": 345, "train_speed(iter/s)": 0.018472 }, { "epoch": 0.08245783238973106, "grad_norm": 2.015625, "learning_rate": 9.833992930707321e-06, "loss": 0.05960041, "memory(GiB)": 75.24, "step": 350, "train_speed(iter/s)": 0.018475 }, { "epoch": 0.08363580142387007, "grad_norm": 2.5, "learning_rate": 9.829229663651483e-06, "loss": 0.05999585, "memory(GiB)": 75.24, "step": 355, "train_speed(iter/s)": 0.018477 }, { "epoch": 0.08481377045800909, "grad_norm": 1.671875, "learning_rate": 9.824400209845967e-06, "loss": 0.05059795, "memory(GiB)": 75.24, "step": 360, "train_speed(iter/s)": 0.018479 }, { "epoch": 0.0859917394921481, "grad_norm": 2.171875, "learning_rate": 9.81950463548059e-06, "loss": 0.05671123, "memory(GiB)": 75.24, "step": 365, "train_speed(iter/s)": 0.018481 }, { "epoch": 0.08716970852628711, "grad_norm": 2.625, "learning_rate": 9.814543007651389e-06, "loss": 0.05803382, "memory(GiB)": 75.24, "step": 370, "train_speed(iter/s)": 0.018483 }, { "epoch": 0.08834767756042614, "grad_norm": 1.890625, "learning_rate": 9.80951539435969e-06, "loss": 0.05704566, "memory(GiB)": 75.24, "step": 375, "train_speed(iter/s)": 0.018485 }, { "epoch": 0.08952564659456515, "grad_norm": 2.03125, "learning_rate": 9.804421864511175e-06, "loss": 0.05998203, "memory(GiB)": 75.24, "step": 380, "train_speed(iter/s)": 0.018487 }, { "epoch": 0.09070361562870416, "grad_norm": 2.53125, "learning_rate": 9.79926248791495e-06, "loss": 0.06044774, "memory(GiB)": 75.24, "step": 385, "train_speed(iter/s)": 0.018488 }, { "epoch": 0.09188158466284317, "grad_norm": 2.1875, "learning_rate": 9.794037335282572e-06, "loss": 0.06596763, "memory(GiB)": 75.24, "step": 390, "train_speed(iter/s)": 0.018489 }, { "epoch": 0.0930595536969822, "grad_norm": 2.171875, "learning_rate": 9.788746478227097e-06, "loss": 0.06313769, "memory(GiB)": 75.24, "step": 395, "train_speed(iter/s)": 0.018489 }, { "epoch": 0.09423752273112121, "grad_norm": 1.9296875, "learning_rate": 9.783389989262078e-06, "loss": 0.05841722, "memory(GiB)": 75.24, "step": 400, "train_speed(iter/s)": 0.018489 } ], "logging_steps": 5, "max_steps": 4244, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.4341415068565504e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }