{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.09423752273112121,
  "eval_steps": 100000,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.000235593806827803,
      "grad_norm": 103.0,
      "learning_rate": 1e-05,
      "loss": 0.7800231,
      "memory(GiB)": 63.62,
      "step": 1,
      "train_speed(iter/s)": 0.015931
    },
    {
      "epoch": 0.001177969034139015,
      "grad_norm": 5.8125,
      "learning_rate": 9.99997807127629e-06,
      "loss": 0.41946995,
      "memory(GiB)": 75.24,
      "step": 5,
      "train_speed(iter/s)": 0.017972
    },
    {
      "epoch": 0.00235593806827803,
      "grad_norm": 2.703125,
      "learning_rate": 9.999888986165874e-06,
      "loss": 0.0869894,
      "memory(GiB)": 75.24,
      "step": 10,
      "train_speed(iter/s)": 0.018238
    },
    {
      "epoch": 0.003533907102417045,
      "grad_norm": 2.140625,
      "learning_rate": 9.99973137534353e-06,
      "loss": 0.06987351,
      "memory(GiB)": 75.24,
      "step": 15,
      "train_speed(iter/s)": 0.018317
    },
    {
      "epoch": 0.00471187613655606,
      "grad_norm": 2.515625,
      "learning_rate": 9.999505240969388e-06,
      "loss": 0.0606461,
      "memory(GiB)": 75.24,
      "step": 20,
      "train_speed(iter/s)": 0.01837
    },
    {
      "epoch": 0.005889845170695076,
      "grad_norm": 2.4375,
      "learning_rate": 9.999210586142718e-06,
      "loss": 0.06591458,
      "memory(GiB)": 75.24,
      "step": 25,
      "train_speed(iter/s)": 0.018407
    },
    {
      "epoch": 0.00706781420483409,
      "grad_norm": 2.8125,
      "learning_rate": 9.998847414901898e-06,
      "loss": 0.06059705,
      "memory(GiB)": 75.24,
      "step": 30,
      "train_speed(iter/s)": 0.018432
    },
    {
      "epoch": 0.008245783238973105,
      "grad_norm": 1.9921875,
      "learning_rate": 9.998415732224352e-06,
      "loss": 0.06047676,
      "memory(GiB)": 75.24,
      "step": 35,
      "train_speed(iter/s)": 0.018453
    },
    {
      "epoch": 0.00942375227311212,
      "grad_norm": 1.921875,
      "learning_rate": 9.997915544026483e-06,
      "loss": 0.06190881,
      "memory(GiB)": 75.24,
      "step": 40,
      "train_speed(iter/s)": 0.018469
    },
    {
      "epoch": 0.010601721307251136,
      "grad_norm": 1.859375,
      "learning_rate": 9.997346857163591e-06,
      "loss": 0.05765554,
      "memory(GiB)": 75.24,
      "step": 45,
      "train_speed(iter/s)": 0.018482
    },
    {
      "epoch": 0.011779690341390151,
      "grad_norm": 2.5625,
      "learning_rate": 9.99670967942979e-06,
      "loss": 0.0662235,
      "memory(GiB)": 75.24,
      "step": 50,
      "train_speed(iter/s)": 0.01849
    },
    {
      "epoch": 0.012957659375529167,
      "grad_norm": 2.390625,
      "learning_rate": 9.996004019557879e-06,
      "loss": 0.06362078,
      "memory(GiB)": 75.24,
      "step": 55,
      "train_speed(iter/s)": 0.0185
    },
    {
      "epoch": 0.01413562840966818,
      "grad_norm": 2.875,
      "learning_rate": 9.995229887219246e-06,
      "loss": 0.06171583,
      "memory(GiB)": 75.24,
      "step": 60,
      "train_speed(iter/s)": 0.018512
    },
    {
      "epoch": 0.015313597443807196,
      "grad_norm": 2.109375,
      "learning_rate": 9.99438729302372e-06,
      "loss": 0.06211852,
      "memory(GiB)": 75.24,
      "step": 65,
      "train_speed(iter/s)": 0.018519
    },
    {
      "epoch": 0.01649156647794621,
      "grad_norm": 1.828125,
      "learning_rate": 9.993476248519429e-06,
      "loss": 0.06484153,
      "memory(GiB)": 75.24,
      "step": 70,
      "train_speed(iter/s)": 0.01852
    },
    {
      "epoch": 0.017669535512085225,
      "grad_norm": 1.90625,
      "learning_rate": 9.992496766192645e-06,
      "loss": 0.06099743,
      "memory(GiB)": 75.24,
      "step": 75,
      "train_speed(iter/s)": 0.018526
    },
    {
      "epoch": 0.01884750454622424,
      "grad_norm": 1.796875,
      "learning_rate": 9.991448859467611e-06,
      "loss": 0.05843818,
      "memory(GiB)": 75.24,
      "step": 80,
      "train_speed(iter/s)": 0.018543
    },
    {
      "epoch": 0.020025473580363256,
      "grad_norm": 1.8203125,
      "learning_rate": 9.99033254270636e-06,
      "loss": 0.05953899,
      "memory(GiB)": 75.24,
      "step": 85,
      "train_speed(iter/s)": 0.018546
    },
    {
      "epoch": 0.02120344261450227,
      "grad_norm": 1.9609375,
      "learning_rate": 9.989147831208508e-06,
      "loss": 0.06501681,
      "memory(GiB)": 75.24,
      "step": 90,
      "train_speed(iter/s)": 0.018554
    },
    {
      "epoch": 0.022381411648641287,
      "grad_norm": 2.609375,
      "learning_rate": 9.987894741211056e-06,
      "loss": 0.06521546,
      "memory(GiB)": 75.24,
      "step": 95,
      "train_speed(iter/s)": 0.01856
    },
    {
      "epoch": 0.023559380682780302,
      "grad_norm": 2.046875,
      "learning_rate": 9.986573289888164e-06,
      "loss": 0.06153967,
      "memory(GiB)": 75.24,
      "step": 100,
      "train_speed(iter/s)": 0.018562
    },
    {
      "epoch": 0.024737349716919318,
      "grad_norm": 2.109375,
      "learning_rate": 9.98518349535091e-06,
      "loss": 0.07089446,
      "memory(GiB)": 75.24,
      "step": 105,
      "train_speed(iter/s)": 0.018452
    },
    {
      "epoch": 0.025915318751058333,
      "grad_norm": 1.7578125,
      "learning_rate": 9.98372537664705e-06,
      "loss": 0.05478874,
      "memory(GiB)": 75.24,
      "step": 110,
      "train_speed(iter/s)": 0.018463
    },
    {
      "epoch": 0.027093287785197345,
      "grad_norm": 2.9375,
      "learning_rate": 9.982198953760752e-06,
      "loss": 0.06532571,
      "memory(GiB)": 75.24,
      "step": 115,
      "train_speed(iter/s)": 0.018473
    },
    {
      "epoch": 0.02827125681933636,
      "grad_norm": 2.234375,
      "learning_rate": 9.980604247612325e-06,
      "loss": 0.06488043,
      "memory(GiB)": 75.24,
      "step": 120,
      "train_speed(iter/s)": 0.018478
    },
    {
      "epoch": 0.029449225853475376,
      "grad_norm": 2.28125,
      "learning_rate": 9.978941280057928e-06,
      "loss": 0.06263313,
      "memory(GiB)": 75.24,
      "step": 125,
      "train_speed(iter/s)": 0.018482
    },
    {
      "epoch": 0.03062719488761439,
      "grad_norm": 2.21875,
      "learning_rate": 9.977210073889273e-06,
      "loss": 0.0654664,
      "memory(GiB)": 75.24,
      "step": 130,
      "train_speed(iter/s)": 0.018487
    },
    {
      "epoch": 0.03180516392175341,
      "grad_norm": 2.171875,
      "learning_rate": 9.975410652833316e-06,
      "loss": 0.06672717,
      "memory(GiB)": 75.24,
      "step": 135,
      "train_speed(iter/s)": 0.018489
    },
    {
      "epoch": 0.03298313295589242,
      "grad_norm": 2.875,
      "learning_rate": 9.973543041551924e-06,
      "loss": 0.06413687,
      "memory(GiB)": 75.24,
      "step": 140,
      "train_speed(iter/s)": 0.01849
    },
    {
      "epoch": 0.03416110199003144,
      "grad_norm": 1.9453125,
      "learning_rate": 9.971607265641547e-06,
      "loss": 0.0582508,
      "memory(GiB)": 75.24,
      "step": 145,
      "train_speed(iter/s)": 0.018495
    },
    {
      "epoch": 0.03533907102417045,
      "grad_norm": 1.9375,
      "learning_rate": 9.969603351632855e-06,
      "loss": 0.06022533,
      "memory(GiB)": 75.24,
      "step": 150,
      "train_speed(iter/s)": 0.0185
    },
    {
      "epoch": 0.03651704005830947,
      "grad_norm": 2.109375,
      "learning_rate": 9.967531326990387e-06,
      "loss": 0.06132371,
      "memory(GiB)": 75.24,
      "step": 155,
      "train_speed(iter/s)": 0.018504
    },
    {
      "epoch": 0.03769500909244848,
      "grad_norm": 2.078125,
      "learning_rate": 9.965391220112165e-06,
      "loss": 0.07101279,
      "memory(GiB)": 75.24,
      "step": 160,
      "train_speed(iter/s)": 0.018506
    },
    {
      "epoch": 0.0388729781265875,
      "grad_norm": 2.140625,
      "learning_rate": 9.96318306032931e-06,
      "loss": 0.0588982,
      "memory(GiB)": 75.24,
      "step": 165,
      "train_speed(iter/s)": 0.018505
    },
    {
      "epoch": 0.04005094716072651,
      "grad_norm": 2.125,
      "learning_rate": 9.96090687790564e-06,
      "loss": 0.06118761,
      "memory(GiB)": 75.24,
      "step": 170,
      "train_speed(iter/s)": 0.018511
    },
    {
      "epoch": 0.04122891619486553,
      "grad_norm": 1.8671875,
      "learning_rate": 9.95856270403725e-06,
      "loss": 0.06012461,
      "memory(GiB)": 75.24,
      "step": 175,
      "train_speed(iter/s)": 0.018517
    },
    {
      "epoch": 0.04240688522900454,
      "grad_norm": 2.234375,
      "learning_rate": 9.956150570852088e-06,
      "loss": 0.0591939,
      "memory(GiB)": 75.24,
      "step": 180,
      "train_speed(iter/s)": 0.01852
    },
    {
      "epoch": 0.043584854263143555,
      "grad_norm": 2.234375,
      "learning_rate": 9.95367051140952e-06,
      "loss": 0.06429687,
      "memory(GiB)": 75.24,
      "step": 185,
      "train_speed(iter/s)": 0.018524
    },
    {
      "epoch": 0.044762823297282574,
      "grad_norm": 1.59375,
      "learning_rate": 9.951122559699868e-06,
      "loss": 0.05647093,
      "memory(GiB)": 75.24,
      "step": 190,
      "train_speed(iter/s)": 0.018525
    },
    {
      "epoch": 0.045940792331421586,
      "grad_norm": 1.9140625,
      "learning_rate": 9.948506750643946e-06,
      "loss": 0.05816346,
      "memory(GiB)": 75.24,
      "step": 195,
      "train_speed(iter/s)": 0.018525
    },
    {
      "epoch": 0.047118761365560605,
      "grad_norm": 2.546875,
      "learning_rate": 9.94582312009259e-06,
      "loss": 0.05947306,
      "memory(GiB)": 75.24,
      "step": 200,
      "train_speed(iter/s)": 0.018527
    },
    {
      "epoch": 0.04829673039969962,
      "grad_norm": 2.359375,
      "learning_rate": 9.943071704826153e-06,
      "loss": 0.06321282,
      "memory(GiB)": 75.24,
      "step": 205,
      "train_speed(iter/s)": 0.018454
    },
    {
      "epoch": 0.049474699433838636,
      "grad_norm": 2.203125,
      "learning_rate": 9.940252542554007e-06,
      "loss": 0.06456767,
      "memory(GiB)": 75.24,
      "step": 210,
      "train_speed(iter/s)": 0.018455
    },
    {
      "epoch": 0.05065266846797765,
      "grad_norm": 2.15625,
      "learning_rate": 9.937365671914037e-06,
      "loss": 0.06057892,
      "memory(GiB)": 75.24,
      "step": 215,
      "train_speed(iter/s)": 0.018456
    },
    {
      "epoch": 0.05183063750211667,
      "grad_norm": 2.0,
      "learning_rate": 9.934411132472088e-06,
      "loss": 0.05920454,
      "memory(GiB)": 75.24,
      "step": 220,
      "train_speed(iter/s)": 0.018458
    },
    {
      "epoch": 0.05300860653625568,
      "grad_norm": 2.015625,
      "learning_rate": 9.931388964721446e-06,
      "loss": 0.05975649,
      "memory(GiB)": 75.24,
      "step": 225,
      "train_speed(iter/s)": 0.018461
    },
    {
      "epoch": 0.05418657557039469,
      "grad_norm": 2.0,
      "learning_rate": 9.92829921008227e-06,
      "loss": 0.06393375,
      "memory(GiB)": 75.24,
      "step": 230,
      "train_speed(iter/s)": 0.018462
    },
    {
      "epoch": 0.05536454460453371,
      "grad_norm": 2.28125,
      "learning_rate": 9.925141910901029e-06,
      "loss": 0.06334119,
      "memory(GiB)": 75.24,
      "step": 235,
      "train_speed(iter/s)": 0.018466
    },
    {
      "epoch": 0.05654251363867272,
      "grad_norm": 2.09375,
      "learning_rate": 9.921917110449914e-06,
      "loss": 0.06911048,
      "memory(GiB)": 75.24,
      "step": 240,
      "train_speed(iter/s)": 0.018468
    },
    {
      "epoch": 0.05772048267281174,
      "grad_norm": 1.984375,
      "learning_rate": 9.918624852926258e-06,
      "loss": 0.05916922,
      "memory(GiB)": 75.24,
      "step": 245,
      "train_speed(iter/s)": 0.01847
    },
    {
      "epoch": 0.05889845170695075,
      "grad_norm": 1.859375,
      "learning_rate": 9.915265183451923e-06,
      "loss": 0.06251335,
      "memory(GiB)": 75.24,
      "step": 250,
      "train_speed(iter/s)": 0.018471
    },
    {
      "epoch": 0.06007642074108977,
      "grad_norm": 1.8515625,
      "learning_rate": 9.911838148072678e-06,
      "loss": 0.06203491,
      "memory(GiB)": 75.24,
      "step": 255,
      "train_speed(iter/s)": 0.018477
    },
    {
      "epoch": 0.06125438977522878,
      "grad_norm": 2.265625,
      "learning_rate": 9.908343793757574e-06,
      "loss": 0.06085759,
      "memory(GiB)": 75.24,
      "step": 260,
      "train_speed(iter/s)": 0.01848
    },
    {
      "epoch": 0.062432358809367795,
      "grad_norm": 2.375,
      "learning_rate": 9.904782168398296e-06,
      "loss": 0.06250409,
      "memory(GiB)": 75.24,
      "step": 265,
      "train_speed(iter/s)": 0.018484
    },
    {
      "epoch": 0.06361032784350681,
      "grad_norm": 1.9609375,
      "learning_rate": 9.901153320808514e-06,
      "loss": 0.05536562,
      "memory(GiB)": 75.24,
      "step": 270,
      "train_speed(iter/s)": 0.018489
    },
    {
      "epoch": 0.06478829687764583,
      "grad_norm": 1.8359375,
      "learning_rate": 9.897457300723202e-06,
      "loss": 0.05569639,
      "memory(GiB)": 75.24,
      "step": 275,
      "train_speed(iter/s)": 0.018491
    },
    {
      "epoch": 0.06596626591178484,
      "grad_norm": 2.40625,
      "learning_rate": 9.893694158797968e-06,
      "loss": 0.05840618,
      "memory(GiB)": 75.24,
      "step": 280,
      "train_speed(iter/s)": 0.018494
    },
    {
      "epoch": 0.06714423494592386,
      "grad_norm": 2.265625,
      "learning_rate": 9.889863946608352e-06,
      "loss": 0.05661937,
      "memory(GiB)": 75.24,
      "step": 285,
      "train_speed(iter/s)": 0.018496
    },
    {
      "epoch": 0.06832220398006288,
      "grad_norm": 2.140625,
      "learning_rate": 9.885966716649125e-06,
      "loss": 0.06150655,
      "memory(GiB)": 75.24,
      "step": 290,
      "train_speed(iter/s)": 0.018497
    },
    {
      "epoch": 0.06950017301420189,
      "grad_norm": 2.09375,
      "learning_rate": 9.88200252233356e-06,
      "loss": 0.06209329,
      "memory(GiB)": 75.24,
      "step": 295,
      "train_speed(iter/s)": 0.018497
    },
    {
      "epoch": 0.0706781420483409,
      "grad_norm": 3.375,
      "learning_rate": 9.877971417992716e-06,
      "loss": 0.05904433,
      "memory(GiB)": 75.24,
      "step": 300,
      "train_speed(iter/s)": 0.018499
    },
    {
      "epoch": 0.07185611108247993,
      "grad_norm": 1.796875,
      "learning_rate": 9.873873458874676e-06,
      "loss": 0.05126434,
      "memory(GiB)": 75.24,
      "step": 305,
      "train_speed(iter/s)": 0.018458
    },
    {
      "epoch": 0.07303408011661894,
      "grad_norm": 2.0,
      "learning_rate": 9.8697087011438e-06,
      "loss": 0.05796698,
      "memory(GiB)": 75.24,
      "step": 310,
      "train_speed(iter/s)": 0.018459
    },
    {
      "epoch": 0.07421204915075795,
      "grad_norm": 1.875,
      "learning_rate": 9.865477201879953e-06,
      "loss": 0.05630487,
      "memory(GiB)": 75.24,
      "step": 315,
      "train_speed(iter/s)": 0.01846
    },
    {
      "epoch": 0.07539001818489696,
      "grad_norm": 2.515625,
      "learning_rate": 9.861179019077725e-06,
      "loss": 0.0567848,
      "memory(GiB)": 75.24,
      "step": 320,
      "train_speed(iter/s)": 0.018461
    },
    {
      "epoch": 0.07656798721903597,
      "grad_norm": 2.109375,
      "learning_rate": 9.856814211645627e-06,
      "loss": 0.05985626,
      "memory(GiB)": 75.24,
      "step": 325,
      "train_speed(iter/s)": 0.018463
    },
    {
      "epoch": 0.077745956253175,
      "grad_norm": 2.09375,
      "learning_rate": 9.852382839405298e-06,
      "loss": 0.05782009,
      "memory(GiB)": 75.24,
      "step": 330,
      "train_speed(iter/s)": 0.018466
    },
    {
      "epoch": 0.07892392528731401,
      "grad_norm": 2.28125,
      "learning_rate": 9.847884963090675e-06,
      "loss": 0.06585214,
      "memory(GiB)": 75.24,
      "step": 335,
      "train_speed(iter/s)": 0.018468
    },
    {
      "epoch": 0.08010189432145302,
      "grad_norm": 2.234375,
      "learning_rate": 9.843320644347156e-06,
      "loss": 0.06263242,
      "memory(GiB)": 75.24,
      "step": 340,
      "train_speed(iter/s)": 0.01847
    },
    {
      "epoch": 0.08127986335559204,
      "grad_norm": 2.203125,
      "learning_rate": 9.838689945730776e-06,
      "loss": 0.05163463,
      "memory(GiB)": 75.24,
      "step": 345,
      "train_speed(iter/s)": 0.018472
    },
    {
      "epoch": 0.08245783238973106,
      "grad_norm": 2.015625,
      "learning_rate": 9.833992930707321e-06,
      "loss": 0.05960041,
      "memory(GiB)": 75.24,
      "step": 350,
      "train_speed(iter/s)": 0.018475
    },
    {
      "epoch": 0.08363580142387007,
      "grad_norm": 2.5,
      "learning_rate": 9.829229663651483e-06,
      "loss": 0.05999585,
      "memory(GiB)": 75.24,
      "step": 355,
      "train_speed(iter/s)": 0.018477
    },
    {
      "epoch": 0.08481377045800909,
      "grad_norm": 1.671875,
      "learning_rate": 9.824400209845967e-06,
      "loss": 0.05059795,
      "memory(GiB)": 75.24,
      "step": 360,
      "train_speed(iter/s)": 0.018479
    },
    {
      "epoch": 0.0859917394921481,
      "grad_norm": 2.171875,
      "learning_rate": 9.81950463548059e-06,
      "loss": 0.05671123,
      "memory(GiB)": 75.24,
      "step": 365,
      "train_speed(iter/s)": 0.018481
    },
    {
      "epoch": 0.08716970852628711,
      "grad_norm": 2.625,
      "learning_rate": 9.814543007651389e-06,
      "loss": 0.05803382,
      "memory(GiB)": 75.24,
      "step": 370,
      "train_speed(iter/s)": 0.018483
    },
    {
      "epoch": 0.08834767756042614,
      "grad_norm": 1.890625,
      "learning_rate": 9.80951539435969e-06,
      "loss": 0.05704566,
      "memory(GiB)": 75.24,
      "step": 375,
      "train_speed(iter/s)": 0.018485
    },
    {
      "epoch": 0.08952564659456515,
      "grad_norm": 2.03125,
      "learning_rate": 9.804421864511175e-06,
      "loss": 0.05998203,
      "memory(GiB)": 75.24,
      "step": 380,
      "train_speed(iter/s)": 0.018487
    },
    {
      "epoch": 0.09070361562870416,
      "grad_norm": 2.53125,
      "learning_rate": 9.79926248791495e-06,
      "loss": 0.06044774,
      "memory(GiB)": 75.24,
      "step": 385,
      "train_speed(iter/s)": 0.018488
    },
    {
      "epoch": 0.09188158466284317,
      "grad_norm": 2.1875,
      "learning_rate": 9.794037335282572e-06,
      "loss": 0.06596763,
      "memory(GiB)": 75.24,
      "step": 390,
      "train_speed(iter/s)": 0.018489
    },
    {
      "epoch": 0.0930595536969822,
      "grad_norm": 2.171875,
      "learning_rate": 9.788746478227097e-06,
      "loss": 0.06313769,
      "memory(GiB)": 75.24,
      "step": 395,
      "train_speed(iter/s)": 0.018489
    },
    {
      "epoch": 0.09423752273112121,
      "grad_norm": 1.9296875,
      "learning_rate": 9.783389989262078e-06,
      "loss": 0.05841722,
      "memory(GiB)": 75.24,
      "step": 400,
      "train_speed(iter/s)": 0.018489
    }
  ],
  "logging_steps": 5,
  "max_steps": 4244,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.4341415068565504e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}