diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,3265 +1,565 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.42406885229004543, + "epoch": 0.2827125681933636, "eval_steps": 100000, - "global_step": 1800, + "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.000235593806827803, - "grad_norm": 103.0, + "epoch": 0.000942375227311212, + "grad_norm": 14.25, "learning_rate": 1e-05, - "loss": 0.7800231, - "memory(GiB)": 63.62, + "loss": 0.46586317, + "memory(GiB)": 64.76, "step": 1, - "train_speed(iter/s)": 0.015931 + "train_speed(iter/s)": 0.003324 }, { - "epoch": 0.001177969034139015, - "grad_norm": 5.8125, - "learning_rate": 9.99997807127629e-06, - "loss": 0.41946995, - "memory(GiB)": 75.24, + "epoch": 0.00471187613655606, + "grad_norm": 2.46875, + "learning_rate": 9.999648647603774e-06, + "loss": 0.26192743, + "memory(GiB)": 75.3, "step": 5, - "train_speed(iter/s)": 0.017972 + "train_speed(iter/s)": 0.003362 }, { - "epoch": 0.00235593806827803, - "grad_norm": 2.703125, - "learning_rate": 9.999888986165874e-06, - "loss": 0.0869894, - "memory(GiB)": 75.24, + "epoch": 0.00942375227311212, + "grad_norm": 1.15625, + "learning_rate": 9.998221363123425e-06, + "loss": 0.10271888, + "memory(GiB)": 75.3, "step": 10, - "train_speed(iter/s)": 0.018238 + "train_speed(iter/s)": 0.003359 }, { - "epoch": 0.003533907102417045, - "grad_norm": 2.140625, - "learning_rate": 9.99973137534353e-06, - "loss": 0.06987351, - "memory(GiB)": 75.24, + "epoch": 0.01413562840966818, + "grad_norm": 1.1796875, + "learning_rate": 9.995696500215899e-06, + "loss": 0.09046092, + "memory(GiB)": 75.3, "step": 15, - "train_speed(iter/s)": 0.018317 + "train_speed(iter/s)": 0.003358 }, { - "epoch": 0.00471187613655606, - "grad_norm": 2.515625, - "learning_rate": 9.999505240969388e-06, - "loss": 0.0606461, - "memory(GiB)": 75.24, + "epoch": 0.01884750454622424, + "grad_norm": 1.140625, + "learning_rate": 9.992074613325435e-06, + "loss": 0.08653282, + "memory(GiB)": 75.3, "step": 20, - "train_speed(iter/s)": 0.01837 + "train_speed(iter/s)": 0.00335 }, { - "epoch": 0.005889845170695076, - "grad_norm": 2.4375, - "learning_rate": 9.999210586142718e-06, - "loss": 0.06591458, - "memory(GiB)": 75.24, + "epoch": 0.023559380682780302, + "grad_norm": 1.078125, + "learning_rate": 9.987356497795944e-06, + "loss": 0.08451628, + "memory(GiB)": 75.3, "step": 25, - "train_speed(iter/s)": 0.018407 + "train_speed(iter/s)": 0.003358 }, { - "epoch": 0.00706781420483409, - "grad_norm": 2.8125, - "learning_rate": 9.998847414901898e-06, - "loss": 0.06059705, - "memory(GiB)": 75.24, + "epoch": 0.02827125681933636, + "grad_norm": 1.09375, + "learning_rate": 9.981543189696349e-06, + "loss": 0.0772208, + "memory(GiB)": 75.3, "step": 30, - "train_speed(iter/s)": 0.018432 + "train_speed(iter/s)": 0.003356 }, { - "epoch": 0.008245783238973105, - "grad_norm": 1.9921875, - "learning_rate": 9.998415732224352e-06, - "loss": 0.06047676, - "memory(GiB)": 75.24, + "epoch": 0.03298313295589242, + "grad_norm": 1.125, + "learning_rate": 9.97463596559307e-06, + "loss": 0.08322463, + "memory(GiB)": 75.3, "step": 35, - "train_speed(iter/s)": 0.018453 + "train_speed(iter/s)": 0.003356 }, { - "epoch": 0.00942375227311212, - "grad_norm": 1.921875, - "learning_rate": 9.997915544026483e-06, - "loss": 0.06190881, - "memory(GiB)": 75.24, + "epoch": 0.03769500909244848, + "grad_norm": 1.09375, + "learning_rate": 9.966636342269706e-06, + "loss": 0.07725406, + "memory(GiB)": 75.3, "step": 40, - "train_speed(iter/s)": 0.018469 + "train_speed(iter/s)": 0.003355 }, { - "epoch": 0.010601721307251136, - "grad_norm": 1.859375, - "learning_rate": 9.997346857163591e-06, - "loss": 0.05765554, - "memory(GiB)": 75.24, + "epoch": 0.04240688522900454, + "grad_norm": 1.15625, + "learning_rate": 9.957546076393944e-06, + "loss": 0.07683957, + "memory(GiB)": 75.3, "step": 45, - "train_speed(iter/s)": 0.018482 + "train_speed(iter/s)": 0.003356 }, { - "epoch": 0.011779690341390151, - "grad_norm": 2.5625, - "learning_rate": 9.99670967942979e-06, - "loss": 0.0662235, - "memory(GiB)": 75.24, + "epoch": 0.047118761365560605, + "grad_norm": 1.1328125, + "learning_rate": 9.947367164131823e-06, + "loss": 0.07508552, + "memory(GiB)": 75.3, "step": 50, - "train_speed(iter/s)": 0.01849 + "train_speed(iter/s)": 0.003355 }, { - "epoch": 0.012957659375529167, - "grad_norm": 2.390625, - "learning_rate": 9.996004019557879e-06, - "loss": 0.06362078, - "memory(GiB)": 75.24, + "epoch": 0.05183063750211667, + "grad_norm": 1.0703125, + "learning_rate": 9.936101840709373e-06, + "loss": 0.07236413, + "memory(GiB)": 75.3, "step": 55, - "train_speed(iter/s)": 0.0185 + "train_speed(iter/s)": 0.003353 }, { - "epoch": 0.01413562840966818, - "grad_norm": 2.875, - "learning_rate": 9.995229887219246e-06, - "loss": 0.06171583, - "memory(GiB)": 75.24, + "epoch": 0.05654251363867272, + "grad_norm": 1.0703125, + "learning_rate": 9.923752579921787e-06, + "loss": 0.07231579, + "memory(GiB)": 75.3, "step": 60, - "train_speed(iter/s)": 0.018512 + "train_speed(iter/s)": 0.003349 }, { - "epoch": 0.015313597443807196, - "grad_norm": 2.109375, - "learning_rate": 9.99438729302372e-06, - "loss": 0.06211852, - "memory(GiB)": 75.24, + "epoch": 0.06125438977522878, + "grad_norm": 1.0234375, + "learning_rate": 9.910322093590177e-06, + "loss": 0.07145001, + "memory(GiB)": 75.3, "step": 65, - "train_speed(iter/s)": 0.018519 + "train_speed(iter/s)": 0.003348 }, { - "epoch": 0.01649156647794621, - "grad_norm": 1.828125, - "learning_rate": 9.993476248519429e-06, - "loss": 0.06484153, - "memory(GiB)": 75.24, + "epoch": 0.06596626591178484, + "grad_norm": 1.0546875, + "learning_rate": 9.895813330966086e-06, + "loss": 0.07301619, + "memory(GiB)": 75.3, "step": 70, - "train_speed(iter/s)": 0.01852 + "train_speed(iter/s)": 0.003349 }, { - "epoch": 0.017669535512085225, - "grad_norm": 1.90625, - "learning_rate": 9.992496766192645e-06, - "loss": 0.06099743, - "memory(GiB)": 75.24, + "epoch": 0.0706781420483409, + "grad_norm": 1.1015625, + "learning_rate": 9.880229478083849e-06, + "loss": 0.0724276, + "memory(GiB)": 75.3, "step": 75, - "train_speed(iter/s)": 0.018526 + "train_speed(iter/s)": 0.00335 }, { - "epoch": 0.01884750454622424, - "grad_norm": 1.796875, - "learning_rate": 9.991448859467611e-06, - "loss": 0.05843818, - "memory(GiB)": 75.24, + "epoch": 0.07539001818489696, + "grad_norm": 1.0390625, + "learning_rate": 9.863573957060953e-06, + "loss": 0.06874905, + "memory(GiB)": 75.3, "step": 80, - "train_speed(iter/s)": 0.018543 + "train_speed(iter/s)": 0.003349 }, { - "epoch": 0.020025473580363256, - "grad_norm": 1.8203125, - "learning_rate": 9.99033254270636e-06, - "loss": 0.05953899, - "memory(GiB)": 75.24, + "epoch": 0.08010189432145302, + "grad_norm": 1.0859375, + "learning_rate": 9.845850425346563e-06, + "loss": 0.07212579, + "memory(GiB)": 75.3, "step": 85, - "train_speed(iter/s)": 0.018546 + "train_speed(iter/s)": 0.003347 }, { - "epoch": 0.02120344261450227, - "grad_norm": 1.9609375, - "learning_rate": 9.989147831208508e-06, - "loss": 0.06501681, - "memory(GiB)": 75.24, + "epoch": 0.08481377045800909, + "grad_norm": 1.1171875, + "learning_rate": 9.827062774918377e-06, + "loss": 0.07294501, + "memory(GiB)": 75.3, "step": 90, - "train_speed(iter/s)": 0.018554 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.022381411648641287, - "grad_norm": 2.609375, - "learning_rate": 9.987894741211056e-06, - "loss": 0.06521546, - "memory(GiB)": 75.24, + "epoch": 0.08952564659456515, + "grad_norm": 0.98828125, + "learning_rate": 9.807215131427966e-06, + "loss": 0.06517277, + "memory(GiB)": 75.3, "step": 95, - "train_speed(iter/s)": 0.01856 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.023559380682780302, - "grad_norm": 2.046875, - "learning_rate": 9.986573289888164e-06, - "loss": 0.06153967, - "memory(GiB)": 75.24, + "epoch": 0.09423752273112121, + "grad_norm": 0.984375, + "learning_rate": 9.786311853294799e-06, + "loss": 0.06962139, + "memory(GiB)": 75.3, "step": 100, - "train_speed(iter/s)": 0.018562 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.024737349716919318, - "grad_norm": 2.109375, - "learning_rate": 9.98518349535091e-06, - "loss": 0.07089446, - "memory(GiB)": 75.24, + "epoch": 0.09894939886767727, + "grad_norm": 0.98828125, + "learning_rate": 9.764357530749178e-06, + "loss": 0.06724482, + "memory(GiB)": 75.3, "step": 105, - "train_speed(iter/s)": 0.018452 + "train_speed(iter/s)": 0.003339 }, { - "epoch": 0.025915318751058333, - "grad_norm": 1.7578125, - "learning_rate": 9.98372537664705e-06, - "loss": 0.05478874, - "memory(GiB)": 75.24, + "epoch": 0.10366127500423333, + "grad_norm": 1.015625, + "learning_rate": 9.741356984824234e-06, + "loss": 0.06572815, + "memory(GiB)": 75.3, "step": 110, - "train_speed(iter/s)": 0.018463 + "train_speed(iter/s)": 0.003339 }, { - "epoch": 0.027093287785197345, - "grad_norm": 2.9375, - "learning_rate": 9.982198953760752e-06, - "loss": 0.06532571, - "memory(GiB)": 75.24, + "epoch": 0.10837315114078938, + "grad_norm": 1.0390625, + "learning_rate": 9.717315266297277e-06, + "loss": 0.06739124, + "memory(GiB)": 75.3, "step": 115, - "train_speed(iter/s)": 0.018473 + "train_speed(iter/s)": 0.003342 }, { - "epoch": 0.02827125681933636, - "grad_norm": 2.234375, - "learning_rate": 9.980604247612325e-06, - "loss": 0.06488043, - "memory(GiB)": 75.24, + "epoch": 0.11308502727734544, + "grad_norm": 0.9375, + "learning_rate": 9.692237654580658e-06, + "loss": 0.06834027, + "memory(GiB)": 75.3, "step": 120, - "train_speed(iter/s)": 0.018478 + "train_speed(iter/s)": 0.003342 }, { - "epoch": 0.029449225853475376, - "grad_norm": 2.28125, - "learning_rate": 9.978941280057928e-06, - "loss": 0.06263313, - "memory(GiB)": 75.24, + "epoch": 0.1177969034139015, + "grad_norm": 1.078125, + "learning_rate": 9.66612965656245e-06, + "loss": 0.0658385, + "memory(GiB)": 75.3, "step": 125, - "train_speed(iter/s)": 0.018482 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.03062719488761439, - "grad_norm": 2.21875, - "learning_rate": 9.977210073889273e-06, - "loss": 0.0654664, - "memory(GiB)": 75.24, + "epoch": 0.12250877955045757, + "grad_norm": 1.0859375, + "learning_rate": 9.638997005397174e-06, + "loss": 0.0717117, + "memory(GiB)": 75.3, "step": 130, - "train_speed(iter/s)": 0.018487 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.03180516392175341, - "grad_norm": 2.171875, - "learning_rate": 9.975410652833316e-06, - "loss": 0.06672717, - "memory(GiB)": 75.24, + "epoch": 0.12722065568701363, + "grad_norm": 0.9765625, + "learning_rate": 9.610845659246833e-06, + "loss": 0.0667814, + "memory(GiB)": 75.3, "step": 135, - "train_speed(iter/s)": 0.018489 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.03298313295589242, - "grad_norm": 2.875, - "learning_rate": 9.973543041551924e-06, - "loss": 0.06413687, - "memory(GiB)": 75.24, + "epoch": 0.13193253182356968, + "grad_norm": 0.9140625, + "learning_rate": 9.581681799972528e-06, + "loss": 0.06573244, + "memory(GiB)": 75.3, "step": 140, - "train_speed(iter/s)": 0.01849 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.03416110199003144, - "grad_norm": 1.9453125, - "learning_rate": 9.971607265641547e-06, - "loss": 0.0582508, - "memory(GiB)": 75.24, + "epoch": 0.13664440796012575, + "grad_norm": 1.03125, + "learning_rate": 9.551511831776966e-06, + "loss": 0.06967602, + "memory(GiB)": 75.3, "step": 145, - "train_speed(iter/s)": 0.018495 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.03533907102417045, - "grad_norm": 1.9375, - "learning_rate": 9.969603351632855e-06, - "loss": 0.06022533, - "memory(GiB)": 75.24, + "epoch": 0.1413562840966818, + "grad_norm": 0.90625, + "learning_rate": 9.520342379798141e-06, + "loss": 0.06216406, + "memory(GiB)": 75.3, "step": 150, - "train_speed(iter/s)": 0.0185 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.03651704005830947, - "grad_norm": 2.109375, - "learning_rate": 9.967531326990387e-06, - "loss": 0.06132371, - "memory(GiB)": 75.24, + "epoch": 0.14606816023323788, + "grad_norm": 1.0390625, + "learning_rate": 9.488180288654485e-06, + "loss": 0.06460171, + "memory(GiB)": 75.3, "step": 155, - "train_speed(iter/s)": 0.018504 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.03769500909244848, - "grad_norm": 2.078125, - "learning_rate": 9.965391220112165e-06, - "loss": 0.07101279, - "memory(GiB)": 75.24, + "epoch": 0.15078003636979392, + "grad_norm": 1.078125, + "learning_rate": 9.45503262094184e-06, + "loss": 0.06467786, + "memory(GiB)": 75.3, "step": 160, - "train_speed(iter/s)": 0.018506 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.0388729781265875, - "grad_norm": 2.140625, - "learning_rate": 9.96318306032931e-06, - "loss": 0.0588982, - "memory(GiB)": 75.24, + "epoch": 0.15549191250635, + "grad_norm": 1.0625, + "learning_rate": 9.420906655682553e-06, + "loss": 0.06358048, + "memory(GiB)": 75.3, "step": 165, - "train_speed(iter/s)": 0.018505 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.04005094716072651, - "grad_norm": 2.125, - "learning_rate": 9.96090687790564e-06, - "loss": 0.06118761, - "memory(GiB)": 75.24, + "epoch": 0.16020378864290605, + "grad_norm": 1.015625, + "learning_rate": 9.385809886727044e-06, + "loss": 0.06778824, + "memory(GiB)": 75.3, "step": 170, - "train_speed(iter/s)": 0.018511 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.04122891619486553, - "grad_norm": 1.8671875, - "learning_rate": 9.95856270403725e-06, - "loss": 0.06012461, - "memory(GiB)": 75.24, + "epoch": 0.16491566477946212, + "grad_norm": 1.046875, + "learning_rate": 9.349750021108212e-06, + "loss": 0.06321884, + "memory(GiB)": 75.3, "step": 175, - "train_speed(iter/s)": 0.018517 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.04240688522900454, - "grad_norm": 2.234375, - "learning_rate": 9.956150570852088e-06, - "loss": 0.0591939, - "memory(GiB)": 75.24, + "epoch": 0.16962754091601817, + "grad_norm": 0.97265625, + "learning_rate": 9.31273497734901e-06, + "loss": 0.06310185, + "memory(GiB)": 75.3, "step": 180, - "train_speed(iter/s)": 0.01852 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.043584854263143555, - "grad_norm": 2.234375, - "learning_rate": 9.95367051140952e-06, - "loss": 0.06429687, - "memory(GiB)": 75.24, + "epoch": 0.17433941705257422, + "grad_norm": 0.9765625, + "learning_rate": 9.274772883723587e-06, + "loss": 0.06271737, + "memory(GiB)": 75.3, "step": 185, - "train_speed(iter/s)": 0.018524 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.044762823297282574, - "grad_norm": 1.59375, - "learning_rate": 9.951122559699868e-06, - "loss": 0.05647093, - "memory(GiB)": 75.24, + "epoch": 0.1790512931891303, + "grad_norm": 0.97265625, + "learning_rate": 9.235872076472378e-06, + "loss": 0.06393245, + "memory(GiB)": 75.3, "step": 190, - "train_speed(iter/s)": 0.018525 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.045940792331421586, - "grad_norm": 1.9140625, - "learning_rate": 9.948506750643946e-06, - "loss": 0.05816346, - "memory(GiB)": 75.24, + "epoch": 0.18376316932568634, + "grad_norm": 1.03125, + "learning_rate": 9.196041097971509e-06, + "loss": 0.06558744, + "memory(GiB)": 75.3, "step": 195, - "train_speed(iter/s)": 0.018525 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.047118761365560605, - "grad_norm": 2.546875, - "learning_rate": 9.94582312009259e-06, - "loss": 0.05947306, - "memory(GiB)": 75.24, + "epoch": 0.18847504546224242, + "grad_norm": 0.98046875, + "learning_rate": 9.155288694856942e-06, + "loss": 0.06127087, + "memory(GiB)": 75.3, "step": 200, - "train_speed(iter/s)": 0.018527 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.04829673039969962, - "grad_norm": 2.359375, - "learning_rate": 9.943071704826153e-06, - "loss": 0.06321282, - "memory(GiB)": 75.24, + "epoch": 0.19318692159879847, + "grad_norm": 0.875, + "learning_rate": 9.113623816103775e-06, + "loss": 0.06313071, + "memory(GiB)": 75.3, "step": 205, - "train_speed(iter/s)": 0.018454 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.049474699433838636, - "grad_norm": 2.203125, - "learning_rate": 9.940252542554007e-06, - "loss": 0.06456767, - "memory(GiB)": 75.24, + "epoch": 0.19789879773535454, + "grad_norm": 1.0, + "learning_rate": 9.071055611061102e-06, + "loss": 0.06330621, + "memory(GiB)": 75.3, "step": 210, - "train_speed(iter/s)": 0.018455 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.05065266846797765, - "grad_norm": 2.15625, - "learning_rate": 9.937365671914037e-06, - "loss": 0.06057892, - "memory(GiB)": 75.24, + "epoch": 0.2026106738719106, + "grad_norm": 0.9453125, + "learning_rate": 9.027593427442867e-06, + "loss": 0.06415906, + "memory(GiB)": 75.3, "step": 215, - "train_speed(iter/s)": 0.018456 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.05183063750211667, - "grad_norm": 2.0, - "learning_rate": 9.934411132472088e-06, - "loss": 0.05920454, - "memory(GiB)": 75.24, + "epoch": 0.20732255000846667, + "grad_norm": 0.94140625, + "learning_rate": 8.98324680927517e-06, + "loss": 0.06299359, + "memory(GiB)": 75.3, "step": 220, - "train_speed(iter/s)": 0.018458 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.05300860653625568, - "grad_norm": 2.015625, - "learning_rate": 9.931388964721446e-06, - "loss": 0.05975649, - "memory(GiB)": 75.24, + "epoch": 0.21203442614502271, + "grad_norm": 0.9609375, + "learning_rate": 8.938025494800454e-06, + "loss": 0.06004124, + "memory(GiB)": 75.3, "step": 225, - "train_speed(iter/s)": 0.018461 + "train_speed(iter/s)": 0.003343 }, { - "epoch": 0.05418657557039469, - "grad_norm": 2.0, - "learning_rate": 9.92829921008227e-06, - "loss": 0.06393375, - "memory(GiB)": 75.24, + "epoch": 0.21674630228157876, + "grad_norm": 0.97265625, + "learning_rate": 8.891939414339048e-06, + "loss": 0.06477681, + "memory(GiB)": 75.3, "step": 230, - "train_speed(iter/s)": 0.018462 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.05536454460453371, - "grad_norm": 2.28125, - "learning_rate": 9.925141910901029e-06, - "loss": 0.06334119, - "memory(GiB)": 75.24, + "epoch": 0.22145817841813484, + "grad_norm": 0.92578125, + "learning_rate": 8.844998688108535e-06, + "loss": 0.06010489, + "memory(GiB)": 75.3, "step": 235, - "train_speed(iter/s)": 0.018466 + "train_speed(iter/s)": 0.003344 }, { - "epoch": 0.05654251363867272, - "grad_norm": 2.09375, - "learning_rate": 9.921917110449914e-06, - "loss": 0.06911048, - "memory(GiB)": 75.24, + "epoch": 0.22617005455469089, + "grad_norm": 0.9609375, + "learning_rate": 8.797213624001403e-06, + "loss": 0.05960445, + "memory(GiB)": 75.3, "step": 240, - "train_speed(iter/s)": 0.018468 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.05772048267281174, - "grad_norm": 1.984375, - "learning_rate": 9.918624852926258e-06, - "loss": 0.05916922, - "memory(GiB)": 75.24, + "epoch": 0.23088193069124696, + "grad_norm": 1.0, + "learning_rate": 8.748594715321512e-06, + "loss": 0.06301316, + "memory(GiB)": 75.3, "step": 245, - "train_speed(iter/s)": 0.01847 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.05889845170695075, - "grad_norm": 1.859375, - "learning_rate": 9.915265183451923e-06, - "loss": 0.06251335, - "memory(GiB)": 75.24, + "epoch": 0.235593806827803, + "grad_norm": 0.94921875, + "learning_rate": 8.699152638479817e-06, + "loss": 0.06120233, + "memory(GiB)": 75.3, "step": 250, - "train_speed(iter/s)": 0.018471 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.06007642074108977, - "grad_norm": 1.8515625, - "learning_rate": 9.911838148072678e-06, - "loss": 0.06203491, - "memory(GiB)": 75.24, + "epoch": 0.24030568296435909, + "grad_norm": 0.97265625, + "learning_rate": 8.6488982506499e-06, + "loss": 0.06014684, + "memory(GiB)": 75.3, "step": 255, - "train_speed(iter/s)": 0.018477 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.06125438977522878, - "grad_norm": 2.265625, - "learning_rate": 9.908343793757574e-06, - "loss": 0.06085759, - "memory(GiB)": 75.24, + "epoch": 0.24501755910091513, + "grad_norm": 1.0, + "learning_rate": 8.597842587383797e-06, + "loss": 0.05922247, + "memory(GiB)": 75.3, "step": 260, - "train_speed(iter/s)": 0.01848 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.062432358809367795, - "grad_norm": 2.375, - "learning_rate": 9.904782168398296e-06, - "loss": 0.06250409, - "memory(GiB)": 75.24, + "epoch": 0.24972943523747118, + "grad_norm": 0.97265625, + "learning_rate": 8.545996860188668e-06, + "loss": 0.05851297, + "memory(GiB)": 75.3, "step": 265, - "train_speed(iter/s)": 0.018484 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.06361032784350681, - "grad_norm": 1.9609375, - "learning_rate": 9.901153320808514e-06, - "loss": 0.05536562, - "memory(GiB)": 75.24, + "epoch": 0.25444131137402726, + "grad_norm": 1.0625, + "learning_rate": 8.493372454064809e-06, + "loss": 0.05934198, + "memory(GiB)": 75.3, "step": 270, - "train_speed(iter/s)": 0.018489 + "train_speed(iter/s)": 0.003345 }, { - "epoch": 0.06478829687764583, - "grad_norm": 1.8359375, - "learning_rate": 9.897457300723202e-06, - "loss": 0.05569639, - "memory(GiB)": 75.24, + "epoch": 0.2591531875105833, + "grad_norm": 0.90234375, + "learning_rate": 8.439980925005587e-06, + "loss": 0.06134464, + "memory(GiB)": 75.3, "step": 275, - "train_speed(iter/s)": 0.018491 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.06596626591178484, - "grad_norm": 2.40625, - "learning_rate": 9.893694158797968e-06, - "loss": 0.05840618, - "memory(GiB)": 75.24, + "epoch": 0.26386506364713935, + "grad_norm": 0.90234375, + "learning_rate": 8.385833997459804e-06, + "loss": 0.05825667, + "memory(GiB)": 75.3, "step": 280, - "train_speed(iter/s)": 0.018494 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.06714423494592386, - "grad_norm": 2.265625, - "learning_rate": 9.889863946608352e-06, - "loss": 0.05661937, - "memory(GiB)": 75.24, + "epoch": 0.26857693978369546, + "grad_norm": 0.8828125, + "learning_rate": 8.330943561757092e-06, + "loss": 0.06092241, + "memory(GiB)": 75.3, "step": 285, - "train_speed(iter/s)": 0.018496 + "train_speed(iter/s)": 0.003346 }, { - "epoch": 0.06832220398006288, - "grad_norm": 2.140625, - "learning_rate": 9.885966716649125e-06, - "loss": 0.06150655, - "memory(GiB)": 75.24, + "epoch": 0.2732888159202515, + "grad_norm": 0.91796875, + "learning_rate": 8.275321671496862e-06, + "loss": 0.05940055, + "memory(GiB)": 75.3, "step": 290, - "train_speed(iter/s)": 0.018497 + "train_speed(iter/s)": 0.003347 }, { - "epoch": 0.06950017301420189, - "grad_norm": 2.09375, - "learning_rate": 9.88200252233356e-06, - "loss": 0.06209329, - "memory(GiB)": 75.24, + "epoch": 0.27800069205680755, + "grad_norm": 0.9140625, + "learning_rate": 8.218980540901417e-06, + "loss": 0.05920713, + "memory(GiB)": 75.3, "step": 295, - "train_speed(iter/s)": 0.018497 + "train_speed(iter/s)": 0.003347 }, { - "epoch": 0.0706781420483409, - "grad_norm": 3.375, - "learning_rate": 9.877971417992716e-06, - "loss": 0.05904433, - "memory(GiB)": 75.24, + "epoch": 0.2827125681933636, + "grad_norm": 0.92578125, + "learning_rate": 8.16193254213377e-06, + "loss": 0.05777416, + "memory(GiB)": 75.3, "step": 300, - "train_speed(iter/s)": 0.018499 - }, - { - "epoch": 0.07185611108247993, - "grad_norm": 1.796875, - "learning_rate": 9.873873458874676e-06, - "loss": 0.05126434, - "memory(GiB)": 75.24, - "step": 305, - "train_speed(iter/s)": 0.018458 - }, - { - "epoch": 0.07303408011661894, - "grad_norm": 2.0, - "learning_rate": 9.8697087011438e-06, - "loss": 0.05796698, - "memory(GiB)": 75.24, - "step": 310, - "train_speed(iter/s)": 0.018459 - }, - { - "epoch": 0.07421204915075795, - "grad_norm": 1.875, - "learning_rate": 9.865477201879953e-06, - "loss": 0.05630487, - "memory(GiB)": 75.24, - "step": 315, - "train_speed(iter/s)": 0.01846 - }, - { - "epoch": 0.07539001818489696, - "grad_norm": 2.515625, - "learning_rate": 9.861179019077725e-06, - "loss": 0.0567848, - "memory(GiB)": 75.24, - "step": 320, - "train_speed(iter/s)": 0.018461 - }, - { - "epoch": 0.07656798721903597, - "grad_norm": 2.109375, - "learning_rate": 9.856814211645627e-06, - "loss": 0.05985626, - "memory(GiB)": 75.24, - "step": 325, - "train_speed(iter/s)": 0.018463 - }, - { - "epoch": 0.077745956253175, - "grad_norm": 2.09375, - "learning_rate": 9.852382839405298e-06, - "loss": 0.05782009, - "memory(GiB)": 75.24, - "step": 330, - "train_speed(iter/s)": 0.018466 - }, - { - "epoch": 0.07892392528731401, - "grad_norm": 2.28125, - "learning_rate": 9.847884963090675e-06, - "loss": 0.06585214, - "memory(GiB)": 75.24, - "step": 335, - "train_speed(iter/s)": 0.018468 - }, - { - "epoch": 0.08010189432145302, - "grad_norm": 2.234375, - "learning_rate": 9.843320644347156e-06, - "loss": 0.06263242, - "memory(GiB)": 75.24, - "step": 340, - "train_speed(iter/s)": 0.01847 - }, - { - "epoch": 0.08127986335559204, - "grad_norm": 2.203125, - "learning_rate": 9.838689945730776e-06, - "loss": 0.05163463, - "memory(GiB)": 75.24, - "step": 345, - "train_speed(iter/s)": 0.018472 - }, - { - "epoch": 0.08245783238973106, - "grad_norm": 2.015625, - "learning_rate": 9.833992930707321e-06, - "loss": 0.05960041, - "memory(GiB)": 75.24, - "step": 350, - "train_speed(iter/s)": 0.018475 - }, - { - "epoch": 0.08363580142387007, - "grad_norm": 2.5, - "learning_rate": 9.829229663651483e-06, - "loss": 0.05999585, - "memory(GiB)": 75.24, - "step": 355, - "train_speed(iter/s)": 0.018477 - }, - { - "epoch": 0.08481377045800909, - "grad_norm": 1.671875, - "learning_rate": 9.824400209845967e-06, - "loss": 0.05059795, - "memory(GiB)": 75.24, - "step": 360, - "train_speed(iter/s)": 0.018479 - }, - { - "epoch": 0.0859917394921481, - "grad_norm": 2.171875, - "learning_rate": 9.81950463548059e-06, - "loss": 0.05671123, - "memory(GiB)": 75.24, - "step": 365, - "train_speed(iter/s)": 0.018481 - }, - { - "epoch": 0.08716970852628711, - "grad_norm": 2.625, - "learning_rate": 9.814543007651389e-06, - "loss": 0.05803382, - "memory(GiB)": 75.24, - "step": 370, - "train_speed(iter/s)": 0.018483 - }, - { - "epoch": 0.08834767756042614, - "grad_norm": 1.890625, - "learning_rate": 9.80951539435969e-06, - "loss": 0.05704566, - "memory(GiB)": 75.24, - "step": 375, - "train_speed(iter/s)": 0.018485 - }, - { - "epoch": 0.08952564659456515, - "grad_norm": 2.03125, - "learning_rate": 9.804421864511175e-06, - "loss": 0.05998203, - "memory(GiB)": 75.24, - "step": 380, - "train_speed(iter/s)": 0.018487 - }, - { - "epoch": 0.09070361562870416, - "grad_norm": 2.53125, - "learning_rate": 9.79926248791495e-06, - "loss": 0.06044774, - "memory(GiB)": 75.24, - "step": 385, - "train_speed(iter/s)": 0.018488 - }, - { - "epoch": 0.09188158466284317, - "grad_norm": 2.1875, - "learning_rate": 9.794037335282572e-06, - "loss": 0.06596763, - "memory(GiB)": 75.24, - "step": 390, - "train_speed(iter/s)": 0.018489 - }, - { - "epoch": 0.0930595536969822, - "grad_norm": 2.171875, - "learning_rate": 9.788746478227097e-06, - "loss": 0.06313769, - "memory(GiB)": 75.24, - "step": 395, - "train_speed(iter/s)": 0.018489 - }, - { - "epoch": 0.09423752273112121, - "grad_norm": 1.9296875, - "learning_rate": 9.783389989262078e-06, - "loss": 0.05841722, - "memory(GiB)": 75.24, - "step": 400, - "train_speed(iter/s)": 0.018489 - }, - { - "epoch": 0.09541549176526022, - "grad_norm": 2.171875, - "learning_rate": 9.777967941800593e-06, - "loss": 0.05844305, - "memory(GiB)": 75.24, - "step": 405, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.09659346079939923, - "grad_norm": 2.125, - "learning_rate": 9.772480410154224e-06, - "loss": 0.05875611, - "memory(GiB)": 75.24, - "step": 410, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.09777142983353825, - "grad_norm": 2.078125, - "learning_rate": 9.766927469532042e-06, - "loss": 0.0553933, - "memory(GiB)": 75.24, - "step": 415, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.09894939886767727, - "grad_norm": 2.09375, - "learning_rate": 9.76130919603958e-06, - "loss": 0.05277152, - "memory(GiB)": 75.24, - "step": 420, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.10012736790181628, - "grad_norm": 1.984375, - "learning_rate": 9.755625666677786e-06, - "loss": 0.05973901, - "memory(GiB)": 75.24, - "step": 425, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.1013053369359553, - "grad_norm": 1.859375, - "learning_rate": 9.749876959341966e-06, - "loss": 0.05203662, - "memory(GiB)": 75.24, - "step": 430, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.10248330597009431, - "grad_norm": 2.265625, - "learning_rate": 9.744063152820726e-06, - "loss": 0.06516852, - "memory(GiB)": 75.24, - "step": 435, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.10366127500423333, - "grad_norm": 1.8828125, - "learning_rate": 9.738184326794878e-06, - "loss": 0.04924915, - "memory(GiB)": 75.24, - "step": 440, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.10483924403837235, - "grad_norm": 2.28125, - "learning_rate": 9.732240561836362e-06, - "loss": 0.0607591, - "memory(GiB)": 75.24, - "step": 445, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.10601721307251136, - "grad_norm": 1.8359375, - "learning_rate": 9.726231939407126e-06, - "loss": 0.05255068, - "memory(GiB)": 75.24, - "step": 450, - "train_speed(iter/s)": 0.018459 - }, - { - "epoch": 0.10719518210665037, - "grad_norm": 2.625, - "learning_rate": 9.72015854185803e-06, - "loss": 0.06666003, - "memory(GiB)": 75.24, - "step": 455, - "train_speed(iter/s)": 0.018461 - }, - { - "epoch": 0.10837315114078938, - "grad_norm": 1.859375, - "learning_rate": 9.714020452427699e-06, - "loss": 0.0617595, - "memory(GiB)": 75.24, - "step": 460, - "train_speed(iter/s)": 0.018462 - }, - { - "epoch": 0.1095511201749284, - "grad_norm": 1.8984375, - "learning_rate": 9.707817755241388e-06, - "loss": 0.05265898, - "memory(GiB)": 75.24, - "step": 465, - "train_speed(iter/s)": 0.018462 - }, - { - "epoch": 0.11072908920906742, - "grad_norm": 2.140625, - "learning_rate": 9.701550535309836e-06, - "loss": 0.06160604, - "memory(GiB)": 75.24, - "step": 470, - "train_speed(iter/s)": 0.018464 - }, - { - "epoch": 0.11190705824320643, - "grad_norm": 1.765625, - "learning_rate": 9.69521887852809e-06, - "loss": 0.051065, - "memory(GiB)": 75.24, - "step": 475, - "train_speed(iter/s)": 0.018466 - }, - { - "epoch": 0.11308502727734544, - "grad_norm": 2.140625, - "learning_rate": 9.688822871674331e-06, - "loss": 0.05968869, - "memory(GiB)": 75.24, - "step": 480, - "train_speed(iter/s)": 0.018466 - }, - { - "epoch": 0.11426299631148447, - "grad_norm": 2.203125, - "learning_rate": 9.682362602408697e-06, - "loss": 0.05215322, - "memory(GiB)": 75.24, - "step": 485, - "train_speed(iter/s)": 0.018467 - }, - { - "epoch": 0.11544096534562348, - "grad_norm": 2.109375, - "learning_rate": 9.675838159272057e-06, - "loss": 0.05403855, - "memory(GiB)": 75.24, - "step": 490, - "train_speed(iter/s)": 0.01847 - }, - { - "epoch": 0.11661893437976249, - "grad_norm": 1.9296875, - "learning_rate": 9.669249631684825e-06, - "loss": 0.05886961, - "memory(GiB)": 75.24, - "step": 495, - "train_speed(iter/s)": 0.018472 - }, - { - "epoch": 0.1177969034139015, - "grad_norm": 1.8671875, - "learning_rate": 9.66259710994571e-06, - "loss": 0.0597525, - "memory(GiB)": 75.24, - "step": 500, - "train_speed(iter/s)": 0.018474 - }, - { - "epoch": 0.11897487244804052, - "grad_norm": 2.0, - "learning_rate": 9.655880685230495e-06, - "loss": 0.05357894, - "memory(GiB)": 75.24, - "step": 505, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.12015284148217954, - "grad_norm": 2.328125, - "learning_rate": 9.64910044959078e-06, - "loss": 0.06085765, - "memory(GiB)": 75.24, - "step": 510, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.12133081051631855, - "grad_norm": 1.90625, - "learning_rate": 9.642256495952726e-06, - "loss": 0.05562757, - "memory(GiB)": 75.24, - "step": 515, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.12250877955045757, - "grad_norm": 1.65625, - "learning_rate": 9.635348918115773e-06, - "loss": 0.06202195, - "memory(GiB)": 75.24, - "step": 520, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.12368674858459658, - "grad_norm": 2.0625, - "learning_rate": 9.628377810751361e-06, - "loss": 0.05460469, - "memory(GiB)": 75.24, - "step": 525, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.12486471761873559, - "grad_norm": 1.9375, - "learning_rate": 9.621343269401629e-06, - "loss": 0.0583164, - "memory(GiB)": 75.24, - "step": 530, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.1260426866528746, - "grad_norm": 1.921875, - "learning_rate": 9.61424539047811e-06, - "loss": 0.05761399, - "memory(GiB)": 75.24, - "step": 535, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.12722065568701363, - "grad_norm": 2.0625, - "learning_rate": 9.607084271260405e-06, - "loss": 0.05920839, - "memory(GiB)": 75.24, - "step": 540, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.12839862472115265, - "grad_norm": 1.9296875, - "learning_rate": 9.59986000989485e-06, - "loss": 0.04984753, - "memory(GiB)": 75.24, - "step": 545, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.12957659375529165, - "grad_norm": 1.828125, - "learning_rate": 9.592572705393177e-06, - "loss": 0.05788959, - "memory(GiB)": 75.24, - "step": 550, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.13075456278943068, - "grad_norm": 2.359375, - "learning_rate": 9.585222457631145e-06, - "loss": 0.05896995, - "memory(GiB)": 75.24, - "step": 555, - "train_speed(iter/s)": 0.018458 - }, - { - "epoch": 0.13193253182356968, - "grad_norm": 2.109375, - "learning_rate": 9.577809367347188e-06, - "loss": 0.05593311, - "memory(GiB)": 75.24, - "step": 560, - "train_speed(iter/s)": 0.01846 - }, - { - "epoch": 0.1331105008577087, - "grad_norm": 1.875, - "learning_rate": 9.570333536141016e-06, - "loss": 0.0451288, - "memory(GiB)": 75.24, - "step": 565, - "train_speed(iter/s)": 0.018461 - }, - { - "epoch": 0.13428846989184773, - "grad_norm": 1.9296875, - "learning_rate": 9.562795066472236e-06, - "loss": 0.0507852, - "memory(GiB)": 75.24, - "step": 570, - "train_speed(iter/s)": 0.018463 - }, - { - "epoch": 0.13546643892598673, - "grad_norm": 2.015625, - "learning_rate": 9.555194061658942e-06, - "loss": 0.06269175, - "memory(GiB)": 75.24, - "step": 575, - "train_speed(iter/s)": 0.018464 - }, - { - "epoch": 0.13664440796012575, - "grad_norm": 2.203125, - "learning_rate": 9.5475306258763e-06, - "loss": 0.06263654, - "memory(GiB)": 75.24, - "step": 580, - "train_speed(iter/s)": 0.018465 - }, - { - "epoch": 0.13782237699426475, - "grad_norm": 2.015625, - "learning_rate": 9.539804864155123e-06, - "loss": 0.0612376, - "memory(GiB)": 75.24, - "step": 585, - "train_speed(iter/s)": 0.018467 - }, - { - "epoch": 0.13900034602840378, - "grad_norm": 1.9453125, - "learning_rate": 9.532016882380422e-06, - "loss": 0.05655064, - "memory(GiB)": 75.24, - "step": 590, - "train_speed(iter/s)": 0.018468 - }, - { - "epoch": 0.1401783150625428, - "grad_norm": 2.296875, - "learning_rate": 9.524166787289968e-06, - "loss": 0.05865269, - "memory(GiB)": 75.24, - "step": 595, - "train_speed(iter/s)": 0.018467 - }, - { - "epoch": 0.1413562840966818, - "grad_norm": 1.8515625, - "learning_rate": 9.516254686472822e-06, - "loss": 0.06035703, - "memory(GiB)": 75.24, - "step": 600, - "train_speed(iter/s)": 0.018467 - }, - { - "epoch": 0.14253425313082083, - "grad_norm": 1.7578125, - "learning_rate": 9.508280688367859e-06, - "loss": 0.05865194, - "memory(GiB)": 75.24, - "step": 605, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.14371222216495985, - "grad_norm": 2.3125, - "learning_rate": 9.500244902262285e-06, - "loss": 0.05606778, - "memory(GiB)": 75.24, - "step": 610, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.14489019119909885, - "grad_norm": 1.9453125, - "learning_rate": 9.492147438290136e-06, - "loss": 0.05248761, - "memory(GiB)": 75.24, - "step": 615, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.14606816023323788, - "grad_norm": 1.609375, - "learning_rate": 9.483988407430777e-06, - "loss": 0.04969699, - "memory(GiB)": 75.24, - "step": 620, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.14724612926737687, - "grad_norm": 1.9609375, - "learning_rate": 9.475767921507367e-06, - "loss": 0.04865633, - "memory(GiB)": 75.24, - "step": 625, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.1484240983015159, - "grad_norm": 2.015625, - "learning_rate": 9.467486093185339e-06, - "loss": 0.05612766, - "memory(GiB)": 75.24, - "step": 630, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.14960206733565493, - "grad_norm": 2.46875, - "learning_rate": 9.459143035970848e-06, - "loss": 0.06206894, - "memory(GiB)": 75.24, - "step": 635, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.15078003636979392, - "grad_norm": 2.109375, - "learning_rate": 9.450738864209222e-06, - "loss": 0.05258945, - "memory(GiB)": 75.24, - "step": 640, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.15195800540393295, - "grad_norm": 2.0, - "learning_rate": 9.442273693083387e-06, - "loss": 0.06007297, - "memory(GiB)": 75.24, - "step": 645, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.15313597443807195, - "grad_norm": 2.171875, - "learning_rate": 9.433747638612296e-06, - "loss": 0.05551058, - "memory(GiB)": 75.24, - "step": 650, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.15431394347221097, - "grad_norm": 2.046875, - "learning_rate": 9.425160817649333e-06, - "loss": 0.05920315, - "memory(GiB)": 75.24, - "step": 655, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.15549191250635, - "grad_norm": 2.609375, - "learning_rate": 9.416513347880715e-06, - "loss": 0.05700436, - "memory(GiB)": 75.24, - "step": 660, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.156669881540489, - "grad_norm": 1.859375, - "learning_rate": 9.40780534782388e-06, - "loss": 0.05983176, - "memory(GiB)": 75.24, - "step": 665, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.15784785057462802, - "grad_norm": 2.40625, - "learning_rate": 9.399036936825854e-06, - "loss": 0.05986168, - "memory(GiB)": 75.24, - "step": 670, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.15902581960876702, - "grad_norm": 2.203125, - "learning_rate": 9.390208235061632e-06, - "loss": 0.05381632, - "memory(GiB)": 75.24, - "step": 675, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.16020378864290605, - "grad_norm": 1.71875, - "learning_rate": 9.381319363532509e-06, - "loss": 0.05839885, - "memory(GiB)": 75.24, - "step": 680, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.16138175767704507, - "grad_norm": 1.8359375, - "learning_rate": 9.372370444064444e-06, - "loss": 0.05449136, - "memory(GiB)": 75.24, - "step": 685, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.16255972671118407, - "grad_norm": 1.9609375, - "learning_rate": 9.363361599306377e-06, - "loss": 0.05309354, - "memory(GiB)": 75.24, - "step": 690, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.1637376957453231, - "grad_norm": 2.0, - "learning_rate": 9.354292952728547e-06, - "loss": 0.04827639, - "memory(GiB)": 75.24, - "step": 695, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.16491566477946212, - "grad_norm": 2.5, - "learning_rate": 9.345164628620806e-06, - "loss": 0.05382951, - "memory(GiB)": 75.24, - "step": 700, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.16609363381360112, - "grad_norm": 2.265625, - "learning_rate": 9.335976752090914e-06, - "loss": 0.06151073, - "memory(GiB)": 75.24, - "step": 705, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.16727160284774015, - "grad_norm": 2.09375, - "learning_rate": 9.32672944906282e-06, - "loss": 0.05772614, - "memory(GiB)": 75.24, - "step": 710, - "train_speed(iter/s)": 0.018438 - }, - { - "epoch": 0.16844957188187915, - "grad_norm": 1.9140625, - "learning_rate": 9.317422846274951e-06, - "loss": 0.05297416, - "memory(GiB)": 75.24, - "step": 715, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.16962754091601817, - "grad_norm": 2.3125, - "learning_rate": 9.308057071278447e-06, - "loss": 0.0586594, - "memory(GiB)": 75.24, - "step": 720, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.1708055099501572, - "grad_norm": 2.015625, - "learning_rate": 9.29863225243544e-06, - "loss": 0.05650487, - "memory(GiB)": 75.24, - "step": 725, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.1719834789842962, - "grad_norm": 1.9609375, - "learning_rate": 9.289148518917283e-06, - "loss": 0.04863868, - "memory(GiB)": 75.24, - "step": 730, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.17316144801843522, - "grad_norm": 1.546875, - "learning_rate": 9.279606000702781e-06, - "loss": 0.05578424, - "memory(GiB)": 75.24, - "step": 735, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.17433941705257422, - "grad_norm": 2.0, - "learning_rate": 9.270004828576408e-06, - "loss": 0.06056005, - "memory(GiB)": 75.24, - "step": 740, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.17551738608671325, - "grad_norm": 2.171875, - "learning_rate": 9.260345134126515e-06, - "loss": 0.05374076, - "memory(GiB)": 75.24, - "step": 745, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.17669535512085227, - "grad_norm": 2.375, - "learning_rate": 9.25062704974353e-06, - "loss": 0.05449854, - "memory(GiB)": 75.24, - "step": 750, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.17787332415499127, - "grad_norm": 2.125, - "learning_rate": 9.240850708618143e-06, - "loss": 0.05342836, - "memory(GiB)": 75.24, - "step": 755, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.1790512931891303, - "grad_norm": 2.5625, - "learning_rate": 9.231016244739473e-06, - "loss": 0.05539855, - "memory(GiB)": 75.24, - "step": 760, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.1802292622232693, - "grad_norm": 1.8359375, - "learning_rate": 9.22112379289324e-06, - "loss": 0.05723217, - "memory(GiB)": 75.24, - "step": 765, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.18140723125740832, - "grad_norm": 2.5625, - "learning_rate": 9.21117348865992e-06, - "loss": 0.0604903, - "memory(GiB)": 75.24, - "step": 770, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.18258520029154734, - "grad_norm": 2.34375, - "learning_rate": 9.20116546841287e-06, - "loss": 0.05900519, - "memory(GiB)": 75.24, - "step": 775, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.18376316932568634, - "grad_norm": 2.3125, - "learning_rate": 9.191099869316485e-06, - "loss": 0.0589312, - "memory(GiB)": 75.24, - "step": 780, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.18494113835982537, - "grad_norm": 2.25, - "learning_rate": 9.18097682932429e-06, - "loss": 0.05549632, - "memory(GiB)": 75.24, - "step": 785, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.1861191073939644, - "grad_norm": 1.9296875, - "learning_rate": 9.170796487177076e-06, - "loss": 0.05326385, - "memory(GiB)": 75.24, - "step": 790, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.1872970764281034, - "grad_norm": 1.6015625, - "learning_rate": 9.160558982400976e-06, - "loss": 0.04917867, - "memory(GiB)": 75.24, - "step": 795, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.18847504546224242, - "grad_norm": 2.265625, - "learning_rate": 9.150264455305567e-06, - "loss": 0.0541541, - "memory(GiB)": 75.24, - "step": 800, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.18965301449638142, - "grad_norm": 1.9609375, - "learning_rate": 9.13991304698194e-06, - "loss": 0.05413353, - "memory(GiB)": 75.24, - "step": 805, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.19083098353052044, - "grad_norm": 1.7890625, - "learning_rate": 9.129504899300773e-06, - "loss": 0.05472311, - "memory(GiB)": 75.24, - "step": 810, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.19200895256465947, - "grad_norm": 1.7890625, - "learning_rate": 9.119040154910376e-06, - "loss": 0.05228683, - "memory(GiB)": 75.24, - "step": 815, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.19318692159879847, - "grad_norm": 2.78125, - "learning_rate": 9.108518957234746e-06, - "loss": 0.05799963, - "memory(GiB)": 75.24, - "step": 820, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.1943648906329375, - "grad_norm": 2.078125, - "learning_rate": 9.097941450471597e-06, - "loss": 0.05133148, - "memory(GiB)": 75.24, - "step": 825, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.1955428596670765, - "grad_norm": 1.703125, - "learning_rate": 9.087307779590386e-06, - "loss": 0.05284491, - "memory(GiB)": 75.24, - "step": 830, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.19672082870121552, - "grad_norm": 1.7578125, - "learning_rate": 9.076618090330317e-06, - "loss": 0.06207844, - "memory(GiB)": 75.24, - "step": 835, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.19789879773535454, - "grad_norm": 2.375, - "learning_rate": 9.065872529198363e-06, - "loss": 0.0547864, - "memory(GiB)": 75.24, - "step": 840, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.19907676676949354, - "grad_norm": 1.7421875, - "learning_rate": 9.055071243467235e-06, - "loss": 0.04715272, - "memory(GiB)": 75.24, - "step": 845, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.20025473580363257, - "grad_norm": 1.640625, - "learning_rate": 9.04421438117338e-06, - "loss": 0.04856663, - "memory(GiB)": 75.24, - "step": 850, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.20143270483777156, - "grad_norm": 1.90625, - "learning_rate": 9.033302091114944e-06, - "loss": 0.05076236, - "memory(GiB)": 75.24, - "step": 855, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.2026106738719106, - "grad_norm": 2.671875, - "learning_rate": 9.022334522849736e-06, - "loss": 0.05138582, - "memory(GiB)": 75.24, - "step": 860, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.20378864290604962, - "grad_norm": 2.09375, - "learning_rate": 9.011311826693182e-06, - "loss": 0.05206831, - "memory(GiB)": 75.24, - "step": 865, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.20496661194018861, - "grad_norm": 2.234375, - "learning_rate": 9.000234153716253e-06, - "loss": 0.05928425, - "memory(GiB)": 75.24, - "step": 870, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.20614458097432764, - "grad_norm": 2.03125, - "learning_rate": 8.989101655743407e-06, - "loss": 0.05609163, - "memory(GiB)": 75.24, - "step": 875, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.20732255000846667, - "grad_norm": 2.21875, - "learning_rate": 8.977914485350501e-06, - "loss": 0.06157886, - "memory(GiB)": 75.24, - "step": 880, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.20850051904260566, - "grad_norm": 2.125, - "learning_rate": 8.966672795862703e-06, - "loss": 0.05769287, - "memory(GiB)": 75.24, - "step": 885, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.2096784880767447, - "grad_norm": 2.09375, - "learning_rate": 8.955376741352387e-06, - "loss": 0.05559229, - "memory(GiB)": 75.24, - "step": 890, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.2108564571108837, - "grad_norm": 2.0, - "learning_rate": 8.944026476637027e-06, - "loss": 0.05189531, - "memory(GiB)": 75.24, - "step": 895, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.21203442614502271, - "grad_norm": 1.84375, - "learning_rate": 8.93262215727707e-06, - "loss": 0.05473455, - "memory(GiB)": 75.24, - "step": 900, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.21321239517916174, - "grad_norm": 1.984375, - "learning_rate": 8.92116393957381e-06, - "loss": 0.0589035, - "memory(GiB)": 75.24, - "step": 905, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.21439036421330074, - "grad_norm": 2.203125, - "learning_rate": 8.909651980567235e-06, - "loss": 0.05482052, - "memory(GiB)": 75.24, - "step": 910, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.21556833324743976, - "grad_norm": 1.7578125, - "learning_rate": 8.898086438033889e-06, - "loss": 0.04918118, - "memory(GiB)": 75.24, - "step": 915, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.21674630228157876, - "grad_norm": 1.9453125, - "learning_rate": 8.886467470484695e-06, - "loss": 0.05721087, - "memory(GiB)": 75.24, - "step": 920, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.2179242713157178, - "grad_norm": 2.15625, - "learning_rate": 8.874795237162794e-06, - "loss": 0.06429954, - "memory(GiB)": 75.24, - "step": 925, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.2191022403498568, - "grad_norm": 2.171875, - "learning_rate": 8.86306989804136e-06, - "loss": 0.05842216, - "memory(GiB)": 75.24, - "step": 930, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.2202802093839958, - "grad_norm": 2.1875, - "learning_rate": 8.8512916138214e-06, - "loss": 0.05584286, - "memory(GiB)": 75.24, - "step": 935, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.22145817841813484, - "grad_norm": 2.359375, - "learning_rate": 8.839460545929565e-06, - "loss": 0.06666573, - "memory(GiB)": 75.24, - "step": 940, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.22263614745227384, - "grad_norm": 1.7890625, - "learning_rate": 8.827576856515921e-06, - "loss": 0.05159678, - "memory(GiB)": 75.24, - "step": 945, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.22381411648641286, - "grad_norm": 2.390625, - "learning_rate": 8.81564070845174e-06, - "loss": 0.05441403, - "memory(GiB)": 75.24, - "step": 950, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.2249920855205519, - "grad_norm": 1.515625, - "learning_rate": 8.803652265327264e-06, - "loss": 0.05460416, - "memory(GiB)": 75.24, - "step": 955, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.22617005455469089, - "grad_norm": 1.8046875, - "learning_rate": 8.79161169144946e-06, - "loss": 0.05041842, - "memory(GiB)": 75.24, - "step": 960, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.2273480235888299, - "grad_norm": 1.9140625, - "learning_rate": 8.779519151839773e-06, - "loss": 0.04634477, - "memory(GiB)": 75.24, - "step": 965, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.22852599262296894, - "grad_norm": 1.703125, - "learning_rate": 8.767374812231859e-06, - "loss": 0.05057756, - "memory(GiB)": 75.24, - "step": 970, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.22970396165710794, - "grad_norm": 1.78125, - "learning_rate": 8.755178839069318e-06, - "loss": 0.0483914, - "memory(GiB)": 75.24, - "step": 975, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.23088193069124696, - "grad_norm": 2.59375, - "learning_rate": 8.742931399503408e-06, - "loss": 0.05196972, - "memory(GiB)": 75.24, - "step": 980, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.23205989972538596, - "grad_norm": 1.765625, - "learning_rate": 8.73063266139076e-06, - "loss": 0.05406979, - "memory(GiB)": 75.24, - "step": 985, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.23323786875952499, - "grad_norm": 1.8671875, - "learning_rate": 8.718282793291075e-06, - "loss": 0.05768092, - "memory(GiB)": 75.24, - "step": 990, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.234415837793664, - "grad_norm": 2.09375, - "learning_rate": 8.705881964464808e-06, - "loss": 0.0532176, - "memory(GiB)": 75.24, - "step": 995, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.235593806827803, - "grad_norm": 2.046875, - "learning_rate": 8.693430344870861e-06, - "loss": 0.04698043, - "memory(GiB)": 75.24, - "step": 1000, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.23677177586194204, - "grad_norm": 2.0, - "learning_rate": 8.680928105164241e-06, - "loss": 0.05811558, - "memory(GiB)": 75.24, - "step": 1005, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.23794974489608103, - "grad_norm": 1.6640625, - "learning_rate": 8.668375416693728e-06, - "loss": 0.05066132, - "memory(GiB)": 75.24, - "step": 1010, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.23912771393022006, - "grad_norm": 2.484375, - "learning_rate": 8.655772451499523e-06, - "loss": 0.06041561, - "memory(GiB)": 75.24, - "step": 1015, - "train_speed(iter/s)": 0.018438 - }, - { - "epoch": 0.24030568296435909, - "grad_norm": 2.234375, - "learning_rate": 8.643119382310896e-06, - "loss": 0.05908003, - "memory(GiB)": 75.24, - "step": 1020, - "train_speed(iter/s)": 0.018439 - }, - { - "epoch": 0.24148365199849808, - "grad_norm": 1.4765625, - "learning_rate": 8.630416382543811e-06, - "loss": 0.04886116, - "memory(GiB)": 75.24, - "step": 1025, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.2426616210326371, - "grad_norm": 1.9375, - "learning_rate": 8.617663626298554e-06, - "loss": 0.05207675, - "memory(GiB)": 75.24, - "step": 1030, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.2438395900667761, - "grad_norm": 1.9921875, - "learning_rate": 8.604861288357345e-06, - "loss": 0.05472702, - "memory(GiB)": 75.24, - "step": 1035, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.24501755910091513, - "grad_norm": 1.8046875, - "learning_rate": 8.592009544181946e-06, - "loss": 0.05440549, - "memory(GiB)": 75.24, - "step": 1040, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.24619552813505416, - "grad_norm": 2.390625, - "learning_rate": 8.57910856991125e-06, - "loss": 0.06334137, - "memory(GiB)": 75.24, - "step": 1045, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.24737349716919316, - "grad_norm": 1.8125, - "learning_rate": 8.566158542358873e-06, - "loss": 0.04818487, - "memory(GiB)": 75.24, - "step": 1050, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.24855146620333218, - "grad_norm": 2.171875, - "learning_rate": 8.553159639010728e-06, - "loss": 0.0541147, - "memory(GiB)": 75.24, - "step": 1055, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.24972943523747118, - "grad_norm": 2.078125, - "learning_rate": 8.540112038022588e-06, - "loss": 0.05287529, - "memory(GiB)": 75.24, - "step": 1060, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.25090740427161023, - "grad_norm": 1.65625, - "learning_rate": 8.527015918217656e-06, - "loss": 0.04908448, - "memory(GiB)": 75.24, - "step": 1065, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.2520853733057492, - "grad_norm": 2.015625, - "learning_rate": 8.513871459084104e-06, - "loss": 0.05231723, - "memory(GiB)": 75.24, - "step": 1070, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.25326334233988823, - "grad_norm": 2.0, - "learning_rate": 8.500678840772616e-06, - "loss": 0.05353705, - "memory(GiB)": 75.24, - "step": 1075, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.25444131137402726, - "grad_norm": 2.015625, - "learning_rate": 8.487438244093914e-06, - "loss": 0.0578441, - "memory(GiB)": 75.24, - "step": 1080, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.2556192804081663, - "grad_norm": 1.4765625, - "learning_rate": 8.474149850516297e-06, - "loss": 0.049189, - "memory(GiB)": 75.24, - "step": 1085, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.2567972494423053, - "grad_norm": 2.671875, - "learning_rate": 8.46081384216313e-06, - "loss": 0.05135356, - "memory(GiB)": 75.24, - "step": 1090, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.2579752184764443, - "grad_norm": 1.8984375, - "learning_rate": 8.447430401810365e-06, - "loss": 0.05644605, - "memory(GiB)": 75.24, - "step": 1095, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.2591531875105833, - "grad_norm": 2.515625, - "learning_rate": 8.433999712884033e-06, - "loss": 0.05051196, - "memory(GiB)": 75.24, - "step": 1100, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.26033115654472233, - "grad_norm": 1.7421875, - "learning_rate": 8.420521959457722e-06, - "loss": 0.04916621, - "memory(GiB)": 75.24, - "step": 1105, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.26150912557886136, - "grad_norm": 1.8203125, - "learning_rate": 8.406997326250064e-06, - "loss": 0.0479083, - "memory(GiB)": 75.24, - "step": 1110, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.2626870946130004, - "grad_norm": 2.078125, - "learning_rate": 8.393425998622197e-06, - "loss": 0.05451037, - "memory(GiB)": 75.24, - "step": 1115, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.26386506364713935, - "grad_norm": 2.296875, - "learning_rate": 8.37980816257523e-06, - "loss": 0.05726813, - "memory(GiB)": 75.24, - "step": 1120, - "train_speed(iter/s)": 0.018438 - }, - { - "epoch": 0.2650430326812784, - "grad_norm": 2.421875, - "learning_rate": 8.366144004747692e-06, - "loss": 0.06053632, - "memory(GiB)": 75.24, - "step": 1125, - "train_speed(iter/s)": 0.018438 - }, - { - "epoch": 0.2662210017154174, - "grad_norm": 1.4921875, - "learning_rate": 8.352433712412961e-06, - "loss": 0.04834349, - "memory(GiB)": 75.24, - "step": 1130, - "train_speed(iter/s)": 0.018439 - }, - { - "epoch": 0.26739897074955643, - "grad_norm": 2.15625, - "learning_rate": 8.338677473476726e-06, - "loss": 0.05667044, - "memory(GiB)": 75.24, - "step": 1135, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.26857693978369546, - "grad_norm": 1.7890625, - "learning_rate": 8.324875476474382e-06, - "loss": 0.05549613, - "memory(GiB)": 75.24, - "step": 1140, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.2697549088178344, - "grad_norm": 1.6953125, - "learning_rate": 8.311027910568463e-06, - "loss": 0.04905889, - "memory(GiB)": 75.24, - "step": 1145, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.27093287785197345, - "grad_norm": 2.125, - "learning_rate": 8.297134965546047e-06, - "loss": 0.05011801, - "memory(GiB)": 75.24, - "step": 1150, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.2721108468861125, - "grad_norm": 1.953125, - "learning_rate": 8.28319683181615e-06, - "loss": 0.04972631, - "memory(GiB)": 75.24, - "step": 1155, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.2732888159202515, - "grad_norm": 2.390625, - "learning_rate": 8.269213700407124e-06, - "loss": 0.05374957, - "memory(GiB)": 75.24, - "step": 1160, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.27446678495439053, - "grad_norm": 3.65625, - "learning_rate": 8.255185762964027e-06, - "loss": 0.05927312, - "memory(GiB)": 75.24, - "step": 1165, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.2756447539885295, - "grad_norm": 2.15625, - "learning_rate": 8.241113211746008e-06, - "loss": 0.04802802, - "memory(GiB)": 75.24, - "step": 1170, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.2768227230226685, - "grad_norm": 2.109375, - "learning_rate": 8.226996239623668e-06, - "loss": 0.05518162, - "memory(GiB)": 75.24, - "step": 1175, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.27800069205680755, - "grad_norm": 2.25, - "learning_rate": 8.212835040076416e-06, - "loss": 0.05104474, - "memory(GiB)": 75.24, - "step": 1180, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.2791786610909466, - "grad_norm": 2.03125, - "learning_rate": 8.19862980718982e-06, - "loss": 0.0574605, - "memory(GiB)": 75.24, - "step": 1185, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.2803566301250856, - "grad_norm": 2.296875, - "learning_rate": 8.184380735652937e-06, - "loss": 0.05337468, - "memory(GiB)": 75.24, - "step": 1190, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.2815345991592246, - "grad_norm": 1.7421875, - "learning_rate": 8.17008802075566e-06, - "loss": 0.04882863, - "memory(GiB)": 75.24, - "step": 1195, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.2827125681933636, - "grad_norm": 1.84375, - "learning_rate": 8.155751858386031e-06, - "loss": 0.04774918, - "memory(GiB)": 75.24, - "step": 1200, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.2838905372275026, - "grad_norm": 2.109375, - "learning_rate": 8.14137244502756e-06, - "loss": 0.06021355, - "memory(GiB)": 75.24, - "step": 1205, - "train_speed(iter/s)": 0.018437 - }, - { - "epoch": 0.28506850626164165, - "grad_norm": 1.7421875, - "learning_rate": 8.126949977756527e-06, - "loss": 0.04749005, - "memory(GiB)": 75.24, - "step": 1210, - "train_speed(iter/s)": 0.018438 - }, - { - "epoch": 0.2862464752957807, - "grad_norm": 1.90625, - "learning_rate": 8.112484654239292e-06, - "loss": 0.05273245, - "memory(GiB)": 75.24, - "step": 1215, - "train_speed(iter/s)": 0.018439 - }, - { - "epoch": 0.2874244443299197, - "grad_norm": 2.75, - "learning_rate": 8.097976672729571e-06, - "loss": 0.05524676, - "memory(GiB)": 75.24, - "step": 1220, - "train_speed(iter/s)": 0.01844 - }, - { - "epoch": 0.2886024133640587, - "grad_norm": 2.171875, - "learning_rate": 8.083426232065737e-06, - "loss": 0.054957, - "memory(GiB)": 75.24, - "step": 1225, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.2897803823981977, - "grad_norm": 1.6796875, - "learning_rate": 8.068833531668071e-06, - "loss": 0.05104529, - "memory(GiB)": 75.24, - "step": 1230, - "train_speed(iter/s)": 0.018441 - }, - { - "epoch": 0.2909583514323367, - "grad_norm": 1.609375, - "learning_rate": 8.054198771536056e-06, - "loss": 0.05857276, - "memory(GiB)": 75.24, - "step": 1235, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.29213632046647575, - "grad_norm": 2.0625, - "learning_rate": 8.039522152245617e-06, - "loss": 0.05216116, - "memory(GiB)": 75.24, - "step": 1240, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.2933142895006148, - "grad_norm": 2.203125, - "learning_rate": 8.024803874946374e-06, - "loss": 0.04643235, - "memory(GiB)": 75.24, - "step": 1245, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.29449225853475375, - "grad_norm": 2.25, - "learning_rate": 8.010044141358892e-06, - "loss": 0.05186967, - "memory(GiB)": 75.24, - "step": 1250, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.2956702275688928, - "grad_norm": 2.375, - "learning_rate": 7.995243153771917e-06, - "loss": 0.04665394, - "memory(GiB)": 75.24, - "step": 1255, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.2968481966030318, - "grad_norm": 1.796875, - "learning_rate": 7.980401115039588e-06, - "loss": 0.05233122, - "memory(GiB)": 75.24, - "step": 1260, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.2980261656371708, - "grad_norm": 2.28125, - "learning_rate": 7.965518228578684e-06, - "loss": 0.05171386, - "memory(GiB)": 75.24, - "step": 1265, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.29920413467130985, - "grad_norm": 2.140625, - "learning_rate": 7.950594698365805e-06, - "loss": 0.05227633, - "memory(GiB)": 75.24, - "step": 1270, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.3003821037054488, - "grad_norm": 1.96875, - "learning_rate": 7.935630728934602e-06, - "loss": 0.05938133, - "memory(GiB)": 75.24, - "step": 1275, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.30156007273958785, - "grad_norm": 1.8671875, - "learning_rate": 7.92062652537296e-06, - "loss": 0.05327083, - "memory(GiB)": 75.24, - "step": 1280, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3027380417737269, - "grad_norm": 2.375, - "learning_rate": 7.905582293320188e-06, - "loss": 0.05497534, - "memory(GiB)": 75.24, - "step": 1285, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.3039160108078659, - "grad_norm": 1.3984375, - "learning_rate": 7.890498238964211e-06, - "loss": 0.05238924, - "memory(GiB)": 75.24, - "step": 1290, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3050939798420049, - "grad_norm": 1.6328125, - "learning_rate": 7.87537456903873e-06, - "loss": 0.0558899, - "memory(GiB)": 75.24, - "step": 1295, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3062719488761439, - "grad_norm": 1.8671875, - "learning_rate": 7.860211490820395e-06, - "loss": 0.0439707, - "memory(GiB)": 75.24, - "step": 1300, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.3074499179102829, - "grad_norm": 1.9453125, - "learning_rate": 7.845009212125968e-06, - "loss": 0.04970224, - "memory(GiB)": 75.24, - "step": 1305, - "train_speed(iter/s)": 0.018442 - }, - { - "epoch": 0.30862788694442195, - "grad_norm": 2.109375, - "learning_rate": 7.82976794130947e-06, - "loss": 0.06191138, - "memory(GiB)": 75.24, - "step": 1310, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.309805855978561, - "grad_norm": 1.8984375, - "learning_rate": 7.814487887259322e-06, - "loss": 0.0583761, - "memory(GiB)": 75.24, - "step": 1315, - "train_speed(iter/s)": 0.018443 - }, - { - "epoch": 0.3109838250127, - "grad_norm": 1.71875, - "learning_rate": 7.799169259395492e-06, - "loss": 0.05760248, - "memory(GiB)": 75.24, - "step": 1320, - "train_speed(iter/s)": 0.018444 - }, - { - "epoch": 0.31216179404683897, - "grad_norm": 1.9921875, - "learning_rate": 7.783812267666617e-06, - "loss": 0.04379511, - "memory(GiB)": 75.24, - "step": 1325, - "train_speed(iter/s)": 0.018445 - }, - { - "epoch": 0.313339763080978, - "grad_norm": 2.0, - "learning_rate": 7.768417122547131e-06, - "loss": 0.0493092, - "memory(GiB)": 75.24, - "step": 1330, - "train_speed(iter/s)": 0.018446 - }, - { - "epoch": 0.314517732115117, - "grad_norm": 2.296875, - "learning_rate": 7.752984035034372e-06, - "loss": 0.04375547, - "memory(GiB)": 75.24, - "step": 1335, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.31569570114925605, - "grad_norm": 6.0, - "learning_rate": 7.737513216645699e-06, - "loss": 0.04486598, - "memory(GiB)": 75.24, - "step": 1340, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3168736701833951, - "grad_norm": 1.921875, - "learning_rate": 7.722004879415587e-06, - "loss": 0.0507314, - "memory(GiB)": 75.24, - "step": 1345, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.31805163921753404, - "grad_norm": 3.828125, - "learning_rate": 7.706459235892729e-06, - "loss": 0.05101304, - "memory(GiB)": 75.24, - "step": 1350, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.31922960825167307, - "grad_norm": 1.734375, - "learning_rate": 7.690876499137109e-06, - "loss": 0.04622986, - "memory(GiB)": 75.24, - "step": 1355, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3204075772858121, - "grad_norm": 1.515625, - "learning_rate": 7.675256882717097e-06, - "loss": 0.05373204, - "memory(GiB)": 75.24, - "step": 1360, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.3215855463199511, - "grad_norm": 2.0625, - "learning_rate": 7.659600600706514e-06, - "loss": 0.048058, - "memory(GiB)": 75.24, - "step": 1365, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.32276351535409015, - "grad_norm": 3.1875, - "learning_rate": 7.643907867681695e-06, - "loss": 0.04943306, - "memory(GiB)": 75.24, - "step": 1370, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.3239414843882291, - "grad_norm": 1.6953125, - "learning_rate": 7.628178898718559e-06, - "loss": 0.04682907, - "memory(GiB)": 75.24, - "step": 1375, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.32511945342236814, - "grad_norm": 2.234375, - "learning_rate": 7.612413909389651e-06, - "loss": 0.04913375, - "memory(GiB)": 75.24, - "step": 1380, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.32629742245650717, - "grad_norm": 2.375, - "learning_rate": 7.596613115761189e-06, - "loss": 0.04683136, - "memory(GiB)": 75.24, - "step": 1385, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.3274753914906462, - "grad_norm": 1.6953125, - "learning_rate": 7.580776734390112e-06, - "loss": 0.05235771, - "memory(GiB)": 75.24, - "step": 1390, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.3286533605247852, - "grad_norm": 2.625, - "learning_rate": 7.564904982321097e-06, - "loss": 0.05894827, - "memory(GiB)": 75.24, - "step": 1395, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.32983132955892425, - "grad_norm": 2.0, - "learning_rate": 7.548998077083598e-06, - "loss": 0.05407764, - "memory(GiB)": 75.24, - "step": 1400, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.3310092985930632, - "grad_norm": 2.234375, - "learning_rate": 7.533056236688856e-06, - "loss": 0.05440799, - "memory(GiB)": 75.24, - "step": 1405, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.33218726762720224, - "grad_norm": 1.875, - "learning_rate": 7.517079679626914e-06, - "loss": 0.04769931, - "memory(GiB)": 75.24, - "step": 1410, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.33336523666134127, - "grad_norm": 2.8125, - "learning_rate": 7.501068624863622e-06, - "loss": 0.06096817, - "memory(GiB)": 75.24, - "step": 1415, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3345432056954803, - "grad_norm": 2.09375, - "learning_rate": 7.485023291837641e-06, - "loss": 0.05072919, - "memory(GiB)": 75.24, - "step": 1420, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3357211747296193, - "grad_norm": 1.921875, - "learning_rate": 7.468943900457423e-06, - "loss": 0.05104024, - "memory(GiB)": 75.24, - "step": 1425, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3368991437637583, - "grad_norm": 1.703125, - "learning_rate": 7.4528306710982135e-06, - "loss": 0.05378548, - "memory(GiB)": 75.24, - "step": 1430, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.3380771127978973, - "grad_norm": 1.796875, - "learning_rate": 7.436683824599015e-06, - "loss": 0.05540187, - "memory(GiB)": 75.24, - "step": 1435, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.33925508183203634, - "grad_norm": 2.25, - "learning_rate": 7.4205035822595716e-06, - "loss": 0.05696809, - "memory(GiB)": 75.24, - "step": 1440, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.34043305086617537, - "grad_norm": 1.8671875, - "learning_rate": 7.404290165837335e-06, - "loss": 0.05431355, - "memory(GiB)": 75.24, - "step": 1445, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.3416110199003144, - "grad_norm": 1.7421875, - "learning_rate": 7.388043797544415e-06, - "loss": 0.04478872, - "memory(GiB)": 75.24, - "step": 1450, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.34278898893445336, - "grad_norm": 1.9609375, - "learning_rate": 7.37176470004455e-06, - "loss": 0.04893782, - "memory(GiB)": 75.24, - "step": 1455, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.3439669579685924, - "grad_norm": 1.671875, - "learning_rate": 7.3554530964500416e-06, - "loss": 0.05598937, - "memory(GiB)": 75.24, - "step": 1460, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.3451449270027314, - "grad_norm": 2.453125, - "learning_rate": 7.339109210318704e-06, - "loss": 0.05856699, - "memory(GiB)": 75.24, - "step": 1465, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.34632289603687044, - "grad_norm": 2.28125, - "learning_rate": 7.322733265650793e-06, - "loss": 0.04954901, - "memory(GiB)": 75.24, - "step": 1470, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.34750086507100947, - "grad_norm": 2.046875, - "learning_rate": 7.30632548688595e-06, - "loss": 0.06270002, - "memory(GiB)": 75.24, - "step": 1475, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.34867883410514844, - "grad_norm": 2.03125, - "learning_rate": 7.289886098900112e-06, - "loss": 0.04866944, - "memory(GiB)": 75.24, - "step": 1480, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.34985680313928746, - "grad_norm": 1.7421875, - "learning_rate": 7.273415327002431e-06, - "loss": 0.04686972, - "memory(GiB)": 75.24, - "step": 1485, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.3510347721734265, - "grad_norm": 1.6484375, - "learning_rate": 7.256913396932196e-06, - "loss": 0.05006968, - "memory(GiB)": 75.24, - "step": 1490, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.3522127412075655, - "grad_norm": 1.9921875, - "learning_rate": 7.240380534855729e-06, - "loss": 0.0498129, - "memory(GiB)": 75.24, - "step": 1495, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.35339071024170454, - "grad_norm": 1.953125, - "learning_rate": 7.223816967363289e-06, - "loss": 0.0470607, - "memory(GiB)": 75.24, - "step": 1500, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.3545686792758435, - "grad_norm": 1.9453125, - "learning_rate": 7.207222921465966e-06, - "loss": 0.04950109, - "memory(GiB)": 75.24, - "step": 1505, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.35574664830998254, - "grad_norm": 1.703125, - "learning_rate": 7.190598624592575e-06, - "loss": 0.04311339, - "memory(GiB)": 75.24, - "step": 1510, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.35692461734412156, - "grad_norm": 1.84375, - "learning_rate": 7.173944304586529e-06, - "loss": 0.04974242, - "memory(GiB)": 75.24, - "step": 1515, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3581025863782606, - "grad_norm": 1.484375, - "learning_rate": 7.15726018970272e-06, - "loss": 0.04702036, - "memory(GiB)": 75.24, - "step": 1520, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3592805554123996, - "grad_norm": 2.15625, - "learning_rate": 7.140546508604399e-06, - "loss": 0.0517059, - "memory(GiB)": 75.24, - "step": 1525, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3604585244465386, - "grad_norm": 2.390625, - "learning_rate": 7.12380349036003e-06, - "loss": 0.04891534, - "memory(GiB)": 75.24, - "step": 1530, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.3616364934806776, - "grad_norm": 2.703125, - "learning_rate": 7.107031364440157e-06, - "loss": 0.05355787, - "memory(GiB)": 75.24, - "step": 1535, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.36281446251481664, - "grad_norm": 2.140625, - "learning_rate": 7.090230360714254e-06, - "loss": 0.05681676, - "memory(GiB)": 75.24, - "step": 1540, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.36399243154895566, - "grad_norm": 2.359375, - "learning_rate": 7.073400709447587e-06, - "loss": 0.05522673, - "memory(GiB)": 75.24, - "step": 1545, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3651704005830947, - "grad_norm": 2.015625, - "learning_rate": 7.056542641298042e-06, - "loss": 0.04967948, - "memory(GiB)": 75.24, - "step": 1550, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.36634836961723366, - "grad_norm": 2.25, - "learning_rate": 7.039656387312973e-06, - "loss": 0.05282811, - "memory(GiB)": 75.24, - "step": 1555, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.3675263386513727, - "grad_norm": 1.859375, - "learning_rate": 7.022742178926034e-06, - "loss": 0.04866407, - "memory(GiB)": 75.24, - "step": 1560, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.3687043076855117, - "grad_norm": 2.359375, - "learning_rate": 7.005800247954005e-06, - "loss": 0.05169091, - "memory(GiB)": 75.24, - "step": 1565, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.36988227671965074, - "grad_norm": 1.953125, - "learning_rate": 6.98883082659362e-06, - "loss": 0.05216286, - "memory(GiB)": 75.24, - "step": 1570, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.37106024575378976, - "grad_norm": 1.7734375, - "learning_rate": 6.971834147418377e-06, - "loss": 0.05442037, - "memory(GiB)": 75.24, - "step": 1575, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.3722382147879288, - "grad_norm": 1.7421875, - "learning_rate": 6.954810443375356e-06, - "loss": 0.05119426, - "memory(GiB)": 75.24, - "step": 1580, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.37341618382206776, - "grad_norm": 1.59375, - "learning_rate": 6.937759947782027e-06, - "loss": 0.05131354, - "memory(GiB)": 75.24, - "step": 1585, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.3745941528562068, - "grad_norm": 1.640625, - "learning_rate": 6.920682894323046e-06, - "loss": 0.05248099, - "memory(GiB)": 75.24, - "step": 1590, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.3757721218903458, - "grad_norm": 1.7109375, - "learning_rate": 6.903579517047061e-06, - "loss": 0.04946129, - "memory(GiB)": 75.24, - "step": 1595, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.37695009092448484, - "grad_norm": 1.796875, - "learning_rate": 6.886450050363496e-06, - "loss": 0.05146198, - "memory(GiB)": 75.24, - "step": 1600, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.37812805995862386, - "grad_norm": 2.171875, - "learning_rate": 6.869294729039344e-06, - "loss": 0.05650499, - "memory(GiB)": 75.24, - "step": 1605, - "train_speed(iter/s)": 0.018447 - }, - { - "epoch": 0.37930602899276283, - "grad_norm": 1.578125, - "learning_rate": 6.8521137881959454e-06, - "loss": 0.05240663, - "memory(GiB)": 75.24, - "step": 1610, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.38048399802690186, - "grad_norm": 1.7421875, - "learning_rate": 6.834907463305771e-06, - "loss": 0.0468967, - "memory(GiB)": 75.24, - "step": 1615, - "train_speed(iter/s)": 0.018448 - }, - { - "epoch": 0.3816619670610409, - "grad_norm": 1.8671875, - "learning_rate": 6.817675990189188e-06, - "loss": 0.04780641, - "memory(GiB)": 75.24, - "step": 1620, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.3828399360951799, - "grad_norm": 1.625, - "learning_rate": 6.800419605011232e-06, - "loss": 0.05245537, - "memory(GiB)": 75.24, - "step": 1625, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.38401790512931894, - "grad_norm": 2.53125, - "learning_rate": 6.78313854427837e-06, - "loss": 0.04904989, - "memory(GiB)": 75.24, - "step": 1630, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.3851958741634579, - "grad_norm": 1.7890625, - "learning_rate": 6.76583304483526e-06, - "loss": 0.05040743, - "memory(GiB)": 75.24, - "step": 1635, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.38637384319759693, - "grad_norm": 2.125, - "learning_rate": 6.7485033438615e-06, - "loss": 0.04769071, - "memory(GiB)": 75.24, - "step": 1640, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.38755181223173596, - "grad_norm": 2.34375, - "learning_rate": 6.731149678868385e-06, - "loss": 0.0504717, - "memory(GiB)": 75.24, - "step": 1645, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.388729781265875, - "grad_norm": 1.7578125, - "learning_rate": 6.7137722876956435e-06, - "loss": 0.04594471, - "memory(GiB)": 75.24, - "step": 1650, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.389907750300014, - "grad_norm": 2.015625, - "learning_rate": 6.696371408508185e-06, - "loss": 0.04914819, - "memory(GiB)": 75.24, - "step": 1655, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.391085719334153, - "grad_norm": 2.21875, - "learning_rate": 6.67894727979283e-06, - "loss": 0.05229876, - "memory(GiB)": 75.24, - "step": 1660, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.392263688368292, - "grad_norm": 1.6015625, - "learning_rate": 6.661500140355046e-06, - "loss": 0.05038055, - "memory(GiB)": 75.24, - "step": 1665, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.39344165740243103, - "grad_norm": 1.6484375, - "learning_rate": 6.644030229315674e-06, - "loss": 0.04761205, - "memory(GiB)": 75.24, - "step": 1670, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.39461962643657006, - "grad_norm": 1.96875, - "learning_rate": 6.626537786107647e-06, - "loss": 0.05017205, - "memory(GiB)": 75.24, - "step": 1675, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.3957975954707091, - "grad_norm": 2.03125, - "learning_rate": 6.609023050472712e-06, - "loss": 0.0522933, - "memory(GiB)": 75.24, - "step": 1680, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.39697556450484806, - "grad_norm": 2.015625, - "learning_rate": 6.591486262458146e-06, - "loss": 0.05544751, - "memory(GiB)": 75.24, - "step": 1685, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.3981535335389871, - "grad_norm": 2.5625, - "learning_rate": 6.573927662413462e-06, - "loss": 0.05269616, - "memory(GiB)": 75.24, - "step": 1690, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.3993315025731261, - "grad_norm": 1.6796875, - "learning_rate": 6.556347490987115e-06, - "loss": 0.0480206, - "memory(GiB)": 75.24, - "step": 1695, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.40050947160726513, - "grad_norm": 1.71875, - "learning_rate": 6.53874598912321e-06, - "loss": 0.04850869, - "memory(GiB)": 75.24, - "step": 1700, - "train_speed(iter/s)": 0.018458 - }, - { - "epoch": 0.40168744064140416, - "grad_norm": 2.109375, - "learning_rate": 6.521123398058189e-06, - "loss": 0.04664425, - "memory(GiB)": 75.24, - "step": 1705, - "train_speed(iter/s)": 0.018449 - }, - { - "epoch": 0.40286540967554313, - "grad_norm": 1.640625, - "learning_rate": 6.503479959317536e-06, - "loss": 0.05617427, - "memory(GiB)": 75.24, - "step": 1710, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.40404337870968215, - "grad_norm": 1.421875, - "learning_rate": 6.485815914712461e-06, - "loss": 0.04710643, - "memory(GiB)": 75.24, - "step": 1715, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.4052213477438212, - "grad_norm": 2.0, - "learning_rate": 6.468131506336584e-06, - "loss": 0.05491562, - "memory(GiB)": 75.24, - "step": 1720, - "train_speed(iter/s)": 0.01845 - }, - { - "epoch": 0.4063993167779602, - "grad_norm": 1.7578125, - "learning_rate": 6.450426976562623e-06, - "loss": 0.04522764, - "memory(GiB)": 75.24, - "step": 1725, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.40757728581209923, - "grad_norm": 1.8671875, - "learning_rate": 6.4327025680390646e-06, - "loss": 0.0509581, - "memory(GiB)": 75.24, - "step": 1730, - "train_speed(iter/s)": 0.018451 - }, - { - "epoch": 0.4087552548462382, - "grad_norm": 2.5625, - "learning_rate": 6.414958523686845e-06, - "loss": 0.05455146, - "memory(GiB)": 75.24, - "step": 1735, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.40993322388037723, - "grad_norm": 1.609375, - "learning_rate": 6.397195086696021e-06, - "loss": 0.04453045, - "memory(GiB)": 75.24, - "step": 1740, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.41111119291451625, - "grad_norm": 2.984375, - "learning_rate": 6.379412500522427e-06, - "loss": 0.04473322, - "memory(GiB)": 75.24, - "step": 1745, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.4122891619486553, - "grad_norm": 1.609375, - "learning_rate": 6.361611008884353e-06, - "loss": 0.05007306, - "memory(GiB)": 75.24, - "step": 1750, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.4134671309827943, - "grad_norm": 1.984375, - "learning_rate": 6.343790855759191e-06, - "loss": 0.04592868, - "memory(GiB)": 75.24, - "step": 1755, - "train_speed(iter/s)": 0.018452 - }, - { - "epoch": 0.41464510001693333, - "grad_norm": 1.9140625, - "learning_rate": 6.3259522853801015e-06, - "loss": 0.04621382, - "memory(GiB)": 75.24, - "step": 1760, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.4158230690510723, - "grad_norm": 1.8671875, - "learning_rate": 6.3080955422326595e-06, - "loss": 0.04989561, - "memory(GiB)": 75.24, - "step": 1765, - "train_speed(iter/s)": 0.018453 - }, - { - "epoch": 0.41700103808521133, - "grad_norm": 1.5390625, - "learning_rate": 6.290220871051506e-06, - "loss": 0.05395367, - "memory(GiB)": 75.24, - "step": 1770, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.41817900711935035, - "grad_norm": 1.8671875, - "learning_rate": 6.272328516816993e-06, - "loss": 0.04335098, - "memory(GiB)": 75.24, - "step": 1775, - "train_speed(iter/s)": 0.018454 - }, - { - "epoch": 0.4193569761534894, - "grad_norm": 1.671875, - "learning_rate": 6.2544187247518285e-06, - "loss": 0.05292164, - "memory(GiB)": 75.24, - "step": 1780, - "train_speed(iter/s)": 0.018455 - }, - { - "epoch": 0.4205349451876284, - "grad_norm": 1.609375, - "learning_rate": 6.2364917403177115e-06, - "loss": 0.04423888, - "memory(GiB)": 75.24, - "step": 1785, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.4217129142217674, - "grad_norm": 2.140625, - "learning_rate": 6.218547809211973e-06, - "loss": 0.04801815, - "memory(GiB)": 75.24, - "step": 1790, - "train_speed(iter/s)": 0.018456 - }, - { - "epoch": 0.4228908832559064, - "grad_norm": 2.15625, - "learning_rate": 6.200587177364204e-06, - "loss": 0.0498428, - "memory(GiB)": 75.24, - "step": 1795, - "train_speed(iter/s)": 0.018457 - }, - { - "epoch": 0.42406885229004543, - "grad_norm": 2.421875, - "learning_rate": 6.1826100909328855e-06, - "loss": 0.05989224, - "memory(GiB)": 75.24, - "step": 1800, - "train_speed(iter/s)": 0.018457 + "train_speed(iter/s)": 0.003347 } ], "logging_steps": 5, - "max_steps": 4244, + "max_steps": 1061, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -3275,8 +575,8 @@ "attributes": {} } }, - "total_flos": 1.9948649883254129e+18, - "train_batch_size": 4, + "total_flos": 1.6327530207541985e+18, + "train_batch_size": 2, "trial_name": null, "trial_params": null }