diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5428 +1,748 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.3533881084901493, + "epoch": 0.09423752273112121, "eval_steps": 100000, - "global_step": 3000, + "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0001177960361633831, - "grad_norm": 18.875, + "epoch": 0.000235593806827803, + "grad_norm": 103.0, "learning_rate": 1e-05, - "loss": 0.42521548, - "memory(GiB)": 63.39, + "loss": 0.7800231, + "memory(GiB)": 63.62, "step": 1, - "train_speed(iter/s)": 0.060823 + "train_speed(iter/s)": 0.015931 }, { - "epoch": 0.0005889801808169155, - "grad_norm": 5.6875, - "learning_rate": 9.999994520399261e-06, - "loss": 0.22830066, - "memory(GiB)": 74.41, + "epoch": 0.001177969034139015, + "grad_norm": 5.8125, + "learning_rate": 9.99997807127629e-06, + "loss": 0.41946995, + "memory(GiB)": 75.24, "step": 5, - "train_speed(iter/s)": 0.071086 + "train_speed(iter/s)": 0.017972 }, { - "epoch": 0.001177960361633831, - "grad_norm": 5.40625, - "learning_rate": 9.999972259541841e-06, - "loss": 0.16523392, - "memory(GiB)": 74.41, + "epoch": 0.00235593806827803, + "grad_norm": 2.703125, + "learning_rate": 9.999888986165874e-06, + "loss": 0.0869894, + "memory(GiB)": 75.24, "step": 10, - "train_speed(iter/s)": 0.073351 + "train_speed(iter/s)": 0.018238 }, { - "epoch": 0.0017669405424507465, - "grad_norm": 3.8125, - "learning_rate": 9.999932875028875e-06, - "loss": 0.12637961, - "memory(GiB)": 74.41, + "epoch": 0.003533907102417045, + "grad_norm": 2.140625, + "learning_rate": 9.99973137534353e-06, + "loss": 0.06987351, + "memory(GiB)": 75.24, "step": 15, - "train_speed(iter/s)": 0.073784 + "train_speed(iter/s)": 0.018317 }, { - "epoch": 0.002355920723267662, - "grad_norm": 5.15625, - "learning_rate": 9.999876366995242e-06, - "loss": 0.14416599, - "memory(GiB)": 74.41, + "epoch": 0.00471187613655606, + "grad_norm": 2.515625, + "learning_rate": 9.999505240969388e-06, + "loss": 0.0606461, + "memory(GiB)": 75.24, "step": 20, - "train_speed(iter/s)": 0.074041 + "train_speed(iter/s)": 0.01837 }, { - "epoch": 0.0029449009040845775, - "grad_norm": 4.46875, - "learning_rate": 9.999802735634472e-06, - "loss": 0.12908924, - "memory(GiB)": 74.41, + "epoch": 0.005889845170695076, + "grad_norm": 2.4375, + "learning_rate": 9.999210586142718e-06, + "loss": 0.06591458, + "memory(GiB)": 75.24, "step": 25, - "train_speed(iter/s)": 0.074266 + "train_speed(iter/s)": 0.018407 }, { - "epoch": 0.003533881084901493, - "grad_norm": 5.03125, - "learning_rate": 9.99971198119873e-06, - "loss": 0.15117922, - "memory(GiB)": 74.41, + "epoch": 0.00706781420483409, + "grad_norm": 2.8125, + "learning_rate": 9.998847414901898e-06, + "loss": 0.06059705, + "memory(GiB)": 75.24, "step": 30, - "train_speed(iter/s)": 0.074446 + "train_speed(iter/s)": 0.018432 }, { - "epoch": 0.0041228612657184084, - "grad_norm": 5.53125, - "learning_rate": 9.99960410399883e-06, - "loss": 0.1358834, - "memory(GiB)": 74.41, + "epoch": 0.008245783238973105, + "grad_norm": 1.9921875, + "learning_rate": 9.998415732224352e-06, + "loss": 0.06047676, + "memory(GiB)": 75.24, "step": 35, - "train_speed(iter/s)": 0.074609 + "train_speed(iter/s)": 0.018453 }, { - "epoch": 0.004711841446535324, - "grad_norm": 4.28125, - "learning_rate": 9.999479104404225e-06, - "loss": 0.13182677, - "memory(GiB)": 74.41, + "epoch": 0.00942375227311212, + "grad_norm": 1.921875, + "learning_rate": 9.997915544026483e-06, + "loss": 0.06190881, + "memory(GiB)": 75.24, "step": 40, - "train_speed(iter/s)": 0.074644 + "train_speed(iter/s)": 0.018469 }, { - "epoch": 0.005300821627352239, - "grad_norm": 6.25, - "learning_rate": 9.999336982843003e-06, - "loss": 0.14286679, - "memory(GiB)": 74.41, + "epoch": 0.010601721307251136, + "grad_norm": 1.859375, + "learning_rate": 9.997346857163591e-06, + "loss": 0.05765554, + "memory(GiB)": 75.24, "step": 45, - "train_speed(iter/s)": 0.074647 + "train_speed(iter/s)": 0.018482 }, { - "epoch": 0.005889801808169155, - "grad_norm": 4.09375, - "learning_rate": 9.999177739801898e-06, - "loss": 0.12088619, - "memory(GiB)": 74.41, + "epoch": 0.011779690341390151, + "grad_norm": 2.5625, + "learning_rate": 9.99670967942979e-06, + "loss": 0.0662235, + "memory(GiB)": 75.24, "step": 50, - "train_speed(iter/s)": 0.074676 + "train_speed(iter/s)": 0.01849 }, { - "epoch": 0.00647878198898607, - "grad_norm": 5.1875, - "learning_rate": 9.999001375826279e-06, - "loss": 0.11520904, - "memory(GiB)": 74.41, + "epoch": 0.012957659375529167, + "grad_norm": 2.390625, + "learning_rate": 9.996004019557879e-06, + "loss": 0.06362078, + "memory(GiB)": 75.24, "step": 55, - "train_speed(iter/s)": 0.074701 + "train_speed(iter/s)": 0.0185 }, { - "epoch": 0.007067762169802986, - "grad_norm": 4.09375, - "learning_rate": 9.998807891520147e-06, - "loss": 0.11755368, - "memory(GiB)": 74.41, + "epoch": 0.01413562840966818, + "grad_norm": 2.875, + "learning_rate": 9.995229887219246e-06, + "loss": 0.06171583, + "memory(GiB)": 75.24, "step": 60, - "train_speed(iter/s)": 0.07475 + "train_speed(iter/s)": 0.018512 }, { - "epoch": 0.007656742350619901, - "grad_norm": 5.21875, - "learning_rate": 9.998597287546135e-06, - "loss": 0.1323318, - "memory(GiB)": 74.41, + "epoch": 0.015313597443807196, + "grad_norm": 2.109375, + "learning_rate": 9.99438729302372e-06, + "loss": 0.06211852, + "memory(GiB)": 75.24, "step": 65, - "train_speed(iter/s)": 0.074786 + "train_speed(iter/s)": 0.018519 }, { - "epoch": 0.008245722531436817, - "grad_norm": 4.53125, - "learning_rate": 9.998369564625513e-06, - "loss": 0.11528516, - "memory(GiB)": 74.41, + "epoch": 0.01649156647794621, + "grad_norm": 1.828125, + "learning_rate": 9.993476248519429e-06, + "loss": 0.06484153, + "memory(GiB)": 75.24, "step": 70, - "train_speed(iter/s)": 0.074767 + "train_speed(iter/s)": 0.01852 }, { - "epoch": 0.008834702712253733, - "grad_norm": 4.625, - "learning_rate": 9.998124723538172e-06, - "loss": 0.1128188, - "memory(GiB)": 74.41, + "epoch": 0.017669535512085225, + "grad_norm": 1.90625, + "learning_rate": 9.992496766192645e-06, + "loss": 0.06099743, + "memory(GiB)": 75.24, "step": 75, - "train_speed(iter/s)": 0.074767 + "train_speed(iter/s)": 0.018526 }, { - "epoch": 0.009423682893070648, - "grad_norm": 4.96875, - "learning_rate": 9.997862765122634e-06, - "loss": 0.12012781, - "memory(GiB)": 74.41, + "epoch": 0.01884750454622424, + "grad_norm": 1.796875, + "learning_rate": 9.991448859467611e-06, + "loss": 0.05843818, + "memory(GiB)": 75.24, "step": 80, - "train_speed(iter/s)": 0.074792 + "train_speed(iter/s)": 0.018543 }, { - "epoch": 0.010012663073887564, - "grad_norm": 4.59375, - "learning_rate": 9.99758369027604e-06, - "loss": 0.13530364, - "memory(GiB)": 74.41, + "epoch": 0.020025473580363256, + "grad_norm": 1.8203125, + "learning_rate": 9.99033254270636e-06, + "loss": 0.05953899, + "memory(GiB)": 75.24, "step": 85, - "train_speed(iter/s)": 0.074854 + "train_speed(iter/s)": 0.018546 }, { - "epoch": 0.010601643254704479, - "grad_norm": 5.0, - "learning_rate": 9.997287499954151e-06, - "loss": 0.13009607, - "memory(GiB)": 74.41, + "epoch": 0.02120344261450227, + "grad_norm": 1.9609375, + "learning_rate": 9.989147831208508e-06, + "loss": 0.06501681, + "memory(GiB)": 75.24, "step": 90, - "train_speed(iter/s)": 0.074907 + "train_speed(iter/s)": 0.018554 }, { - "epoch": 0.011190623435521395, - "grad_norm": 4.65625, - "learning_rate": 9.996974195171348e-06, - "loss": 0.13199264, - "memory(GiB)": 74.41, + "epoch": 0.022381411648641287, + "grad_norm": 2.609375, + "learning_rate": 9.987894741211056e-06, + "loss": 0.06521546, + "memory(GiB)": 75.24, "step": 95, - "train_speed(iter/s)": 0.074953 + "train_speed(iter/s)": 0.01856 }, { - "epoch": 0.01177960361633831, - "grad_norm": 5.6875, - "learning_rate": 9.996643777000617e-06, - "loss": 0.13707412, - "memory(GiB)": 74.41, + "epoch": 0.023559380682780302, + "grad_norm": 2.046875, + "learning_rate": 9.986573289888164e-06, + "loss": 0.06153967, + "memory(GiB)": 75.24, "step": 100, - "train_speed(iter/s)": 0.074985 + "train_speed(iter/s)": 0.018562 }, { - "epoch": 0.012368583797155226, - "grad_norm": 4.75, - "learning_rate": 9.996296246573558e-06, - "loss": 0.10490649, - "memory(GiB)": 74.41, + "epoch": 0.024737349716919318, + "grad_norm": 2.109375, + "learning_rate": 9.98518349535091e-06, + "loss": 0.07089446, + "memory(GiB)": 75.24, "step": 105, - "train_speed(iter/s)": 0.074991 + "train_speed(iter/s)": 0.018452 }, { - "epoch": 0.01295756397797214, - "grad_norm": 3.9375, - "learning_rate": 9.99593160508038e-06, - "loss": 0.11323662, - "memory(GiB)": 74.41, + "epoch": 0.025915318751058333, + "grad_norm": 1.7578125, + "learning_rate": 9.98372537664705e-06, + "loss": 0.05478874, + "memory(GiB)": 75.24, "step": 110, - "train_speed(iter/s)": 0.075013 + "train_speed(iter/s)": 0.018463 }, { - "epoch": 0.013546544158789057, - "grad_norm": 4.75, - "learning_rate": 9.995549853769887e-06, - "loss": 0.13000023, - "memory(GiB)": 74.41, + "epoch": 0.027093287785197345, + "grad_norm": 2.9375, + "learning_rate": 9.982198953760752e-06, + "loss": 0.06532571, + "memory(GiB)": 75.24, "step": 115, - "train_speed(iter/s)": 0.075041 + "train_speed(iter/s)": 0.018473 }, { - "epoch": 0.014135524339605972, - "grad_norm": 3.6875, - "learning_rate": 9.995150993949479e-06, - "loss": 0.109146, - "memory(GiB)": 74.41, + "epoch": 0.02827125681933636, + "grad_norm": 2.234375, + "learning_rate": 9.980604247612325e-06, + "loss": 0.06488043, + "memory(GiB)": 75.24, "step": 120, - "train_speed(iter/s)": 0.075048 + "train_speed(iter/s)": 0.018478 }, { - "epoch": 0.014724504520422888, - "grad_norm": 6.625, - "learning_rate": 9.994735026985157e-06, - "loss": 0.13355327, - "memory(GiB)": 74.41, + "epoch": 0.029449225853475376, + "grad_norm": 2.28125, + "learning_rate": 9.978941280057928e-06, + "loss": 0.06263313, + "memory(GiB)": 75.24, "step": 125, - "train_speed(iter/s)": 0.075064 + "train_speed(iter/s)": 0.018482 }, { - "epoch": 0.015313484701239803, - "grad_norm": 4.5, - "learning_rate": 9.994301954301497e-06, - "loss": 0.10676175, - "memory(GiB)": 74.41, + "epoch": 0.03062719488761439, + "grad_norm": 2.21875, + "learning_rate": 9.977210073889273e-06, + "loss": 0.0654664, + "memory(GiB)": 75.24, "step": 130, - "train_speed(iter/s)": 0.075068 + "train_speed(iter/s)": 0.018487 }, { - "epoch": 0.015902464882056717, - "grad_norm": 4.75, - "learning_rate": 9.99385177738167e-06, - "loss": 0.12428848, - "memory(GiB)": 74.41, + "epoch": 0.03180516392175341, + "grad_norm": 2.171875, + "learning_rate": 9.975410652833316e-06, + "loss": 0.06672717, + "memory(GiB)": 75.24, "step": 135, - "train_speed(iter/s)": 0.075069 + "train_speed(iter/s)": 0.018489 }, { - "epoch": 0.016491445062873634, - "grad_norm": 3.875, - "learning_rate": 9.993384497767419e-06, - "loss": 0.11814754, - "memory(GiB)": 74.41, + "epoch": 0.03298313295589242, + "grad_norm": 2.875, + "learning_rate": 9.973543041551924e-06, + "loss": 0.06413687, + "memory(GiB)": 75.24, "step": 140, - "train_speed(iter/s)": 0.075095 + "train_speed(iter/s)": 0.01849 }, { - "epoch": 0.01708042524369055, - "grad_norm": 4.4375, - "learning_rate": 9.992900117059056e-06, - "loss": 0.10705045, - "memory(GiB)": 74.41, + "epoch": 0.03416110199003144, + "grad_norm": 1.9453125, + "learning_rate": 9.971607265641547e-06, + "loss": 0.0582508, + "memory(GiB)": 75.24, "step": 145, - "train_speed(iter/s)": 0.075056 + "train_speed(iter/s)": 0.018495 }, { - "epoch": 0.017669405424507467, - "grad_norm": 4.375, - "learning_rate": 9.992398636915468e-06, - "loss": 0.11953868, - "memory(GiB)": 74.41, + "epoch": 0.03533907102417045, + "grad_norm": 1.9375, + "learning_rate": 9.969603351632855e-06, + "loss": 0.06022533, + "memory(GiB)": 75.24, "step": 150, - "train_speed(iter/s)": 0.07506 + "train_speed(iter/s)": 0.0185 }, { - "epoch": 0.01825838560532438, - "grad_norm": 4.25, - "learning_rate": 9.991880059054097e-06, - "loss": 0.11556933, - "memory(GiB)": 74.41, + "epoch": 0.03651704005830947, + "grad_norm": 2.109375, + "learning_rate": 9.967531326990387e-06, + "loss": 0.06132371, + "memory(GiB)": 75.24, "step": 155, - "train_speed(iter/s)": 0.075058 + "train_speed(iter/s)": 0.018504 }, { - "epoch": 0.018847365786141296, - "grad_norm": 4.625, - "learning_rate": 9.991344385250944e-06, - "loss": 0.11189389, - "memory(GiB)": 74.41, + "epoch": 0.03769500909244848, + "grad_norm": 2.078125, + "learning_rate": 9.965391220112165e-06, + "loss": 0.07101279, + "memory(GiB)": 75.24, "step": 160, - "train_speed(iter/s)": 0.075054 + "train_speed(iter/s)": 0.018506 }, { - "epoch": 0.019436345966958212, - "grad_norm": 4.875, - "learning_rate": 9.990791617340558e-06, - "loss": 0.10822484, - "memory(GiB)": 74.41, + "epoch": 0.0388729781265875, + "grad_norm": 2.140625, + "learning_rate": 9.96318306032931e-06, + "loss": 0.0588982, + "memory(GiB)": 75.24, "step": 165, - "train_speed(iter/s)": 0.07506 + "train_speed(iter/s)": 0.018505 }, { - "epoch": 0.02002532614777513, - "grad_norm": 4.34375, - "learning_rate": 9.990221757216029e-06, - "loss": 0.11197646, - "memory(GiB)": 74.41, + "epoch": 0.04005094716072651, + "grad_norm": 2.125, + "learning_rate": 9.96090687790564e-06, + "loss": 0.06118761, + "memory(GiB)": 75.24, "step": 170, - "train_speed(iter/s)": 0.075068 + "train_speed(iter/s)": 0.018511 }, { - "epoch": 0.02061430632859204, - "grad_norm": 3.953125, - "learning_rate": 9.989634806828987e-06, - "loss": 0.10931354, - "memory(GiB)": 74.41, + "epoch": 0.04122891619486553, + "grad_norm": 1.8671875, + "learning_rate": 9.95856270403725e-06, + "loss": 0.06012461, + "memory(GiB)": 75.24, "step": 175, - "train_speed(iter/s)": 0.075063 + "train_speed(iter/s)": 0.018517 }, { - "epoch": 0.021203286509408958, - "grad_norm": 5.09375, - "learning_rate": 9.98903076818959e-06, - "loss": 0.11188631, - "memory(GiB)": 74.41, + "epoch": 0.04240688522900454, + "grad_norm": 2.234375, + "learning_rate": 9.956150570852088e-06, + "loss": 0.0591939, + "memory(GiB)": 75.24, "step": 180, - "train_speed(iter/s)": 0.075068 + "train_speed(iter/s)": 0.01852 }, { - "epoch": 0.021792266690225874, - "grad_norm": 4.75, - "learning_rate": 9.988409643366518e-06, - "loss": 0.11905994, - "memory(GiB)": 74.41, + "epoch": 0.043584854263143555, + "grad_norm": 2.234375, + "learning_rate": 9.95367051140952e-06, + "loss": 0.06429687, + "memory(GiB)": 75.24, "step": 185, - "train_speed(iter/s)": 0.075071 + "train_speed(iter/s)": 0.018524 }, { - "epoch": 0.02238124687104279, - "grad_norm": 6.09375, - "learning_rate": 9.987771434486972e-06, - "loss": 0.11119931, - "memory(GiB)": 74.41, + "epoch": 0.044762823297282574, + "grad_norm": 1.59375, + "learning_rate": 9.951122559699868e-06, + "loss": 0.05647093, + "memory(GiB)": 75.24, "step": 190, - "train_speed(iter/s)": 0.075058 + "train_speed(iter/s)": 0.018525 }, { - "epoch": 0.022970227051859703, - "grad_norm": 3.46875, - "learning_rate": 9.987116143736656e-06, - "loss": 0.07965935, - "memory(GiB)": 74.41, + "epoch": 0.045940792331421586, + "grad_norm": 1.9140625, + "learning_rate": 9.948506750643946e-06, + "loss": 0.05816346, + "memory(GiB)": 75.24, "step": 195, - "train_speed(iter/s)": 0.075072 + "train_speed(iter/s)": 0.018525 }, { - "epoch": 0.02355920723267662, - "grad_norm": 4.0625, - "learning_rate": 9.986443773359776e-06, - "loss": 0.10195212, - "memory(GiB)": 74.41, + "epoch": 0.047118761365560605, + "grad_norm": 2.546875, + "learning_rate": 9.94582312009259e-06, + "loss": 0.05947306, + "memory(GiB)": 75.24, "step": 200, - "train_speed(iter/s)": 0.075089 + "train_speed(iter/s)": 0.018527 }, { - "epoch": 0.024148187413493536, - "grad_norm": 5.15625, - "learning_rate": 9.985754325659033e-06, - "loss": 0.12634788, - "memory(GiB)": 74.41, + "epoch": 0.04829673039969962, + "grad_norm": 2.359375, + "learning_rate": 9.943071704826153e-06, + "loss": 0.06321282, + "memory(GiB)": 75.24, "step": 205, - "train_speed(iter/s)": 0.075075 + "train_speed(iter/s)": 0.018454 }, { - "epoch": 0.024737167594310452, - "grad_norm": 4.71875, - "learning_rate": 9.985047802995613e-06, - "loss": 0.11235939, - "memory(GiB)": 74.41, + "epoch": 0.049474699433838636, + "grad_norm": 2.203125, + "learning_rate": 9.940252542554007e-06, + "loss": 0.06456767, + "memory(GiB)": 75.24, "step": 210, - "train_speed(iter/s)": 0.075064 + "train_speed(iter/s)": 0.018455 }, { - "epoch": 0.025326147775127365, - "grad_norm": 4.71875, - "learning_rate": 9.98432420778918e-06, - "loss": 0.12128488, - "memory(GiB)": 74.41, + "epoch": 0.05065266846797765, + "grad_norm": 2.15625, + "learning_rate": 9.937365671914037e-06, + "loss": 0.06057892, + "memory(GiB)": 75.24, "step": 215, - "train_speed(iter/s)": 0.07508 + "train_speed(iter/s)": 0.018456 }, { - "epoch": 0.02591512795594428, - "grad_norm": 3.59375, - "learning_rate": 9.983583542517868e-06, - "loss": 0.09801877, - "memory(GiB)": 74.41, + "epoch": 0.05183063750211667, + "grad_norm": 2.0, + "learning_rate": 9.934411132472088e-06, + "loss": 0.05920454, + "memory(GiB)": 75.24, "step": 220, - "train_speed(iter/s)": 0.075075 + "train_speed(iter/s)": 0.018458 }, { - "epoch": 0.026504108136761198, - "grad_norm": 4.8125, - "learning_rate": 9.982825809718268e-06, - "loss": 0.12451966, - "memory(GiB)": 74.41, + "epoch": 0.05300860653625568, + "grad_norm": 2.015625, + "learning_rate": 9.931388964721446e-06, + "loss": 0.05975649, + "memory(GiB)": 75.24, "step": 225, - "train_speed(iter/s)": 0.075076 + "train_speed(iter/s)": 0.018461 }, { - "epoch": 0.027093088317578114, - "grad_norm": 4.4375, - "learning_rate": 9.982051011985428e-06, - "loss": 0.10354398, - "memory(GiB)": 74.41, + "epoch": 0.05418657557039469, + "grad_norm": 2.0, + "learning_rate": 9.92829921008227e-06, + "loss": 0.06393375, + "memory(GiB)": 75.24, "step": 230, - "train_speed(iter/s)": 0.075069 + "train_speed(iter/s)": 0.018462 }, { - "epoch": 0.027682068498395027, - "grad_norm": 4.4375, - "learning_rate": 9.981259151972835e-06, - "loss": 0.11414304, - "memory(GiB)": 74.41, + "epoch": 0.05536454460453371, + "grad_norm": 2.28125, + "learning_rate": 9.925141910901029e-06, + "loss": 0.06334119, + "memory(GiB)": 75.24, "step": 235, - "train_speed(iter/s)": 0.075059 + "train_speed(iter/s)": 0.018466 }, { - "epoch": 0.028271048679211944, - "grad_norm": 4.21875, - "learning_rate": 9.980450232392412e-06, - "loss": 0.11468437, - "memory(GiB)": 74.41, + "epoch": 0.05654251363867272, + "grad_norm": 2.09375, + "learning_rate": 9.921917110449914e-06, + "loss": 0.06911048, + "memory(GiB)": 75.24, "step": 240, - "train_speed(iter/s)": 0.075065 + "train_speed(iter/s)": 0.018468 }, { - "epoch": 0.02886002886002886, - "grad_norm": 4.28125, - "learning_rate": 9.979624256014505e-06, - "loss": 0.10513357, - "memory(GiB)": 74.41, + "epoch": 0.05772048267281174, + "grad_norm": 1.984375, + "learning_rate": 9.918624852926258e-06, + "loss": 0.05916922, + "memory(GiB)": 75.24, "step": 245, - "train_speed(iter/s)": 0.075058 + "train_speed(iter/s)": 0.01847 }, { - "epoch": 0.029449009040845776, - "grad_norm": 4.4375, - "learning_rate": 9.97878122566788e-06, - "loss": 0.11546416, - "memory(GiB)": 74.41, + "epoch": 0.05889845170695075, + "grad_norm": 1.859375, + "learning_rate": 9.915265183451923e-06, + "loss": 0.06251335, + "memory(GiB)": 75.24, "step": 250, - "train_speed(iter/s)": 0.075068 + "train_speed(iter/s)": 0.018471 }, { - "epoch": 0.030037989221662693, - "grad_norm": 4.5625, - "learning_rate": 9.977921144239701e-06, - "loss": 0.10937932, - "memory(GiB)": 74.41, + "epoch": 0.06007642074108977, + "grad_norm": 1.8515625, + "learning_rate": 9.911838148072678e-06, + "loss": 0.06203491, + "memory(GiB)": 75.24, "step": 255, - "train_speed(iter/s)": 0.075063 + "train_speed(iter/s)": 0.018477 }, { - "epoch": 0.030626969402479606, - "grad_norm": 4.09375, - "learning_rate": 9.977044014675537e-06, - "loss": 0.11372919, - "memory(GiB)": 74.41, + "epoch": 0.06125438977522878, + "grad_norm": 2.265625, + "learning_rate": 9.908343793757574e-06, + "loss": 0.06085759, + "memory(GiB)": 75.24, "step": 260, - "train_speed(iter/s)": 0.07507 + "train_speed(iter/s)": 0.01848 }, { - "epoch": 0.031215949583296522, - "grad_norm": 5.84375, - "learning_rate": 9.976149839979333e-06, - "loss": 0.13152323, - "memory(GiB)": 74.41, + "epoch": 0.062432358809367795, + "grad_norm": 2.375, + "learning_rate": 9.904782168398296e-06, + "loss": 0.06250409, + "memory(GiB)": 75.24, "step": 265, - "train_speed(iter/s)": 0.075061 + "train_speed(iter/s)": 0.018484 }, { - "epoch": 0.031804929764113435, - "grad_norm": 3.875, - "learning_rate": 9.975238623213417e-06, - "loss": 0.10404849, - "memory(GiB)": 74.41, + "epoch": 0.06361032784350681, + "grad_norm": 1.9609375, + "learning_rate": 9.901153320808514e-06, + "loss": 0.05536562, + "memory(GiB)": 75.24, "step": 270, - "train_speed(iter/s)": 0.075064 + "train_speed(iter/s)": 0.018489 }, { - "epoch": 0.032393909944930355, - "grad_norm": 4.3125, - "learning_rate": 9.974310367498476e-06, - "loss": 0.09936564, - "memory(GiB)": 74.41, + "epoch": 0.06478829687764583, + "grad_norm": 1.8359375, + "learning_rate": 9.897457300723202e-06, + "loss": 0.05569639, + "memory(GiB)": 75.24, "step": 275, - "train_speed(iter/s)": 0.075057 + "train_speed(iter/s)": 0.018491 }, { - "epoch": 0.03298289012574727, - "grad_norm": 4.90625, - "learning_rate": 9.973365076013558e-06, - "loss": 0.11127584, - "memory(GiB)": 74.41, + "epoch": 0.06596626591178484, + "grad_norm": 2.40625, + "learning_rate": 9.893694158797968e-06, + "loss": 0.05840618, + "memory(GiB)": 75.24, "step": 280, - "train_speed(iter/s)": 0.07505 + "train_speed(iter/s)": 0.018494 }, { - "epoch": 0.03357187030656419, - "grad_norm": 5.0625, - "learning_rate": 9.972402751996045e-06, - "loss": 0.1130929, - "memory(GiB)": 74.41, + "epoch": 0.06714423494592386, + "grad_norm": 2.265625, + "learning_rate": 9.889863946608352e-06, + "loss": 0.05661937, + "memory(GiB)": 75.24, "step": 285, - "train_speed(iter/s)": 0.075047 + "train_speed(iter/s)": 0.018496 }, { - "epoch": 0.0341608504873811, - "grad_norm": 4.875, - "learning_rate": 9.97142339874166e-06, - "loss": 0.13001273, - "memory(GiB)": 74.41, + "epoch": 0.06832220398006288, + "grad_norm": 2.140625, + "learning_rate": 9.885966716649125e-06, + "loss": 0.06150655, + "memory(GiB)": 75.24, "step": 290, - "train_speed(iter/s)": 0.075049 + "train_speed(iter/s)": 0.018497 }, { - "epoch": 0.03474983066819801, - "grad_norm": 4.75, - "learning_rate": 9.970427019604441e-06, - "loss": 0.11981708, - "memory(GiB)": 74.41, + "epoch": 0.06950017301420189, + "grad_norm": 2.09375, + "learning_rate": 9.88200252233356e-06, + "loss": 0.06209329, + "memory(GiB)": 75.24, "step": 295, - "train_speed(iter/s)": 0.075056 + "train_speed(iter/s)": 0.018497 }, { - "epoch": 0.03533881084901493, - "grad_norm": 5.1875, - "learning_rate": 9.96941361799674e-06, - "loss": 0.09800007, - "memory(GiB)": 74.41, + "epoch": 0.0706781420483409, + "grad_norm": 3.375, + "learning_rate": 9.877971417992716e-06, + "loss": 0.05904433, + "memory(GiB)": 75.24, "step": 300, - "train_speed(iter/s)": 0.07505 + "train_speed(iter/s)": 0.018499 }, { - "epoch": 0.035927791029831846, - "grad_norm": 5.5, - "learning_rate": 9.968383197389202e-06, - "loss": 0.10793682, - "memory(GiB)": 74.41, + "epoch": 0.07185611108247993, + "grad_norm": 1.796875, + "learning_rate": 9.873873458874676e-06, + "loss": 0.05126434, + "memory(GiB)": 75.24, "step": 305, - "train_speed(iter/s)": 0.075058 + "train_speed(iter/s)": 0.018458 }, { - "epoch": 0.03651677121064876, - "grad_norm": 3.671875, - "learning_rate": 9.967335761310761e-06, - "loss": 0.10909323, - "memory(GiB)": 74.41, + "epoch": 0.07303408011661894, + "grad_norm": 2.0, + "learning_rate": 9.8697087011438e-06, + "loss": 0.05796698, + "memory(GiB)": 75.24, "step": 310, - "train_speed(iter/s)": 0.075054 + "train_speed(iter/s)": 0.018459 }, { - "epoch": 0.03710575139146568, - "grad_norm": 3.234375, - "learning_rate": 9.966271313348623e-06, - "loss": 0.10540984, - "memory(GiB)": 74.41, + "epoch": 0.07421204915075795, + "grad_norm": 1.875, + "learning_rate": 9.865477201879953e-06, + "loss": 0.05630487, + "memory(GiB)": 75.24, "step": 315, - "train_speed(iter/s)": 0.075062 + "train_speed(iter/s)": 0.01846 }, { - "epoch": 0.03769473157228259, - "grad_norm": 4.21875, - "learning_rate": 9.96518985714826e-06, - "loss": 0.11170142, - "memory(GiB)": 74.41, + "epoch": 0.07539001818489696, + "grad_norm": 2.515625, + "learning_rate": 9.861179019077725e-06, + "loss": 0.0567848, + "memory(GiB)": 75.24, "step": 320, - "train_speed(iter/s)": 0.075084 + "train_speed(iter/s)": 0.018461 }, { - "epoch": 0.03828371175309951, - "grad_norm": 4.28125, - "learning_rate": 9.964091396413381e-06, - "loss": 0.1047295, - "memory(GiB)": 74.41, + "epoch": 0.07656798721903597, + "grad_norm": 2.109375, + "learning_rate": 9.856814211645627e-06, + "loss": 0.05985626, + "memory(GiB)": 75.24, "step": 325, - "train_speed(iter/s)": 0.075092 + "train_speed(iter/s)": 0.018463 }, { - "epoch": 0.038872691933916424, - "grad_norm": 3.875, - "learning_rate": 9.962975934905947e-06, - "loss": 0.11229659, - "memory(GiB)": 74.41, + "epoch": 0.077745956253175, + "grad_norm": 2.09375, + "learning_rate": 9.852382839405298e-06, + "loss": 0.05782009, + "memory(GiB)": 75.24, "step": 330, - "train_speed(iter/s)": 0.075096 + "train_speed(iter/s)": 0.018466 }, { - "epoch": 0.03946167211473334, - "grad_norm": 5.75, - "learning_rate": 9.961843476446134e-06, - "loss": 0.12050457, - "memory(GiB)": 74.41, + "epoch": 0.07892392528731401, + "grad_norm": 2.28125, + "learning_rate": 9.847884963090675e-06, + "loss": 0.06585214, + "memory(GiB)": 75.24, "step": 335, - "train_speed(iter/s)": 0.075102 + "train_speed(iter/s)": 0.018468 }, { - "epoch": 0.04005065229555026, - "grad_norm": 4.1875, - "learning_rate": 9.960694024912328e-06, - "loss": 0.10951951, - "memory(GiB)": 74.41, + "epoch": 0.08010189432145302, + "grad_norm": 2.234375, + "learning_rate": 9.843320644347156e-06, + "loss": 0.06263242, + "memory(GiB)": 75.24, "step": 340, - "train_speed(iter/s)": 0.075106 + "train_speed(iter/s)": 0.01847 }, { - "epoch": 0.04063963247636717, - "grad_norm": 3.21875, - "learning_rate": 9.959527584241112e-06, - "loss": 0.10334082, - "memory(GiB)": 74.41, + "epoch": 0.08127986335559204, + "grad_norm": 2.203125, + "learning_rate": 9.838689945730776e-06, + "loss": 0.05163463, + "memory(GiB)": 75.24, "step": 345, - "train_speed(iter/s)": 0.07511 + "train_speed(iter/s)": 0.018472 }, { - "epoch": 0.04122861265718408, - "grad_norm": 4.84375, - "learning_rate": 9.958344158427255e-06, - "loss": 0.1155634, - "memory(GiB)": 74.41, + "epoch": 0.08245783238973106, + "grad_norm": 2.015625, + "learning_rate": 9.833992930707321e-06, + "loss": 0.05960041, + "memory(GiB)": 75.24, "step": 350, - "train_speed(iter/s)": 0.075116 + "train_speed(iter/s)": 0.018475 }, { - "epoch": 0.041817592838001, - "grad_norm": 4.3125, - "learning_rate": 9.957143751523695e-06, - "loss": 0.12060295, - "memory(GiB)": 74.41, + "epoch": 0.08363580142387007, + "grad_norm": 2.5, + "learning_rate": 9.829229663651483e-06, + "loss": 0.05999585, + "memory(GiB)": 75.24, "step": 355, - "train_speed(iter/s)": 0.075118 + "train_speed(iter/s)": 0.018477 }, { - "epoch": 0.042406573018817915, - "grad_norm": 3.921875, - "learning_rate": 9.955926367641524e-06, - "loss": 0.09431397, - "memory(GiB)": 74.41, + "epoch": 0.08481377045800909, + "grad_norm": 1.671875, + "learning_rate": 9.824400209845967e-06, + "loss": 0.05059795, + "memory(GiB)": 75.24, "step": 360, - "train_speed(iter/s)": 0.075117 + "train_speed(iter/s)": 0.018479 }, { - "epoch": 0.042995553199634835, - "grad_norm": 4.84375, - "learning_rate": 9.95469201094998e-06, - "loss": 0.10672688, - "memory(GiB)": 74.41, + "epoch": 0.0859917394921481, + "grad_norm": 2.171875, + "learning_rate": 9.81950463548059e-06, + "loss": 0.05671123, + "memory(GiB)": 75.24, "step": 365, - "train_speed(iter/s)": 0.075123 + "train_speed(iter/s)": 0.018481 }, { - "epoch": 0.04358453338045175, - "grad_norm": 4.375, - "learning_rate": 9.953440685676425e-06, - "loss": 0.11097996, - "memory(GiB)": 74.41, + "epoch": 0.08716970852628711, + "grad_norm": 2.625, + "learning_rate": 9.814543007651389e-06, + "loss": 0.05803382, + "memory(GiB)": 75.24, "step": 370, - "train_speed(iter/s)": 0.075131 + "train_speed(iter/s)": 0.018483 }, { - "epoch": 0.04417351356126866, - "grad_norm": 3.265625, - "learning_rate": 9.952172396106336e-06, - "loss": 0.11753557, - "memory(GiB)": 74.41, + "epoch": 0.08834767756042614, + "grad_norm": 1.890625, + "learning_rate": 9.80951539435969e-06, + "loss": 0.05704566, + "memory(GiB)": 75.24, "step": 375, - "train_speed(iter/s)": 0.07513 + "train_speed(iter/s)": 0.018485 }, { - "epoch": 0.04476249374208558, - "grad_norm": 3.484375, - "learning_rate": 9.950887146583288e-06, - "loss": 0.09597683, - "memory(GiB)": 74.41, + "epoch": 0.08952564659456515, + "grad_norm": 2.03125, + "learning_rate": 9.804421864511175e-06, + "loss": 0.05998203, + "memory(GiB)": 75.24, "step": 380, - "train_speed(iter/s)": 0.07513 + "train_speed(iter/s)": 0.018487 }, { - "epoch": 0.045351473922902494, - "grad_norm": 4.5625, - "learning_rate": 9.949584941508937e-06, - "loss": 0.10678861, - "memory(GiB)": 74.41, + "epoch": 0.09070361562870416, + "grad_norm": 2.53125, + "learning_rate": 9.79926248791495e-06, + "loss": 0.06044774, + "memory(GiB)": 75.24, "step": 385, - "train_speed(iter/s)": 0.07512 + "train_speed(iter/s)": 0.018488 }, { - "epoch": 0.04594045410371941, - "grad_norm": 4.46875, - "learning_rate": 9.948265785343013e-06, - "loss": 0.10599846, - "memory(GiB)": 74.41, + "epoch": 0.09188158466284317, + "grad_norm": 2.1875, + "learning_rate": 9.794037335282572e-06, + "loss": 0.06596763, + "memory(GiB)": 75.24, "step": 390, - "train_speed(iter/s)": 0.07512 + "train_speed(iter/s)": 0.018489 }, { - "epoch": 0.046529434284536327, - "grad_norm": 3.609375, - "learning_rate": 9.946929682603295e-06, - "loss": 0.10928589, - "memory(GiB)": 74.41, + "epoch": 0.0930595536969822, + "grad_norm": 2.171875, + "learning_rate": 9.788746478227097e-06, + "loss": 0.06313769, + "memory(GiB)": 75.24, "step": 395, - "train_speed(iter/s)": 0.07511 + "train_speed(iter/s)": 0.018489 }, { - "epoch": 0.04711841446535324, - "grad_norm": 4.46875, - "learning_rate": 9.9455766378656e-06, - "loss": 0.10189447, - "memory(GiB)": 74.41, + "epoch": 0.09423752273112121, + "grad_norm": 1.9296875, + "learning_rate": 9.783389989262078e-06, + "loss": 0.05841722, + "memory(GiB)": 75.24, "step": 400, - "train_speed(iter/s)": 0.075107 - }, - { - "epoch": 0.04770739464617016, - "grad_norm": 3.78125, - "learning_rate": 9.94420665576377e-06, - "loss": 0.11017077, - "memory(GiB)": 74.41, - "step": 405, - "train_speed(iter/s)": 0.075097 - }, - { - "epoch": 0.04829637482698707, - "grad_norm": 3.171875, - "learning_rate": 9.942819740989651e-06, - "loss": 0.09610715, - "memory(GiB)": 74.41, - "step": 410, - "train_speed(iter/s)": 0.075098 - }, - { - "epoch": 0.048885355007803985, - "grad_norm": 3.953125, - "learning_rate": 9.941415898293078e-06, - "loss": 0.09449079, - "memory(GiB)": 74.41, - "step": 415, - "train_speed(iter/s)": 0.075098 - }, - { - "epoch": 0.049474335188620905, - "grad_norm": 4.25, - "learning_rate": 9.939995132481863e-06, - "loss": 0.11824782, - "memory(GiB)": 74.41, - "step": 420, - "train_speed(iter/s)": 0.075098 - }, - { - "epoch": 0.05006331536943782, - "grad_norm": 3.953125, - "learning_rate": 9.938557448421772e-06, - "loss": 0.10327038, - "memory(GiB)": 74.41, - "step": 425, - "train_speed(iter/s)": 0.075095 - }, - { - "epoch": 0.05065229555025473, - "grad_norm": 3.71875, - "learning_rate": 9.937102851036515e-06, - "loss": 0.101033, - "memory(GiB)": 74.41, - "step": 430, - "train_speed(iter/s)": 0.075096 - }, - { - "epoch": 0.05124127573107165, - "grad_norm": 5.625, - "learning_rate": 9.935631345307726e-06, - "loss": 0.12589303, - "memory(GiB)": 74.41, - "step": 435, - "train_speed(iter/s)": 0.075096 - }, - { - "epoch": 0.05183025591188856, - "grad_norm": 4.375, - "learning_rate": 9.934142936274941e-06, - "loss": 0.11252528, - "memory(GiB)": 74.41, - "step": 440, - "train_speed(iter/s)": 0.0751 - }, - { - "epoch": 0.05241923609270548, - "grad_norm": 5.03125, - "learning_rate": 9.932637629035594e-06, - "loss": 0.1076035, - "memory(GiB)": 74.41, - "step": 445, - "train_speed(iter/s)": 0.07511 - }, - { - "epoch": 0.053008216273522396, - "grad_norm": 4.8125, - "learning_rate": 9.931115428744983e-06, - "loss": 0.09129641, - "memory(GiB)": 74.41, - "step": 450, - "train_speed(iter/s)": 0.075106 - }, - { - "epoch": 0.05359719645433931, - "grad_norm": 4.40625, - "learning_rate": 9.92957634061626e-06, - "loss": 0.10762049, - "memory(GiB)": 74.41, - "step": 455, - "train_speed(iter/s)": 0.075104 - }, - { - "epoch": 0.05418617663515623, - "grad_norm": 4.375, - "learning_rate": 9.928020369920422e-06, - "loss": 0.13035779, - "memory(GiB)": 74.41, - "step": 460, - "train_speed(iter/s)": 0.075115 - }, - { - "epoch": 0.05477515681597314, - "grad_norm": 4.90625, - "learning_rate": 9.92644752198628e-06, - "loss": 0.11500576, - "memory(GiB)": 74.41, - "step": 465, - "train_speed(iter/s)": 0.075106 - }, - { - "epoch": 0.055364136996790055, - "grad_norm": 5.59375, - "learning_rate": 9.924857802200443e-06, - "loss": 0.11347337, - "memory(GiB)": 74.41, - "step": 470, - "train_speed(iter/s)": 0.075116 - }, - { - "epoch": 0.055953117177606974, - "grad_norm": 4.0625, - "learning_rate": 9.923251216007306e-06, - "loss": 0.11633272, - "memory(GiB)": 74.41, - "step": 475, - "train_speed(iter/s)": 0.075118 - }, - { - "epoch": 0.05654209735842389, - "grad_norm": 4.15625, - "learning_rate": 9.921627768909022e-06, - "loss": 0.09800028, - "memory(GiB)": 74.41, - "step": 480, - "train_speed(iter/s)": 0.075124 - }, - { - "epoch": 0.05713107753924081, - "grad_norm": 8.4375, - "learning_rate": 9.919987466465495e-06, - "loss": 0.11815701, - "memory(GiB)": 74.41, - "step": 485, - "train_speed(iter/s)": 0.075127 - }, - { - "epoch": 0.05772005772005772, - "grad_norm": 4.9375, - "learning_rate": 9.91833031429435e-06, - "loss": 0.11462152, - "memory(GiB)": 74.41, - "step": 490, - "train_speed(iter/s)": 0.075127 - }, - { - "epoch": 0.05830903790087463, - "grad_norm": 4.15625, - "learning_rate": 9.916656318070918e-06, - "loss": 0.09471865, - "memory(GiB)": 74.41, - "step": 495, - "train_speed(iter/s)": 0.075129 - }, - { - "epoch": 0.05889801808169155, - "grad_norm": 4.3125, - "learning_rate": 9.91496548352822e-06, - "loss": 0.11097107, - "memory(GiB)": 74.41, - "step": 500, - "train_speed(iter/s)": 0.075131 - }, - { - "epoch": 0.059486998262508466, - "grad_norm": 6.75, - "learning_rate": 9.913257816456938e-06, - "loss": 0.12602438, - "memory(GiB)": 74.41, - "step": 505, - "train_speed(iter/s)": 0.074738 - }, - { - "epoch": 0.060075978443325385, - "grad_norm": 4.25, - "learning_rate": 9.91153332270541e-06, - "loss": 0.09413192, - "memory(GiB)": 74.41, - "step": 510, - "train_speed(iter/s)": 0.074752 - }, - { - "epoch": 0.0606649586241423, - "grad_norm": 4.6875, - "learning_rate": 9.90979200817959e-06, - "loss": 0.10453418, - "memory(GiB)": 74.41, - "step": 515, - "train_speed(iter/s)": 0.074754 - }, - { - "epoch": 0.06125393880495921, - "grad_norm": 3.453125, - "learning_rate": 9.90803387884305e-06, - "loss": 0.09584854, - "memory(GiB)": 74.41, - "step": 520, - "train_speed(iter/s)": 0.074762 - }, - { - "epoch": 0.06184291898577613, - "grad_norm": 3.625, - "learning_rate": 9.906258940716945e-06, - "loss": 0.11624388, - "memory(GiB)": 74.41, - "step": 525, - "train_speed(iter/s)": 0.074771 - }, - { - "epoch": 0.062431899166593044, - "grad_norm": 3.90625, - "learning_rate": 9.904467199879988e-06, - "loss": 0.09951019, - "memory(GiB)": 74.41, - "step": 530, - "train_speed(iter/s)": 0.074776 - }, - { - "epoch": 0.06302087934740996, - "grad_norm": 4.75, - "learning_rate": 9.90265866246845e-06, - "loss": 0.11697209, - "memory(GiB)": 74.41, - "step": 535, - "train_speed(iter/s)": 0.074779 - }, - { - "epoch": 0.06360985952822687, - "grad_norm": 3.71875, - "learning_rate": 9.900833334676116e-06, - "loss": 0.09333996, - "memory(GiB)": 74.41, - "step": 540, - "train_speed(iter/s)": 0.074775 - }, - { - "epoch": 0.06419883970904379, - "grad_norm": 4.125, - "learning_rate": 9.898991222754278e-06, - "loss": 0.09190307, - "memory(GiB)": 74.41, - "step": 545, - "train_speed(iter/s)": 0.07478 - }, - { - "epoch": 0.06478781988986071, - "grad_norm": 3.296875, - "learning_rate": 9.89713233301171e-06, - "loss": 0.10609086, - "memory(GiB)": 74.41, - "step": 550, - "train_speed(iter/s)": 0.07479 - }, - { - "epoch": 0.06537680007067762, - "grad_norm": 4.21875, - "learning_rate": 9.895256671814646e-06, - "loss": 0.11532583, - "memory(GiB)": 74.41, - "step": 555, - "train_speed(iter/s)": 0.07479 - }, - { - "epoch": 0.06596578025149454, - "grad_norm": 4.3125, - "learning_rate": 9.893364245586755e-06, - "loss": 0.10446991, - "memory(GiB)": 74.41, - "step": 560, - "train_speed(iter/s)": 0.074794 - }, - { - "epoch": 0.06655476043231145, - "grad_norm": 4.125, - "learning_rate": 9.891455060809122e-06, - "loss": 0.08096789, - "memory(GiB)": 74.41, - "step": 565, - "train_speed(iter/s)": 0.074797 - }, - { - "epoch": 0.06714374061312837, - "grad_norm": 4.71875, - "learning_rate": 9.889529124020232e-06, - "loss": 0.10516653, - "memory(GiB)": 74.41, - "step": 570, - "train_speed(iter/s)": 0.074799 - }, - { - "epoch": 0.06773272079394528, - "grad_norm": 4.09375, - "learning_rate": 9.887586441815934e-06, - "loss": 0.10007232, - "memory(GiB)": 74.41, - "step": 575, - "train_speed(iter/s)": 0.074804 - }, - { - "epoch": 0.0683217009747622, - "grad_norm": 3.5, - "learning_rate": 9.885627020849433e-06, - "loss": 0.09896673, - "memory(GiB)": 74.41, - "step": 580, - "train_speed(iter/s)": 0.074798 - }, - { - "epoch": 0.06891068115557912, - "grad_norm": 4.09375, - "learning_rate": 9.883650867831252e-06, - "loss": 0.10226643, - "memory(GiB)": 74.41, - "step": 585, - "train_speed(iter/s)": 0.074805 - }, - { - "epoch": 0.06949966133639603, - "grad_norm": 4.84375, - "learning_rate": 9.881657989529222e-06, - "loss": 0.10619533, - "memory(GiB)": 74.41, - "step": 590, - "train_speed(iter/s)": 0.074807 - }, - { - "epoch": 0.07008864151721295, - "grad_norm": 3.578125, - "learning_rate": 9.879648392768455e-06, - "loss": 0.1003757, - "memory(GiB)": 74.41, - "step": 595, - "train_speed(iter/s)": 0.074807 - }, - { - "epoch": 0.07067762169802987, - "grad_norm": 4.59375, - "learning_rate": 9.877622084431316e-06, - "loss": 0.10820855, - "memory(GiB)": 74.41, - "step": 600, - "train_speed(iter/s)": 0.07481 - }, - { - "epoch": 0.07126660187884677, - "grad_norm": 4.59375, - "learning_rate": 9.875579071457407e-06, - "loss": 0.09604223, - "memory(GiB)": 74.41, - "step": 605, - "train_speed(iter/s)": 0.074807 - }, - { - "epoch": 0.07185558205966369, - "grad_norm": 4.34375, - "learning_rate": 9.873519360843535e-06, - "loss": 0.10592574, - "memory(GiB)": 74.41, - "step": 610, - "train_speed(iter/s)": 0.07481 - }, - { - "epoch": 0.07244456224048061, - "grad_norm": 4.375, - "learning_rate": 9.871442959643694e-06, - "loss": 0.10632453, - "memory(GiB)": 74.41, - "step": 615, - "train_speed(iter/s)": 0.074813 - }, - { - "epoch": 0.07303354242129752, - "grad_norm": 3.59375, - "learning_rate": 9.869349874969039e-06, - "loss": 0.09836457, - "memory(GiB)": 74.41, - "step": 620, - "train_speed(iter/s)": 0.074819 - }, - { - "epoch": 0.07362252260211444, - "grad_norm": 4.46875, - "learning_rate": 9.867240113987863e-06, - "loss": 0.11561801, - "memory(GiB)": 74.41, - "step": 625, - "train_speed(iter/s)": 0.074823 - }, - { - "epoch": 0.07421150278293136, - "grad_norm": 3.109375, - "learning_rate": 9.865113683925571e-06, - "loss": 0.09328078, - "memory(GiB)": 74.41, - "step": 630, - "train_speed(iter/s)": 0.074825 - }, - { - "epoch": 0.07480048296374826, - "grad_norm": 3.9375, - "learning_rate": 9.862970592064653e-06, - "loss": 0.10008674, - "memory(GiB)": 74.41, - "step": 635, - "train_speed(iter/s)": 0.074832 - }, - { - "epoch": 0.07538946314456518, - "grad_norm": 4.40625, - "learning_rate": 9.860810845744664e-06, - "loss": 0.10500932, - "memory(GiB)": 74.41, - "step": 640, - "train_speed(iter/s)": 0.074834 - }, - { - "epoch": 0.0759784433253821, - "grad_norm": 3.9375, - "learning_rate": 9.858634452362195e-06, - "loss": 0.10009133, - "memory(GiB)": 74.41, - "step": 645, - "train_speed(iter/s)": 0.074837 - }, - { - "epoch": 0.07656742350619902, - "grad_norm": 3.75, - "learning_rate": 9.856441419370849e-06, - "loss": 0.09874001, - "memory(GiB)": 74.41, - "step": 650, - "train_speed(iter/s)": 0.074838 - }, - { - "epoch": 0.07715640368701593, - "grad_norm": 4.59375, - "learning_rate": 9.854231754281216e-06, - "loss": 0.10840924, - "memory(GiB)": 74.41, - "step": 655, - "train_speed(iter/s)": 0.074835 - }, - { - "epoch": 0.07774538386783285, - "grad_norm": 4.5, - "learning_rate": 9.852005464660847e-06, - "loss": 0.10447395, - "memory(GiB)": 74.41, - "step": 660, - "train_speed(iter/s)": 0.074838 - }, - { - "epoch": 0.07833436404864977, - "grad_norm": 3.515625, - "learning_rate": 9.84976255813423e-06, - "loss": 0.10944817, - "memory(GiB)": 74.41, - "step": 665, - "train_speed(iter/s)": 0.074837 - }, - { - "epoch": 0.07892334422946667, - "grad_norm": 4.375, - "learning_rate": 9.847503042382756e-06, - "loss": 0.11643915, - "memory(GiB)": 74.41, - "step": 670, - "train_speed(iter/s)": 0.074834 - }, - { - "epoch": 0.0795123244102836, - "grad_norm": 4.4375, - "learning_rate": 9.845226925144702e-06, - "loss": 0.11636159, - "memory(GiB)": 74.41, - "step": 675, - "train_speed(iter/s)": 0.074838 - }, - { - "epoch": 0.08010130459110051, - "grad_norm": 4.40625, - "learning_rate": 9.842934214215203e-06, - "loss": 0.11168916, - "memory(GiB)": 74.41, - "step": 680, - "train_speed(iter/s)": 0.074835 - }, - { - "epoch": 0.08069028477191742, - "grad_norm": 4.40625, - "learning_rate": 9.84062491744622e-06, - "loss": 0.10953134, - "memory(GiB)": 74.41, - "step": 685, - "train_speed(iter/s)": 0.074839 - }, - { - "epoch": 0.08127926495273434, - "grad_norm": 4.78125, - "learning_rate": 9.838299042746518e-06, - "loss": 0.09663584, - "memory(GiB)": 74.41, - "step": 690, - "train_speed(iter/s)": 0.07484 - }, - { - "epoch": 0.08186824513355126, - "grad_norm": 3.546875, - "learning_rate": 9.835956598081634e-06, - "loss": 0.10150684, - "memory(GiB)": 74.41, - "step": 695, - "train_speed(iter/s)": 0.074837 - }, - { - "epoch": 0.08245722531436817, - "grad_norm": 4.3125, - "learning_rate": 9.833597591473858e-06, - "loss": 0.10742941, - "memory(GiB)": 74.41, - "step": 700, - "train_speed(iter/s)": 0.074837 - }, - { - "epoch": 0.08304620549518509, - "grad_norm": 3.90625, - "learning_rate": 9.831222031002197e-06, - "loss": 0.10118611, - "memory(GiB)": 74.41, - "step": 705, - "train_speed(iter/s)": 0.074838 - }, - { - "epoch": 0.083635185676002, - "grad_norm": 4.03125, - "learning_rate": 9.828829924802355e-06, - "loss": 0.0967037, - "memory(GiB)": 74.41, - "step": 710, - "train_speed(iter/s)": 0.074837 - }, - { - "epoch": 0.08422416585681892, - "grad_norm": 3.96875, - "learning_rate": 9.826421281066693e-06, - "loss": 0.10264329, - "memory(GiB)": 74.41, - "step": 715, - "train_speed(iter/s)": 0.074838 - }, - { - "epoch": 0.08481314603763583, - "grad_norm": 3.90625, - "learning_rate": 9.823996108044218e-06, - "loss": 0.10819044, - "memory(GiB)": 74.41, - "step": 720, - "train_speed(iter/s)": 0.07484 - }, - { - "epoch": 0.08540212621845275, - "grad_norm": 3.96875, - "learning_rate": 9.82155441404054e-06, - "loss": 0.10065824, - "memory(GiB)": 74.41, - "step": 725, - "train_speed(iter/s)": 0.074845 - }, - { - "epoch": 0.08599110639926967, - "grad_norm": 3.03125, - "learning_rate": 9.819096207417848e-06, - "loss": 0.08238539, - "memory(GiB)": 74.41, - "step": 730, - "train_speed(iter/s)": 0.074846 - }, - { - "epoch": 0.08658008658008658, - "grad_norm": 4.40625, - "learning_rate": 9.816621496594891e-06, - "loss": 0.10270094, - "memory(GiB)": 74.41, - "step": 735, - "train_speed(iter/s)": 0.074848 - }, - { - "epoch": 0.0871690667609035, - "grad_norm": 4.09375, - "learning_rate": 9.814130290046933e-06, - "loss": 0.11216452, - "memory(GiB)": 74.41, - "step": 740, - "train_speed(iter/s)": 0.074848 - }, - { - "epoch": 0.08775804694172042, - "grad_norm": 4.375, - "learning_rate": 9.811622596305733e-06, - "loss": 0.08987691, - "memory(GiB)": 74.41, - "step": 745, - "train_speed(iter/s)": 0.074847 - }, - { - "epoch": 0.08834702712253732, - "grad_norm": 4.15625, - "learning_rate": 9.809098423959513e-06, - "loss": 0.10679505, - "memory(GiB)": 74.41, - "step": 750, - "train_speed(iter/s)": 0.074847 - }, - { - "epoch": 0.08893600730335424, - "grad_norm": 3.578125, - "learning_rate": 9.806557781652938e-06, - "loss": 0.09105108, - "memory(GiB)": 74.41, - "step": 755, - "train_speed(iter/s)": 0.074852 - }, - { - "epoch": 0.08952498748417116, - "grad_norm": 3.90625, - "learning_rate": 9.804000678087069e-06, - "loss": 0.0892078, - "memory(GiB)": 74.41, - "step": 760, - "train_speed(iter/s)": 0.074853 - }, - { - "epoch": 0.09011396766498807, - "grad_norm": 4.625, - "learning_rate": 9.80142712201935e-06, - "loss": 0.10482442, - "memory(GiB)": 74.41, - "step": 765, - "train_speed(iter/s)": 0.07486 - }, - { - "epoch": 0.09070294784580499, - "grad_norm": 4.21875, - "learning_rate": 9.798837122263563e-06, - "loss": 0.10331622, - "memory(GiB)": 74.41, - "step": 770, - "train_speed(iter/s)": 0.074863 - }, - { - "epoch": 0.09129192802662191, - "grad_norm": 4.6875, - "learning_rate": 9.796230687689812e-06, - "loss": 0.11459937, - "memory(GiB)": 74.41, - "step": 775, - "train_speed(iter/s)": 0.074864 - }, - { - "epoch": 0.09188090820743881, - "grad_norm": 4.15625, - "learning_rate": 9.793607827224485e-06, - "loss": 0.09125889, - "memory(GiB)": 74.41, - "step": 780, - "train_speed(iter/s)": 0.074869 - }, - { - "epoch": 0.09246988838825573, - "grad_norm": 5.09375, - "learning_rate": 9.790968549850224e-06, - "loss": 0.11352601, - "memory(GiB)": 74.41, - "step": 785, - "train_speed(iter/s)": 0.074873 - }, - { - "epoch": 0.09305886856907265, - "grad_norm": 3.296875, - "learning_rate": 9.788312864605893e-06, - "loss": 0.10174282, - "memory(GiB)": 74.41, - "step": 790, - "train_speed(iter/s)": 0.074874 - }, - { - "epoch": 0.09364784874988957, - "grad_norm": 3.921875, - "learning_rate": 9.785640780586551e-06, - "loss": 0.09074538, - "memory(GiB)": 74.41, - "step": 795, - "train_speed(iter/s)": 0.074875 - }, - { - "epoch": 0.09423682893070648, - "grad_norm": 3.78125, - "learning_rate": 9.782952306943418e-06, - "loss": 0.08487504, - "memory(GiB)": 74.41, - "step": 800, - "train_speed(iter/s)": 0.074881 - }, - { - "epoch": 0.0948258091115234, - "grad_norm": 4.625, - "learning_rate": 9.780247452883845e-06, - "loss": 0.10566314, - "memory(GiB)": 74.41, - "step": 805, - "train_speed(iter/s)": 0.074884 - }, - { - "epoch": 0.09541478929234032, - "grad_norm": 4.03125, - "learning_rate": 9.77752622767128e-06, - "loss": 0.11225069, - "memory(GiB)": 74.41, - "step": 810, - "train_speed(iter/s)": 0.074886 - }, - { - "epoch": 0.09600376947315722, - "grad_norm": 3.234375, - "learning_rate": 9.774788640625242e-06, - "loss": 0.08767406, - "memory(GiB)": 74.41, - "step": 815, - "train_speed(iter/s)": 0.074889 - }, - { - "epoch": 0.09659274965397414, - "grad_norm": 4.125, - "learning_rate": 9.772034701121278e-06, - "loss": 0.10022966, - "memory(GiB)": 74.41, - "step": 820, - "train_speed(iter/s)": 0.074892 - }, - { - "epoch": 0.09718172983479106, - "grad_norm": 3.96875, - "learning_rate": 9.769264418590944e-06, - "loss": 0.11698074, - "memory(GiB)": 74.41, - "step": 825, - "train_speed(iter/s)": 0.074893 - }, - { - "epoch": 0.09777071001560797, - "grad_norm": 5.1875, - "learning_rate": 9.76647780252177e-06, - "loss": 0.09307386, - "memory(GiB)": 74.41, - "step": 830, - "train_speed(iter/s)": 0.074895 - }, - { - "epoch": 0.09835969019642489, - "grad_norm": 4.75, - "learning_rate": 9.763674862457214e-06, - "loss": 0.09483533, - "memory(GiB)": 74.41, - "step": 835, - "train_speed(iter/s)": 0.074896 - }, - { - "epoch": 0.09894867037724181, - "grad_norm": 4.9375, - "learning_rate": 9.760855607996648e-06, - "loss": 0.10245972, - "memory(GiB)": 74.41, - "step": 840, - "train_speed(iter/s)": 0.074893 - }, - { - "epoch": 0.09953765055805872, - "grad_norm": 4.34375, - "learning_rate": 9.758020048795313e-06, - "loss": 0.0943541, - "memory(GiB)": 74.41, - "step": 845, - "train_speed(iter/s)": 0.074898 - }, - { - "epoch": 0.10012663073887564, - "grad_norm": 3.65625, - "learning_rate": 9.755168194564291e-06, - "loss": 0.09881738, - "memory(GiB)": 74.41, - "step": 850, - "train_speed(iter/s)": 0.074898 - }, - { - "epoch": 0.10071561091969256, - "grad_norm": 5.625, - "learning_rate": 9.752300055070472e-06, - "loss": 0.10225842, - "memory(GiB)": 74.41, - "step": 855, - "train_speed(iter/s)": 0.074904 - }, - { - "epoch": 0.10130459110050946, - "grad_norm": 3.75, - "learning_rate": 9.749415640136515e-06, - "loss": 0.09013618, - "memory(GiB)": 74.41, - "step": 860, - "train_speed(iter/s)": 0.074907 - }, - { - "epoch": 0.10189357128132638, - "grad_norm": 4.65625, - "learning_rate": 9.74651495964082e-06, - "loss": 0.09407109, - "memory(GiB)": 74.41, - "step": 865, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.1024825514621433, - "grad_norm": 4.5, - "learning_rate": 9.743598023517493e-06, - "loss": 0.10868713, - "memory(GiB)": 74.41, - "step": 870, - "train_speed(iter/s)": 0.074912 - }, - { - "epoch": 0.10307153164296022, - "grad_norm": 3.640625, - "learning_rate": 9.740664841756313e-06, - "loss": 0.11703372, - "memory(GiB)": 74.41, - "step": 875, - "train_speed(iter/s)": 0.074918 - }, - { - "epoch": 0.10366051182377713, - "grad_norm": 3.484375, - "learning_rate": 9.737715424402691e-06, - "loss": 0.10799139, - "memory(GiB)": 74.41, - "step": 880, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.10424949200459405, - "grad_norm": 5.09375, - "learning_rate": 9.73474978155765e-06, - "loss": 0.10723754, - "memory(GiB)": 74.41, - "step": 885, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.10483847218541097, - "grad_norm": 3.75, - "learning_rate": 9.73176792337777e-06, - "loss": 0.11358409, - "memory(GiB)": 74.41, - "step": 890, - "train_speed(iter/s)": 0.074927 - }, - { - "epoch": 0.10542745236622787, - "grad_norm": 3.640625, - "learning_rate": 9.728769860075175e-06, - "loss": 0.09777128, - "memory(GiB)": 74.41, - "step": 895, - "train_speed(iter/s)": 0.074932 - }, - { - "epoch": 0.10601643254704479, - "grad_norm": 4.21875, - "learning_rate": 9.725755601917478e-06, - "loss": 0.10329788, - "memory(GiB)": 74.41, - "step": 900, - "train_speed(iter/s)": 0.074938 - }, - { - "epoch": 0.10660541272786171, - "grad_norm": 5.84375, - "learning_rate": 9.722725159227765e-06, - "loss": 0.10509249, - "memory(GiB)": 74.41, - "step": 905, - "train_speed(iter/s)": 0.074943 - }, - { - "epoch": 0.10719439290867862, - "grad_norm": 3.390625, - "learning_rate": 9.71967854238454e-06, - "loss": 0.1047749, - "memory(GiB)": 74.41, - "step": 910, - "train_speed(iter/s)": 0.074947 - }, - { - "epoch": 0.10778337308949554, - "grad_norm": 4.40625, - "learning_rate": 9.716615761821708e-06, - "loss": 0.10192933, - "memory(GiB)": 74.41, - "step": 915, - "train_speed(iter/s)": 0.074949 - }, - { - "epoch": 0.10837235327031246, - "grad_norm": 4.3125, - "learning_rate": 9.713536828028524e-06, - "loss": 0.11062961, - "memory(GiB)": 74.41, - "step": 920, - "train_speed(iter/s)": 0.074953 - }, - { - "epoch": 0.10896133345112936, - "grad_norm": 4.65625, - "learning_rate": 9.710441751549569e-06, - "loss": 0.09096059, - "memory(GiB)": 74.41, - "step": 925, - "train_speed(iter/s)": 0.074956 - }, - { - "epoch": 0.10955031363194628, - "grad_norm": 4.625, - "learning_rate": 9.707330542984706e-06, - "loss": 0.09158493, - "memory(GiB)": 74.41, - "step": 930, - "train_speed(iter/s)": 0.07496 - }, - { - "epoch": 0.1101392938127632, - "grad_norm": 3.546875, - "learning_rate": 9.704203212989048e-06, - "loss": 0.10105819, - "memory(GiB)": 74.41, - "step": 935, - "train_speed(iter/s)": 0.07496 - }, - { - "epoch": 0.11072827399358011, - "grad_norm": 5.34375, - "learning_rate": 9.701059772272915e-06, - "loss": 0.09538058, - "memory(GiB)": 74.41, - "step": 940, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.11131725417439703, - "grad_norm": 3.78125, - "learning_rate": 9.69790023160181e-06, - "loss": 0.10084579, - "memory(GiB)": 74.41, - "step": 945, - "train_speed(iter/s)": 0.074966 - }, - { - "epoch": 0.11190623435521395, - "grad_norm": 4.09375, - "learning_rate": 9.69472460179637e-06, - "loss": 0.10691296, - "memory(GiB)": 74.41, - "step": 950, - "train_speed(iter/s)": 0.074968 - }, - { - "epoch": 0.11249521453603087, - "grad_norm": 4.25, - "learning_rate": 9.691532893732331e-06, - "loss": 0.096218, - "memory(GiB)": 74.41, - "step": 955, - "train_speed(iter/s)": 0.074971 - }, - { - "epoch": 0.11308419471684777, - "grad_norm": 3.921875, - "learning_rate": 9.688325118340496e-06, - "loss": 0.10754682, - "memory(GiB)": 74.41, - "step": 960, - "train_speed(iter/s)": 0.074974 - }, - { - "epoch": 0.1136731748976647, - "grad_norm": 3.875, - "learning_rate": 9.685101286606695e-06, - "loss": 0.10555474, - "memory(GiB)": 74.41, - "step": 965, - "train_speed(iter/s)": 0.074978 - }, - { - "epoch": 0.11426215507848161, - "grad_norm": 3.5625, - "learning_rate": 9.68186140957175e-06, - "loss": 0.08726187, - "memory(GiB)": 74.41, - "step": 970, - "train_speed(iter/s)": 0.074983 - }, - { - "epoch": 0.11485113525929852, - "grad_norm": 3.84375, - "learning_rate": 9.678605498331422e-06, - "loss": 0.08722601, - "memory(GiB)": 74.41, - "step": 975, - "train_speed(iter/s)": 0.074988 - }, - { - "epoch": 0.11544011544011544, - "grad_norm": 4.78125, - "learning_rate": 9.675333564036398e-06, - "loss": 0.09676975, - "memory(GiB)": 74.41, - "step": 980, - "train_speed(iter/s)": 0.07499 - }, - { - "epoch": 0.11602909562093236, - "grad_norm": 4.875, - "learning_rate": 9.672045617892235e-06, - "loss": 0.0948953, - "memory(GiB)": 74.41, - "step": 985, - "train_speed(iter/s)": 0.074991 - }, - { - "epoch": 0.11661807580174927, - "grad_norm": 3.234375, - "learning_rate": 9.668741671159329e-06, - "loss": 0.10143999, - "memory(GiB)": 74.41, - "step": 990, - "train_speed(iter/s)": 0.074992 - }, - { - "epoch": 0.11720705598256619, - "grad_norm": 3.96875, - "learning_rate": 9.66542173515287e-06, - "loss": 0.08537531, - "memory(GiB)": 74.41, - "step": 995, - "train_speed(iter/s)": 0.074997 - }, - { - "epoch": 0.1177960361633831, - "grad_norm": 4.65625, - "learning_rate": 9.662085821242807e-06, - "loss": 0.0954556, - "memory(GiB)": 74.41, - "step": 1000, - "train_speed(iter/s)": 0.074998 - }, - { - "epoch": 0.11838501634420001, - "grad_norm": 4.3125, - "learning_rate": 9.658733940853815e-06, - "loss": 0.09058597, - "memory(GiB)": 74.41, - "step": 1005, - "train_speed(iter/s)": 0.074817 - }, - { - "epoch": 0.11897399652501693, - "grad_norm": 4.53125, - "learning_rate": 9.655366105465244e-06, - "loss": 0.09935067, - "memory(GiB)": 74.41, - "step": 1010, - "train_speed(iter/s)": 0.07482 - }, - { - "epoch": 0.11956297670583385, - "grad_norm": 4.15625, - "learning_rate": 9.65198232661109e-06, - "loss": 0.08947166, - "memory(GiB)": 74.41, - "step": 1015, - "train_speed(iter/s)": 0.074825 - }, - { - "epoch": 0.12015195688665077, - "grad_norm": 3.453125, - "learning_rate": 9.648582615879952e-06, - "loss": 0.10199349, - "memory(GiB)": 74.41, - "step": 1020, - "train_speed(iter/s)": 0.074824 - }, - { - "epoch": 0.12074093706746768, - "grad_norm": 4.28125, - "learning_rate": 9.645166984914985e-06, - "loss": 0.10013585, - "memory(GiB)": 74.41, - "step": 1025, - "train_speed(iter/s)": 0.074828 - }, - { - "epoch": 0.1213299172482846, - "grad_norm": 3.140625, - "learning_rate": 9.641735445413875e-06, - "loss": 0.11086836, - "memory(GiB)": 74.41, - "step": 1030, - "train_speed(iter/s)": 0.074831 - }, - { - "epoch": 0.12191889742910152, - "grad_norm": 4.34375, - "learning_rate": 9.638288009128789e-06, - "loss": 0.09356855, - "memory(GiB)": 74.41, - "step": 1035, - "train_speed(iter/s)": 0.074832 - }, - { - "epoch": 0.12250787760991842, - "grad_norm": 3.5, - "learning_rate": 9.634824687866329e-06, - "loss": 0.09086785, - "memory(GiB)": 74.41, - "step": 1040, - "train_speed(iter/s)": 0.074832 - }, - { - "epoch": 0.12309685779073534, - "grad_norm": 4.4375, - "learning_rate": 9.631345493487509e-06, - "loss": 0.1021161, - "memory(GiB)": 74.41, - "step": 1045, - "train_speed(iter/s)": 0.074834 - }, - { - "epoch": 0.12368583797155226, - "grad_norm": 4.1875, - "learning_rate": 9.6278504379077e-06, - "loss": 0.10499916, - "memory(GiB)": 74.41, - "step": 1050, - "train_speed(iter/s)": 0.074835 - }, - { - "epoch": 0.12427481815236917, - "grad_norm": 3.5, - "learning_rate": 9.624339533096594e-06, - "loss": 0.08981788, - "memory(GiB)": 74.41, - "step": 1055, - "train_speed(iter/s)": 0.07484 - }, - { - "epoch": 0.12486379833318609, - "grad_norm": 3.65625, - "learning_rate": 9.620812791078161e-06, - "loss": 0.08886168, - "memory(GiB)": 74.41, - "step": 1060, - "train_speed(iter/s)": 0.074844 - }, - { - "epoch": 0.125452778514003, - "grad_norm": 3.828125, - "learning_rate": 9.617270223930613e-06, - "loss": 0.08076782, - "memory(GiB)": 74.41, - "step": 1065, - "train_speed(iter/s)": 0.074847 - }, - { - "epoch": 0.12604175869481993, - "grad_norm": 3.765625, - "learning_rate": 9.613711843786356e-06, - "loss": 0.08811791, - "memory(GiB)": 74.41, - "step": 1070, - "train_speed(iter/s)": 0.07485 - }, - { - "epoch": 0.12663073887563683, - "grad_norm": 5.71875, - "learning_rate": 9.610137662831953e-06, - "loss": 0.10126686, - "memory(GiB)": 74.41, - "step": 1075, - "train_speed(iter/s)": 0.074854 - }, - { - "epoch": 0.12721971905645374, - "grad_norm": 4.375, - "learning_rate": 9.60654769330808e-06, - "loss": 0.09718223, - "memory(GiB)": 74.41, - "step": 1080, - "train_speed(iter/s)": 0.074858 - }, - { - "epoch": 0.12780869923727067, - "grad_norm": 4.34375, - "learning_rate": 9.60294194750949e-06, - "loss": 0.09517021, - "memory(GiB)": 74.41, - "step": 1085, - "train_speed(iter/s)": 0.074863 - }, - { - "epoch": 0.12839767941808758, - "grad_norm": 4.0625, - "learning_rate": 9.599320437784953e-06, - "loss": 0.09423549, - "memory(GiB)": 74.41, - "step": 1090, - "train_speed(iter/s)": 0.074865 - }, - { - "epoch": 0.12898665959890449, - "grad_norm": 4.90625, - "learning_rate": 9.595683176537242e-06, - "loss": 0.10256985, - "memory(GiB)": 74.41, - "step": 1095, - "train_speed(iter/s)": 0.074868 - }, - { - "epoch": 0.12957563977972142, - "grad_norm": 4.65625, - "learning_rate": 9.592030176223066e-06, - "loss": 0.08859265, - "memory(GiB)": 74.41, - "step": 1100, - "train_speed(iter/s)": 0.074869 - }, - { - "epoch": 0.13016461996053832, - "grad_norm": 4.40625, - "learning_rate": 9.588361449353036e-06, - "loss": 0.08601279, - "memory(GiB)": 74.41, - "step": 1105, - "train_speed(iter/s)": 0.074871 - }, - { - "epoch": 0.13075360014135523, - "grad_norm": 4.625, - "learning_rate": 9.584677008491628e-06, - "loss": 0.12187407, - "memory(GiB)": 74.41, - "step": 1110, - "train_speed(iter/s)": 0.074876 - }, - { - "epoch": 0.13134258032217216, - "grad_norm": 3.34375, - "learning_rate": 9.580976866257129e-06, - "loss": 0.09723064, - "memory(GiB)": 74.41, - "step": 1115, - "train_speed(iter/s)": 0.074877 - }, - { - "epoch": 0.13193156050298907, - "grad_norm": 3.28125, - "learning_rate": 9.577261035321601e-06, - "loss": 0.09539672, - "memory(GiB)": 74.41, - "step": 1120, - "train_speed(iter/s)": 0.074878 - }, - { - "epoch": 0.13252054068380598, - "grad_norm": 4.25, - "learning_rate": 9.573529528410838e-06, - "loss": 0.09372305, - "memory(GiB)": 74.41, - "step": 1125, - "train_speed(iter/s)": 0.074879 - }, - { - "epoch": 0.1331095208646229, - "grad_norm": 3.296875, - "learning_rate": 9.569782358304319e-06, - "loss": 0.07629693, - "memory(GiB)": 74.41, - "step": 1130, - "train_speed(iter/s)": 0.074883 - }, - { - "epoch": 0.13369850104543982, - "grad_norm": 5.03125, - "learning_rate": 9.566019537835166e-06, - "loss": 0.0996025, - "memory(GiB)": 74.41, - "step": 1135, - "train_speed(iter/s)": 0.074886 - }, - { - "epoch": 0.13428748122625675, - "grad_norm": 4.28125, - "learning_rate": 9.562241079890093e-06, - "loss": 0.09327987, - "memory(GiB)": 74.41, - "step": 1140, - "train_speed(iter/s)": 0.074887 - }, - { - "epoch": 0.13487646140707366, - "grad_norm": 4.0625, - "learning_rate": 9.558446997409382e-06, - "loss": 0.09146149, - "memory(GiB)": 74.41, - "step": 1145, - "train_speed(iter/s)": 0.074889 - }, - { - "epoch": 0.13546544158789056, - "grad_norm": 3.234375, - "learning_rate": 9.554637303386813e-06, - "loss": 0.08869413, - "memory(GiB)": 74.41, - "step": 1150, - "train_speed(iter/s)": 0.074893 - }, - { - "epoch": 0.1360544217687075, - "grad_norm": 4.0625, - "learning_rate": 9.550812010869636e-06, - "loss": 0.08775733, - "memory(GiB)": 74.41, - "step": 1155, - "train_speed(iter/s)": 0.074892 - }, - { - "epoch": 0.1366434019495244, - "grad_norm": 5.0, - "learning_rate": 9.546971132958524e-06, - "loss": 0.09848154, - "memory(GiB)": 74.41, - "step": 1160, - "train_speed(iter/s)": 0.074895 - }, - { - "epoch": 0.1372323821303413, - "grad_norm": 4.0625, - "learning_rate": 9.543114682807525e-06, - "loss": 0.09634271, - "memory(GiB)": 74.41, - "step": 1165, - "train_speed(iter/s)": 0.074894 - }, - { - "epoch": 0.13782136231115824, - "grad_norm": 4.375, - "learning_rate": 9.539242673624012e-06, - "loss": 0.10609367, - "memory(GiB)": 74.41, - "step": 1170, - "train_speed(iter/s)": 0.074897 - }, - { - "epoch": 0.13841034249197515, - "grad_norm": 3.796875, - "learning_rate": 9.535355118668652e-06, - "loss": 0.10536243, - "memory(GiB)": 74.41, - "step": 1175, - "train_speed(iter/s)": 0.074901 - }, - { - "epoch": 0.13899932267279205, - "grad_norm": 3.578125, - "learning_rate": 9.53145203125535e-06, - "loss": 0.08514765, - "memory(GiB)": 74.41, - "step": 1180, - "train_speed(iter/s)": 0.074904 - }, - { - "epoch": 0.139588302853609, - "grad_norm": 3.765625, - "learning_rate": 9.527533424751206e-06, - "loss": 0.09435316, - "memory(GiB)": 74.41, - "step": 1185, - "train_speed(iter/s)": 0.074905 - }, - { - "epoch": 0.1401772830344259, - "grad_norm": 3.03125, - "learning_rate": 9.523599312576464e-06, - "loss": 0.0877773, - "memory(GiB)": 74.41, - "step": 1190, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.1407662632152428, - "grad_norm": 4.59375, - "learning_rate": 9.519649708204478e-06, - "loss": 0.08990437, - "memory(GiB)": 74.41, - "step": 1195, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.14135524339605973, - "grad_norm": 4.03125, - "learning_rate": 9.515684625161656e-06, - "loss": 0.11489943, - "memory(GiB)": 74.41, - "step": 1200, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.14194422357687664, - "grad_norm": 7.59375, - "learning_rate": 9.511704077027414e-06, - "loss": 0.0885065, - "memory(GiB)": 74.41, - "step": 1205, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.14253320375769354, - "grad_norm": 4.15625, - "learning_rate": 9.507708077434138e-06, - "loss": 0.08773028, - "memory(GiB)": 74.41, - "step": 1210, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.14312218393851048, - "grad_norm": 2.84375, - "learning_rate": 9.503696640067129e-06, - "loss": 0.09397745, - "memory(GiB)": 74.41, - "step": 1215, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.14371116411932738, - "grad_norm": 5.1875, - "learning_rate": 9.499669778664553e-06, - "loss": 0.10290816, - "memory(GiB)": 74.41, - "step": 1220, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.1443001443001443, - "grad_norm": 2.59375, - "learning_rate": 9.495627507017407e-06, - "loss": 0.09706104, - "memory(GiB)": 74.41, - "step": 1225, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.14488912448096122, - "grad_norm": 3.65625, - "learning_rate": 9.491569838969459e-06, - "loss": 0.08967851, - "memory(GiB)": 74.41, - "step": 1230, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.14547810466177813, - "grad_norm": 3.875, - "learning_rate": 9.487496788417211e-06, - "loss": 0.07626108, - "memory(GiB)": 74.41, - "step": 1235, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.14606708484259504, - "grad_norm": 5.125, - "learning_rate": 9.483408369309843e-06, - "loss": 0.11157024, - "memory(GiB)": 74.41, - "step": 1240, - "train_speed(iter/s)": 0.074906 - }, - { - "epoch": 0.14665606502341197, - "grad_norm": 3.71875, - "learning_rate": 9.479304595649166e-06, - "loss": 0.09179769, - "memory(GiB)": 74.41, - "step": 1245, - "train_speed(iter/s)": 0.074905 - }, - { - "epoch": 0.14724504520422887, - "grad_norm": 4.0625, - "learning_rate": 9.475185481489581e-06, - "loss": 0.10004801, - "memory(GiB)": 74.41, - "step": 1250, - "train_speed(iter/s)": 0.074906 - }, - { - "epoch": 0.14783402538504578, - "grad_norm": 3.265625, - "learning_rate": 9.471051040938026e-06, - "loss": 0.09620751, - "memory(GiB)": 74.41, - "step": 1255, - "train_speed(iter/s)": 0.074907 - }, - { - "epoch": 0.14842300556586271, - "grad_norm": 3.546875, - "learning_rate": 9.466901288153927e-06, - "loss": 0.08952096, - "memory(GiB)": 74.41, - "step": 1260, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.14901198574667962, - "grad_norm": 3.59375, - "learning_rate": 9.462736237349145e-06, - "loss": 0.08996918, - "memory(GiB)": 74.41, - "step": 1265, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.14960096592749653, - "grad_norm": 3.234375, - "learning_rate": 9.458555902787945e-06, - "loss": 0.09044787, - "memory(GiB)": 74.41, - "step": 1270, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.15018994610831346, - "grad_norm": 4.09375, - "learning_rate": 9.454360298786924e-06, - "loss": 0.08726504, - "memory(GiB)": 74.41, - "step": 1275, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.15077892628913037, - "grad_norm": 4.96875, - "learning_rate": 9.45014943971498e-06, - "loss": 0.0934383, - "memory(GiB)": 74.41, - "step": 1280, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.1513679064699473, - "grad_norm": 3.640625, - "learning_rate": 9.44592333999325e-06, - "loss": 0.08363155, - "memory(GiB)": 74.41, - "step": 1285, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.1519568866507642, - "grad_norm": 4.3125, - "learning_rate": 9.441682014095071e-06, - "loss": 0.08424637, - "memory(GiB)": 74.41, - "step": 1290, - "train_speed(iter/s)": 0.074918 - }, - { - "epoch": 0.1525458668315811, - "grad_norm": 3.734375, - "learning_rate": 9.437425476545926e-06, - "loss": 0.09210879, - "memory(GiB)": 74.41, - "step": 1295, - "train_speed(iter/s)": 0.074919 - }, - { - "epoch": 0.15313484701239805, - "grad_norm": 3.234375, - "learning_rate": 9.43315374192339e-06, - "loss": 0.09213892, - "memory(GiB)": 74.41, - "step": 1300, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.15372382719321495, - "grad_norm": 4.5, - "learning_rate": 9.428866824857087e-06, - "loss": 0.08642803, - "memory(GiB)": 74.41, - "step": 1305, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.15431280737403186, - "grad_norm": 4.0, - "learning_rate": 9.424564740028638e-06, - "loss": 0.08265491, - "memory(GiB)": 74.41, - "step": 1310, - "train_speed(iter/s)": 0.074928 - }, - { - "epoch": 0.1549017875548488, - "grad_norm": 4.15625, - "learning_rate": 9.420247502171606e-06, - "loss": 0.08584201, - "memory(GiB)": 74.41, - "step": 1315, - "train_speed(iter/s)": 0.07493 - }, - { - "epoch": 0.1554907677356657, - "grad_norm": 5.21875, - "learning_rate": 9.415915126071456e-06, - "loss": 0.09583381, - "memory(GiB)": 74.41, - "step": 1320, - "train_speed(iter/s)": 0.074932 - }, - { - "epoch": 0.1560797479164826, - "grad_norm": 5.8125, - "learning_rate": 9.41156762656549e-06, - "loss": 0.07853642, - "memory(GiB)": 74.41, - "step": 1325, - "train_speed(iter/s)": 0.074936 - }, - { - "epoch": 0.15666872809729954, - "grad_norm": 4.4375, - "learning_rate": 9.407205018542807e-06, - "loss": 0.11025757, - "memory(GiB)": 74.41, - "step": 1330, - "train_speed(iter/s)": 0.07494 - }, - { - "epoch": 0.15725770827811644, - "grad_norm": 3.546875, - "learning_rate": 9.402827316944253e-06, - "loss": 0.10197206, - "memory(GiB)": 74.41, - "step": 1335, - "train_speed(iter/s)": 0.074941 - }, - { - "epoch": 0.15784668845893335, - "grad_norm": 3.6875, - "learning_rate": 9.398434536762359e-06, - "loss": 0.09212865, - "memory(GiB)": 74.41, - "step": 1340, - "train_speed(iter/s)": 0.074943 - }, - { - "epoch": 0.15843566863975028, - "grad_norm": 3.703125, - "learning_rate": 9.3940266930413e-06, - "loss": 0.08865182, - "memory(GiB)": 74.41, - "step": 1345, - "train_speed(iter/s)": 0.074945 - }, - { - "epoch": 0.1590246488205672, - "grad_norm": 2.890625, - "learning_rate": 9.389603800876838e-06, - "loss": 0.08562068, - "memory(GiB)": 74.41, - "step": 1350, - "train_speed(iter/s)": 0.074948 - }, - { - "epoch": 0.1596136290013841, - "grad_norm": 4.6875, - "learning_rate": 9.385165875416278e-06, - "loss": 0.1068722, - "memory(GiB)": 74.41, - "step": 1355, - "train_speed(iter/s)": 0.074947 - }, - { - "epoch": 0.16020260918220103, - "grad_norm": 3.5, - "learning_rate": 9.3807129318584e-06, - "loss": 0.09370728, - "memory(GiB)": 74.41, - "step": 1360, - "train_speed(iter/s)": 0.074948 - }, - { - "epoch": 0.16079158936301793, - "grad_norm": 5.0625, - "learning_rate": 9.376244985453427e-06, - "loss": 0.08163351, - "memory(GiB)": 74.41, - "step": 1365, - "train_speed(iter/s)": 0.07495 - }, - { - "epoch": 0.16138056954383484, - "grad_norm": 5.0, - "learning_rate": 9.371762051502957e-06, - "loss": 0.10225484, - "memory(GiB)": 74.41, - "step": 1370, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.16196954972465177, - "grad_norm": 3.5, - "learning_rate": 9.36726414535992e-06, - "loss": 0.10129716, - "memory(GiB)": 74.41, - "step": 1375, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.16255852990546868, - "grad_norm": 3.640625, - "learning_rate": 9.362751282428519e-06, - "loss": 0.08826185, - "memory(GiB)": 74.41, - "step": 1380, - "train_speed(iter/s)": 0.074953 - }, - { - "epoch": 0.16314751008628559, - "grad_norm": 3.890625, - "learning_rate": 9.35822347816418e-06, - "loss": 0.10849725, - "memory(GiB)": 74.41, - "step": 1385, - "train_speed(iter/s)": 0.074955 - }, - { - "epoch": 0.16373649026710252, - "grad_norm": 3.984375, - "learning_rate": 9.353680748073509e-06, - "loss": 0.10136243, - "memory(GiB)": 74.41, - "step": 1390, - "train_speed(iter/s)": 0.074957 - }, - { - "epoch": 0.16432547044791943, - "grad_norm": 3.75, - "learning_rate": 9.349123107714211e-06, - "loss": 0.08352109, - "memory(GiB)": 74.41, - "step": 1395, - "train_speed(iter/s)": 0.07496 - }, - { - "epoch": 0.16491445062873633, - "grad_norm": 4.65625, - "learning_rate": 9.344550572695073e-06, - "loss": 0.1029326, - "memory(GiB)": 74.41, - "step": 1400, - "train_speed(iter/s)": 0.074961 - }, - { - "epoch": 0.16550343080955326, - "grad_norm": 4.09375, - "learning_rate": 9.33996315867588e-06, - "loss": 0.10244679, - "memory(GiB)": 74.41, - "step": 1405, - "train_speed(iter/s)": 0.074964 - }, - { - "epoch": 0.16609241099037017, - "grad_norm": 4.3125, - "learning_rate": 9.335360881367381e-06, - "loss": 0.09310257, - "memory(GiB)": 74.41, - "step": 1410, - "train_speed(iter/s)": 0.074966 - }, - { - "epoch": 0.16668139117118708, - "grad_norm": 4.03125, - "learning_rate": 9.330743756531227e-06, - "loss": 0.09983196, - "memory(GiB)": 74.41, - "step": 1415, - "train_speed(iter/s)": 0.074968 - }, - { - "epoch": 0.167270371352004, - "grad_norm": 3.453125, - "learning_rate": 9.326111799979915e-06, - "loss": 0.09484429, - "memory(GiB)": 74.41, - "step": 1420, - "train_speed(iter/s)": 0.074967 - }, - { - "epoch": 0.16785935153282092, - "grad_norm": 4.25, - "learning_rate": 9.321465027576738e-06, - "loss": 0.08307941, - "memory(GiB)": 74.41, - "step": 1425, - "train_speed(iter/s)": 0.074967 - }, - { - "epoch": 0.16844833171363785, - "grad_norm": 4.03125, - "learning_rate": 9.316803455235732e-06, - "loss": 0.09872603, - "memory(GiB)": 74.41, - "step": 1430, - "train_speed(iter/s)": 0.074966 - }, - { - "epoch": 0.16903731189445476, - "grad_norm": 3.078125, - "learning_rate": 9.312127098921614e-06, - "loss": 0.09692515, - "memory(GiB)": 74.41, - "step": 1435, - "train_speed(iter/s)": 0.074965 - }, - { - "epoch": 0.16962629207527166, - "grad_norm": 4.09375, - "learning_rate": 9.307435974649744e-06, - "loss": 0.09557692, - "memory(GiB)": 74.41, - "step": 1440, - "train_speed(iter/s)": 0.074968 - }, - { - "epoch": 0.1702152722560886, - "grad_norm": 4.09375, - "learning_rate": 9.30273009848604e-06, - "loss": 0.10825093, - "memory(GiB)": 74.41, - "step": 1445, - "train_speed(iter/s)": 0.074971 - }, - { - "epoch": 0.1708042524369055, - "grad_norm": 4.03125, - "learning_rate": 9.29800948654696e-06, - "loss": 0.09957713, - "memory(GiB)": 74.41, - "step": 1450, - "train_speed(iter/s)": 0.074971 - }, - { - "epoch": 0.1713932326177224, - "grad_norm": 3.71875, - "learning_rate": 9.293274154999415e-06, - "loss": 0.08825269, - "memory(GiB)": 74.41, - "step": 1455, - "train_speed(iter/s)": 0.074974 - }, - { - "epoch": 0.17198221279853934, - "grad_norm": 3.6875, - "learning_rate": 9.288524120060733e-06, - "loss": 0.1051899, - "memory(GiB)": 74.41, - "step": 1460, - "train_speed(iter/s)": 0.074977 - }, - { - "epoch": 0.17257119297935625, - "grad_norm": 3.859375, - "learning_rate": 9.2837593979986e-06, - "loss": 0.09597876, - "memory(GiB)": 74.41, - "step": 1465, - "train_speed(iter/s)": 0.07498 - }, - { - "epoch": 0.17316017316017315, - "grad_norm": 4.65625, - "learning_rate": 9.278980005130996e-06, - "loss": 0.10092977, - "memory(GiB)": 74.41, - "step": 1470, - "train_speed(iter/s)": 0.074982 - }, - { - "epoch": 0.1737491533409901, - "grad_norm": 4.0, - "learning_rate": 9.274185957826147e-06, - "loss": 0.10035901, - "memory(GiB)": 74.41, - "step": 1475, - "train_speed(iter/s)": 0.074984 - }, - { - "epoch": 0.174338133521807, - "grad_norm": 4.125, - "learning_rate": 9.26937727250247e-06, - "loss": 0.08626782, - "memory(GiB)": 74.41, - "step": 1480, - "train_speed(iter/s)": 0.074985 - }, - { - "epoch": 0.1749271137026239, - "grad_norm": 4.59375, - "learning_rate": 9.264553965628506e-06, - "loss": 0.087044, - "memory(GiB)": 74.41, - "step": 1485, - "train_speed(iter/s)": 0.074984 - }, - { - "epoch": 0.17551609388344083, - "grad_norm": 4.09375, - "learning_rate": 9.259716053722878e-06, - "loss": 0.10160702, - "memory(GiB)": 74.41, - "step": 1490, - "train_speed(iter/s)": 0.074985 - }, - { - "epoch": 0.17610507406425774, - "grad_norm": 3.875, - "learning_rate": 9.254863553354224e-06, - "loss": 0.07840201, - "memory(GiB)": 74.41, - "step": 1495, - "train_speed(iter/s)": 0.074988 - }, - { - "epoch": 0.17669405424507464, - "grad_norm": 3.25, - "learning_rate": 9.249996481141149e-06, - "loss": 0.08488679, - "memory(GiB)": 74.41, - "step": 1500, - "train_speed(iter/s)": 0.074991 - }, - { - "epoch": 0.17728303442589158, - "grad_norm": 3.96875, - "learning_rate": 9.245114853752157e-06, - "loss": 0.10635732, - "memory(GiB)": 74.41, - "step": 1505, - "train_speed(iter/s)": 0.074857 - }, - { - "epoch": 0.17787201460670848, - "grad_norm": 3.921875, - "learning_rate": 9.240218687905598e-06, - "loss": 0.08825208, - "memory(GiB)": 74.41, - "step": 1510, - "train_speed(iter/s)": 0.074857 - }, - { - "epoch": 0.1784609947875254, - "grad_norm": 4.28125, - "learning_rate": 9.235308000369623e-06, - "loss": 0.09694942, - "memory(GiB)": 74.41, - "step": 1515, - "train_speed(iter/s)": 0.074857 - }, - { - "epoch": 0.17904997496834232, - "grad_norm": 4.21875, - "learning_rate": 9.230382807962106e-06, - "loss": 0.10525234, - "memory(GiB)": 74.41, - "step": 1520, - "train_speed(iter/s)": 0.074859 - }, - { - "epoch": 0.17963895514915923, - "grad_norm": 3.46875, - "learning_rate": 9.225443127550601e-06, - "loss": 0.10098197, - "memory(GiB)": 74.41, - "step": 1525, - "train_speed(iter/s)": 0.07486 - }, - { - "epoch": 0.18022793532997614, - "grad_norm": 4.9375, - "learning_rate": 9.220488976052279e-06, - "loss": 0.09873599, - "memory(GiB)": 74.41, - "step": 1530, - "train_speed(iter/s)": 0.074861 - }, - { - "epoch": 0.18081691551079307, - "grad_norm": 3.546875, - "learning_rate": 9.21552037043387e-06, - "loss": 0.08211206, - "memory(GiB)": 74.41, - "step": 1535, - "train_speed(iter/s)": 0.074863 - }, - { - "epoch": 0.18140589569160998, - "grad_norm": 3.1875, - "learning_rate": 9.210537327711608e-06, - "loss": 0.09001173, - "memory(GiB)": 74.41, - "step": 1540, - "train_speed(iter/s)": 0.074863 - }, - { - "epoch": 0.18199487587242688, - "grad_norm": 5.53125, - "learning_rate": 9.205539864951169e-06, - "loss": 0.1033685, - "memory(GiB)": 74.41, - "step": 1545, - "train_speed(iter/s)": 0.074866 - }, - { - "epoch": 0.18258385605324381, - "grad_norm": 3.390625, - "learning_rate": 9.200527999267614e-06, - "loss": 0.09862096, - "memory(GiB)": 74.41, - "step": 1550, - "train_speed(iter/s)": 0.074867 - }, - { - "epoch": 0.18317283623406072, - "grad_norm": 3.609375, - "learning_rate": 9.19550174782533e-06, - "loss": 0.07522287, - "memory(GiB)": 74.41, - "step": 1555, - "train_speed(iter/s)": 0.074869 - }, - { - "epoch": 0.18376181641487763, - "grad_norm": 4.15625, - "learning_rate": 9.190461127837971e-06, - "loss": 0.07987702, - "memory(GiB)": 74.41, - "step": 1560, - "train_speed(iter/s)": 0.074872 - }, - { - "epoch": 0.18435079659569456, - "grad_norm": 3.1875, - "learning_rate": 9.185406156568404e-06, - "loss": 0.08131886, - "memory(GiB)": 74.41, - "step": 1565, - "train_speed(iter/s)": 0.074875 - }, - { - "epoch": 0.18493977677651147, - "grad_norm": 3.15625, - "learning_rate": 9.18033685132864e-06, - "loss": 0.09074624, - "memory(GiB)": 74.41, - "step": 1570, - "train_speed(iter/s)": 0.074877 - }, - { - "epoch": 0.18552875695732837, - "grad_norm": 3.390625, - "learning_rate": 9.175253229479784e-06, - "loss": 0.08689606, - "memory(GiB)": 74.41, - "step": 1575, - "train_speed(iter/s)": 0.074878 - }, - { - "epoch": 0.1861177371381453, - "grad_norm": 2.953125, - "learning_rate": 9.17015530843197e-06, - "loss": 0.09036893, - "memory(GiB)": 74.41, - "step": 1580, - "train_speed(iter/s)": 0.07488 - }, - { - "epoch": 0.1867067173189622, - "grad_norm": 3.921875, - "learning_rate": 9.165043105644303e-06, - "loss": 0.08858083, - "memory(GiB)": 74.41, - "step": 1585, - "train_speed(iter/s)": 0.074883 - }, - { - "epoch": 0.18729569749977915, - "grad_norm": 3.65625, - "learning_rate": 9.159916638624801e-06, - "loss": 0.08610859, - "memory(GiB)": 74.41, - "step": 1590, - "train_speed(iter/s)": 0.074887 - }, - { - "epoch": 0.18788467768059605, - "grad_norm": 3.4375, - "learning_rate": 9.154775924930333e-06, - "loss": 0.09216586, - "memory(GiB)": 74.41, - "step": 1595, - "train_speed(iter/s)": 0.074888 - }, - { - "epoch": 0.18847365786141296, - "grad_norm": 4.59375, - "learning_rate": 9.149620982166557e-06, - "loss": 0.09575672, - "memory(GiB)": 74.41, - "step": 1600, - "train_speed(iter/s)": 0.074892 - }, - { - "epoch": 0.1890626380422299, - "grad_norm": 3.5, - "learning_rate": 9.144451827987865e-06, - "loss": 0.10303175, - "memory(GiB)": 74.41, - "step": 1605, - "train_speed(iter/s)": 0.074892 - }, - { - "epoch": 0.1896516182230468, - "grad_norm": 4.3125, - "learning_rate": 9.139268480097317e-06, - "loss": 0.10116973, - "memory(GiB)": 74.41, - "step": 1610, - "train_speed(iter/s)": 0.074894 - }, - { - "epoch": 0.1902405984038637, - "grad_norm": 4.65625, - "learning_rate": 9.134070956246586e-06, - "loss": 0.08978627, - "memory(GiB)": 74.41, - "step": 1615, - "train_speed(iter/s)": 0.074895 - }, - { - "epoch": 0.19082957858468064, - "grad_norm": 3.46875, - "learning_rate": 9.128859274235893e-06, - "loss": 0.09291114, - "memory(GiB)": 74.41, - "step": 1620, - "train_speed(iter/s)": 0.074897 - }, - { - "epoch": 0.19141855876549754, - "grad_norm": 4.1875, - "learning_rate": 9.123633451913945e-06, - "loss": 0.0884718, - "memory(GiB)": 74.41, - "step": 1625, - "train_speed(iter/s)": 0.074899 - }, - { - "epoch": 0.19200753894631445, - "grad_norm": 3.78125, - "learning_rate": 9.118393507177877e-06, - "loss": 0.08941454, - "memory(GiB)": 74.41, - "step": 1630, - "train_speed(iter/s)": 0.074901 - }, - { - "epoch": 0.19259651912713138, - "grad_norm": 4.0625, - "learning_rate": 9.11313945797319e-06, - "loss": 0.08903505, - "memory(GiB)": 74.41, - "step": 1635, - "train_speed(iter/s)": 0.074903 - }, - { - "epoch": 0.1931854993079483, - "grad_norm": 3.390625, - "learning_rate": 9.107871322293694e-06, - "loss": 0.06867069, - "memory(GiB)": 74.41, - "step": 1640, - "train_speed(iter/s)": 0.074906 - }, - { - "epoch": 0.1937744794887652, - "grad_norm": 4.25, - "learning_rate": 9.102589118181432e-06, - "loss": 0.09329748, - "memory(GiB)": 74.41, - "step": 1645, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.19436345966958213, - "grad_norm": 6.375, - "learning_rate": 9.097292863726635e-06, - "loss": 0.10539714, - "memory(GiB)": 74.41, - "step": 1650, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.19495243985039903, - "grad_norm": 4.0625, - "learning_rate": 9.091982577067651e-06, - "loss": 0.09411023, - "memory(GiB)": 74.41, - "step": 1655, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.19554142003121594, - "grad_norm": 3.25, - "learning_rate": 9.086658276390883e-06, - "loss": 0.08853958, - "memory(GiB)": 74.41, - "step": 1660, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.19613040021203287, - "grad_norm": 4.34375, - "learning_rate": 9.081319979930735e-06, - "loss": 0.08764254, - "memory(GiB)": 74.41, - "step": 1665, - "train_speed(iter/s)": 0.074914 - }, - { - "epoch": 0.19671938039284978, - "grad_norm": 3.078125, - "learning_rate": 9.075967705969536e-06, - "loss": 0.09640335, - "memory(GiB)": 74.41, - "step": 1670, - "train_speed(iter/s)": 0.074915 - }, - { - "epoch": 0.19730836057366669, - "grad_norm": 3.875, - "learning_rate": 9.070601472837486e-06, - "loss": 0.08481975, - "memory(GiB)": 74.41, - "step": 1675, - "train_speed(iter/s)": 0.074917 - }, - { - "epoch": 0.19789734075448362, - "grad_norm": 3.28125, - "learning_rate": 9.065221298912592e-06, - "loss": 0.09183933, - "memory(GiB)": 74.41, - "step": 1680, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.19848632093530053, - "grad_norm": 5.0, - "learning_rate": 9.059827202620608e-06, - "loss": 0.08611407, - "memory(GiB)": 74.41, - "step": 1685, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.19907530111611743, - "grad_norm": 3.9375, - "learning_rate": 9.054419202434963e-06, - "loss": 0.08965172, - "memory(GiB)": 74.41, - "step": 1690, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.19966428129693436, - "grad_norm": 3.984375, - "learning_rate": 9.048997316876707e-06, - "loss": 0.09860962, - "memory(GiB)": 74.41, - "step": 1695, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.20025326147775127, - "grad_norm": 3.671875, - "learning_rate": 9.043561564514445e-06, - "loss": 0.09100879, - "memory(GiB)": 74.41, - "step": 1700, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.20084224165856818, - "grad_norm": 4.375, - "learning_rate": 9.038111963964266e-06, - "loss": 0.09089627, - "memory(GiB)": 74.41, - "step": 1705, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.2014312218393851, - "grad_norm": 2.734375, - "learning_rate": 9.032648533889697e-06, - "loss": 0.08130147, - "memory(GiB)": 74.41, - "step": 1710, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.20202020202020202, - "grad_norm": 3.140625, - "learning_rate": 9.027171293001615e-06, - "loss": 0.09625549, - "memory(GiB)": 74.41, - "step": 1715, - "train_speed(iter/s)": 0.074928 - }, - { - "epoch": 0.20260918220101892, - "grad_norm": 4.65625, - "learning_rate": 9.021680260058203e-06, - "loss": 0.09061633, - "memory(GiB)": 74.41, - "step": 1720, - "train_speed(iter/s)": 0.07493 - }, - { - "epoch": 0.20319816238183586, - "grad_norm": 4.3125, - "learning_rate": 9.016175453864877e-06, - "loss": 0.09252499, - "memory(GiB)": 74.41, - "step": 1725, - "train_speed(iter/s)": 0.074931 - }, - { - "epoch": 0.20378714256265276, - "grad_norm": 3.40625, - "learning_rate": 9.010656893274222e-06, - "loss": 0.0898882, - "memory(GiB)": 74.41, - "step": 1730, - "train_speed(iter/s)": 0.074933 - }, - { - "epoch": 0.2043761227434697, - "grad_norm": 3.90625, - "learning_rate": 9.00512459718593e-06, - "loss": 0.08716853, - "memory(GiB)": 74.41, - "step": 1735, - "train_speed(iter/s)": 0.074935 - }, - { - "epoch": 0.2049651029242866, - "grad_norm": 3.890625, - "learning_rate": 8.999578584546733e-06, - "loss": 0.08696725, - "memory(GiB)": 74.41, - "step": 1740, - "train_speed(iter/s)": 0.074937 - }, - { - "epoch": 0.2055540831051035, - "grad_norm": 4.25, - "learning_rate": 8.994018874350337e-06, - "loss": 0.09265312, - "memory(GiB)": 74.41, - "step": 1745, - "train_speed(iter/s)": 0.074939 - }, - { - "epoch": 0.20614306328592044, - "grad_norm": 4.59375, - "learning_rate": 8.98844548563736e-06, - "loss": 0.09432794, - "memory(GiB)": 74.41, - "step": 1750, - "train_speed(iter/s)": 0.07494 - }, - { - "epoch": 0.20673204346673735, - "grad_norm": 3.171875, - "learning_rate": 8.982858437495267e-06, - "loss": 0.08902328, - "memory(GiB)": 74.41, - "step": 1755, - "train_speed(iter/s)": 0.07494 - }, - { - "epoch": 0.20732102364755425, - "grad_norm": 4.28125, - "learning_rate": 8.977257749058302e-06, - "loss": 0.09804455, - "memory(GiB)": 74.41, - "step": 1760, - "train_speed(iter/s)": 0.074942 - }, - { - "epoch": 0.2079100038283712, - "grad_norm": 3.4375, - "learning_rate": 8.97164343950742e-06, - "loss": 0.08721474, - "memory(GiB)": 74.41, - "step": 1765, - "train_speed(iter/s)": 0.074944 - }, - { - "epoch": 0.2084989840091881, - "grad_norm": 4.59375, - "learning_rate": 8.966015528070232e-06, - "loss": 0.09626728, - "memory(GiB)": 74.41, - "step": 1770, - "train_speed(iter/s)": 0.074946 - }, - { - "epoch": 0.209087964190005, - "grad_norm": 3.890625, - "learning_rate": 8.960374034020926e-06, - "loss": 0.0885976, - "memory(GiB)": 74.41, - "step": 1775, - "train_speed(iter/s)": 0.074948 - }, - { - "epoch": 0.20967694437082193, - "grad_norm": 4.46875, - "learning_rate": 8.954718976680212e-06, - "loss": 0.09599138, - "memory(GiB)": 74.41, - "step": 1780, - "train_speed(iter/s)": 0.074949 - }, - { - "epoch": 0.21026592455163884, - "grad_norm": 3.828125, - "learning_rate": 8.949050375415245e-06, - "loss": 0.09163067, - "memory(GiB)": 74.41, - "step": 1785, - "train_speed(iter/s)": 0.07495 - }, - { - "epoch": 0.21085490473245574, - "grad_norm": 3.53125, - "learning_rate": 8.94336824963957e-06, - "loss": 0.09342519, - "memory(GiB)": 74.41, - "step": 1790, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.21144388491327268, - "grad_norm": 3.375, - "learning_rate": 8.937672618813046e-06, - "loss": 0.08690483, - "memory(GiB)": 74.41, - "step": 1795, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.21203286509408958, - "grad_norm": 3.25, - "learning_rate": 8.931963502441788e-06, - "loss": 0.0816748, - "memory(GiB)": 74.41, - "step": 1800, - "train_speed(iter/s)": 0.07495 - }, - { - "epoch": 0.2126218452749065, - "grad_norm": 3.984375, - "learning_rate": 8.926240920078091e-06, - "loss": 0.08076277, - "memory(GiB)": 74.41, - "step": 1805, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.21321082545572342, - "grad_norm": 2.90625, - "learning_rate": 8.92050489132037e-06, - "loss": 0.08003956, - "memory(GiB)": 74.41, - "step": 1810, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.21379980563654033, - "grad_norm": 3.421875, - "learning_rate": 8.91475543581309e-06, - "loss": 0.09223965, - "memory(GiB)": 74.41, - "step": 1815, - "train_speed(iter/s)": 0.074954 - }, - { - "epoch": 0.21438878581735724, - "grad_norm": 6.0625, - "learning_rate": 8.9089925732467e-06, - "loss": 0.09530929, - "memory(GiB)": 74.41, - "step": 1820, - "train_speed(iter/s)": 0.074956 - }, - { - "epoch": 0.21497776599817417, - "grad_norm": 3.765625, - "learning_rate": 8.903216323357562e-06, - "loss": 0.09788524, - "memory(GiB)": 74.41, - "step": 1825, - "train_speed(iter/s)": 0.074956 - }, - { - "epoch": 0.21556674617899108, - "grad_norm": 3.328125, - "learning_rate": 8.897426705927892e-06, - "loss": 0.07674338, - "memory(GiB)": 74.41, - "step": 1830, - "train_speed(iter/s)": 0.074958 - }, - { - "epoch": 0.21615572635980798, - "grad_norm": 3.421875, - "learning_rate": 8.89162374078568e-06, - "loss": 0.09034337, - "memory(GiB)": 74.41, - "step": 1835, - "train_speed(iter/s)": 0.074957 - }, - { - "epoch": 0.21674470654062492, - "grad_norm": 3.859375, - "learning_rate": 8.885807447804633e-06, - "loss": 0.08671494, - "memory(GiB)": 74.41, - "step": 1840, - "train_speed(iter/s)": 0.074959 - }, - { - "epoch": 0.21733368672144182, - "grad_norm": 4.53125, - "learning_rate": 8.8799778469041e-06, - "loss": 0.08138096, - "memory(GiB)": 74.41, - "step": 1845, - "train_speed(iter/s)": 0.074961 - }, - { - "epoch": 0.21792266690225873, - "grad_norm": 3.65625, - "learning_rate": 8.87413495804901e-06, - "loss": 0.08538883, - "memory(GiB)": 74.41, - "step": 1850, - "train_speed(iter/s)": 0.074961 - }, - { - "epoch": 0.21851164708307566, - "grad_norm": 3.390625, - "learning_rate": 8.868278801249794e-06, - "loss": 0.09670607, - "memory(GiB)": 74.41, - "step": 1855, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.21910062726389257, - "grad_norm": 3.375, - "learning_rate": 8.862409396562326e-06, - "loss": 0.09224599, - "memory(GiB)": 74.41, - "step": 1860, - "train_speed(iter/s)": 0.074963 - }, - { - "epoch": 0.21968960744470947, - "grad_norm": 3.625, - "learning_rate": 8.856526764087852e-06, - "loss": 0.10004935, - "memory(GiB)": 74.41, - "step": 1865, - "train_speed(iter/s)": 0.074963 - }, - { - "epoch": 0.2202785876255264, - "grad_norm": 2.484375, - "learning_rate": 8.850630923972919e-06, - "loss": 0.08341603, - "memory(GiB)": 74.41, - "step": 1870, - "train_speed(iter/s)": 0.074965 - }, - { - "epoch": 0.2208675678063433, - "grad_norm": 3.4375, - "learning_rate": 8.844721896409302e-06, - "loss": 0.08883305, - "memory(GiB)": 74.41, - "step": 1875, - "train_speed(iter/s)": 0.074967 - }, - { - "epoch": 0.22145654798716022, - "grad_norm": 4.8125, - "learning_rate": 8.838799701633946e-06, - "loss": 0.09026896, - "memory(GiB)": 74.41, - "step": 1880, - "train_speed(iter/s)": 0.07497 - }, - { - "epoch": 0.22204552816797715, - "grad_norm": 3.328125, - "learning_rate": 8.832864359928892e-06, - "loss": 0.07970605, - "memory(GiB)": 74.41, - "step": 1885, - "train_speed(iter/s)": 0.07497 - }, - { - "epoch": 0.22263450834879406, - "grad_norm": 4.0625, - "learning_rate": 8.826915891621196e-06, - "loss": 0.09585873, - "memory(GiB)": 74.41, - "step": 1890, - "train_speed(iter/s)": 0.074971 - }, - { - "epoch": 0.223223488529611, - "grad_norm": 4.375, - "learning_rate": 8.820954317082879e-06, - "loss": 0.08822346, - "memory(GiB)": 74.41, - "step": 1895, - "train_speed(iter/s)": 0.074973 - }, - { - "epoch": 0.2238124687104279, - "grad_norm": 4.34375, - "learning_rate": 8.814979656730841e-06, - "loss": 0.08403029, - "memory(GiB)": 74.41, - "step": 1900, - "train_speed(iter/s)": 0.074974 - }, - { - "epoch": 0.2244014488912448, - "grad_norm": 3.21875, - "learning_rate": 8.808991931026806e-06, - "loss": 0.08884512, - "memory(GiB)": 74.41, - "step": 1905, - "train_speed(iter/s)": 0.074976 - }, - { - "epoch": 0.22499042907206174, - "grad_norm": 3.453125, - "learning_rate": 8.802991160477235e-06, - "loss": 0.07620925, - "memory(GiB)": 74.41, - "step": 1910, - "train_speed(iter/s)": 0.074977 - }, - { - "epoch": 0.22557940925287864, - "grad_norm": 3.421875, - "learning_rate": 8.796977365633265e-06, - "loss": 0.08231162, - "memory(GiB)": 74.41, - "step": 1915, - "train_speed(iter/s)": 0.074979 - }, - { - "epoch": 0.22616838943369555, - "grad_norm": 3.328125, - "learning_rate": 8.790950567090646e-06, - "loss": 0.08745655, - "memory(GiB)": 74.41, - "step": 1920, - "train_speed(iter/s)": 0.07498 - }, - { - "epoch": 0.22675736961451248, - "grad_norm": 3.984375, - "learning_rate": 8.784910785489653e-06, - "loss": 0.09518582, - "memory(GiB)": 74.41, - "step": 1925, - "train_speed(iter/s)": 0.074983 - }, - { - "epoch": 0.2273463497953294, - "grad_norm": 3.34375, - "learning_rate": 8.77885804151503e-06, - "loss": 0.08628204, - "memory(GiB)": 74.41, - "step": 1930, - "train_speed(iter/s)": 0.074984 - }, - { - "epoch": 0.2279353299761463, - "grad_norm": 3.859375, - "learning_rate": 8.772792355895914e-06, - "loss": 0.11409738, - "memory(GiB)": 74.41, - "step": 1935, - "train_speed(iter/s)": 0.074984 - }, - { - "epoch": 0.22852431015696323, - "grad_norm": 2.984375, - "learning_rate": 8.76671374940576e-06, - "loss": 0.08102646, - "memory(GiB)": 74.41, - "step": 1940, - "train_speed(iter/s)": 0.074986 - }, - { - "epoch": 0.22911329033778013, - "grad_norm": 4.09375, - "learning_rate": 8.760622242862278e-06, - "loss": 0.08864189, - "memory(GiB)": 74.41, - "step": 1945, - "train_speed(iter/s)": 0.074989 - }, - { - "epoch": 0.22970227051859704, - "grad_norm": 3.671875, - "learning_rate": 8.754517857127354e-06, - "loss": 0.07894121, - "memory(GiB)": 74.41, - "step": 1950, - "train_speed(iter/s)": 0.074991 - }, - { - "epoch": 0.23029125069941397, - "grad_norm": 3.71875, - "learning_rate": 8.748400613106987e-06, - "loss": 0.08113719, - "memory(GiB)": 74.41, - "step": 1955, - "train_speed(iter/s)": 0.074993 - }, - { - "epoch": 0.23088023088023088, - "grad_norm": 3.359375, - "learning_rate": 8.742270531751205e-06, - "loss": 0.07365735, - "memory(GiB)": 74.41, - "step": 1960, - "train_speed(iter/s)": 0.074995 - }, - { - "epoch": 0.23146921106104779, - "grad_norm": 2.765625, - "learning_rate": 8.736127634054005e-06, - "loss": 0.09221531, - "memory(GiB)": 74.41, - "step": 1965, - "train_speed(iter/s)": 0.074996 - }, - { - "epoch": 0.23205819124186472, - "grad_norm": 3.71875, - "learning_rate": 8.729971941053281e-06, - "loss": 0.07759452, - "memory(GiB)": 74.41, - "step": 1970, - "train_speed(iter/s)": 0.074998 - }, - { - "epoch": 0.23264717142268163, - "grad_norm": 4.03125, - "learning_rate": 8.723803473830742e-06, - "loss": 0.07012342, - "memory(GiB)": 74.41, - "step": 1975, - "train_speed(iter/s)": 0.074999 - }, - { - "epoch": 0.23323615160349853, - "grad_norm": 4.25, - "learning_rate": 8.717622253511841e-06, - "loss": 0.09737064, - "memory(GiB)": 74.41, - "step": 1980, - "train_speed(iter/s)": 0.075 - }, - { - "epoch": 0.23382513178431547, - "grad_norm": 3.640625, - "learning_rate": 8.71142830126572e-06, - "loss": 0.0820079, - "memory(GiB)": 74.41, - "step": 1985, - "train_speed(iter/s)": 0.075002 - }, - { - "epoch": 0.23441411196513237, - "grad_norm": 4.3125, - "learning_rate": 8.705221638305115e-06, - "loss": 0.11287713, - "memory(GiB)": 74.41, - "step": 1990, - "train_speed(iter/s)": 0.075002 - }, - { - "epoch": 0.23500309214594928, - "grad_norm": 3.921875, - "learning_rate": 8.699002285886296e-06, - "loss": 0.08315295, - "memory(GiB)": 74.41, - "step": 1995, - "train_speed(iter/s)": 0.075001 - }, - { - "epoch": 0.2355920723267662, - "grad_norm": 3.203125, - "learning_rate": 8.69277026530899e-06, - "loss": 0.08922192, - "memory(GiB)": 74.41, - "step": 2000, - "train_speed(iter/s)": 0.075003 - }, - { - "epoch": 0.23618105250758312, - "grad_norm": 4.46875, - "learning_rate": 8.68652559791631e-06, - "loss": 0.07833961, - "memory(GiB)": 74.41, - "step": 2005, - "train_speed(iter/s)": 0.074902 - }, - { - "epoch": 0.23677003268840002, - "grad_norm": 4.71875, - "learning_rate": 8.680268305094682e-06, - "loss": 0.07653344, - "memory(GiB)": 74.41, - "step": 2010, - "train_speed(iter/s)": 0.074902 - }, - { - "epoch": 0.23735901286921696, - "grad_norm": 2.9375, - "learning_rate": 8.673998408273772e-06, - "loss": 0.08819388, - "memory(GiB)": 74.41, - "step": 2015, - "train_speed(iter/s)": 0.074904 - }, - { - "epoch": 0.23794799305003386, - "grad_norm": 3.90625, - "learning_rate": 8.667715928926406e-06, - "loss": 0.08014967, - "memory(GiB)": 74.41, - "step": 2020, - "train_speed(iter/s)": 0.074905 - }, - { - "epoch": 0.23853697323085077, - "grad_norm": 2.609375, - "learning_rate": 8.661420888568508e-06, - "loss": 0.07681896, - "memory(GiB)": 74.41, - "step": 2025, - "train_speed(iter/s)": 0.074908 - }, - { - "epoch": 0.2391259534116677, - "grad_norm": 4.28125, - "learning_rate": 8.65511330875902e-06, - "loss": 0.08645278, - "memory(GiB)": 74.41, - "step": 2030, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.2397149335924846, - "grad_norm": 4.09375, - "learning_rate": 8.648793211099823e-06, - "loss": 0.0930752, - "memory(GiB)": 74.41, - "step": 2035, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.24030391377330154, - "grad_norm": 4.375, - "learning_rate": 8.642460617235675e-06, - "loss": 0.08526615, - "memory(GiB)": 74.41, - "step": 2040, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.24089289395411845, - "grad_norm": 4.09375, - "learning_rate": 8.636115548854125e-06, - "loss": 0.0950511, - "memory(GiB)": 74.41, - "step": 2045, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.24148187413493535, - "grad_norm": 4.96875, - "learning_rate": 8.629758027685452e-06, - "loss": 0.07512583, - "memory(GiB)": 74.41, - "step": 2050, - "train_speed(iter/s)": 0.074907 - }, - { - "epoch": 0.2420708543157523, - "grad_norm": 3.65625, - "learning_rate": 8.623388075502569e-06, - "loss": 0.09064118, - "memory(GiB)": 74.41, - "step": 2055, - "train_speed(iter/s)": 0.074907 - }, - { - "epoch": 0.2426598344965692, - "grad_norm": 3.828125, - "learning_rate": 8.617005714120977e-06, - "loss": 0.08695101, - "memory(GiB)": 74.41, - "step": 2060, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.2432488146773861, - "grad_norm": 3.453125, - "learning_rate": 8.610610965398665e-06, - "loss": 0.09054219, - "memory(GiB)": 74.41, - "step": 2065, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.24383779485820303, - "grad_norm": 4.6875, - "learning_rate": 8.604203851236053e-06, - "loss": 0.08845918, - "memory(GiB)": 74.41, - "step": 2070, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.24442677503901994, - "grad_norm": 4.40625, - "learning_rate": 8.597784393575902e-06, - "loss": 0.08291144, - "memory(GiB)": 74.41, - "step": 2075, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.24501575521983684, - "grad_norm": 3.3125, - "learning_rate": 8.591352614403255e-06, - "loss": 0.07554498, - "memory(GiB)": 74.41, - "step": 2080, - "train_speed(iter/s)": 0.074912 - }, - { - "epoch": 0.24560473540065378, - "grad_norm": 3.96875, - "learning_rate": 8.584908535745344e-06, - "loss": 0.07999554, - "memory(GiB)": 74.41, - "step": 2085, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.24619371558147068, - "grad_norm": 3.84375, - "learning_rate": 8.57845217967153e-06, - "loss": 0.09628522, - "memory(GiB)": 74.41, - "step": 2090, - "train_speed(iter/s)": 0.074914 - }, - { - "epoch": 0.2467826957622876, - "grad_norm": 2.3125, - "learning_rate": 8.57198356829322e-06, - "loss": 0.07814129, - "memory(GiB)": 74.41, - "step": 2095, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.24737167594310452, - "grad_norm": 3.5, - "learning_rate": 8.56550272376379e-06, - "loss": 0.09396058, - "memory(GiB)": 74.41, - "step": 2100, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.24796065612392143, - "grad_norm": 3.59375, - "learning_rate": 8.559009668278516e-06, - "loss": 0.08171555, - "memory(GiB)": 74.41, - "step": 2105, - "train_speed(iter/s)": 0.074915 - }, - { - "epoch": 0.24854963630473834, - "grad_norm": 4.25, - "learning_rate": 8.552504424074484e-06, - "loss": 0.08422858, - "memory(GiB)": 74.41, - "step": 2110, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.24913861648555527, - "grad_norm": 3.625, - "learning_rate": 8.545987013430536e-06, - "loss": 0.08350295, - "memory(GiB)": 74.41, - "step": 2115, - "train_speed(iter/s)": 0.074915 - }, - { - "epoch": 0.24972759666637218, - "grad_norm": 4.0, - "learning_rate": 8.539457458667173e-06, - "loss": 0.09641573, - "memory(GiB)": 74.41, - "step": 2120, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.2503165768471891, - "grad_norm": 4.375, - "learning_rate": 8.532915782146488e-06, - "loss": 0.09828712, - "memory(GiB)": 74.41, - "step": 2125, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.250905557028006, - "grad_norm": 4.03125, - "learning_rate": 8.526362006272087e-06, - "loss": 0.08949637, - "memory(GiB)": 74.41, - "step": 2130, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.2514945372088229, - "grad_norm": 4.0, - "learning_rate": 8.519796153489018e-06, - "loss": 0.08683264, - "memory(GiB)": 74.41, - "step": 2135, - "train_speed(iter/s)": 0.074915 - }, - { - "epoch": 0.25208351738963986, - "grad_norm": 4.09375, - "learning_rate": 8.513218246283684e-06, - "loss": 0.08473853, - "memory(GiB)": 74.41, - "step": 2140, - "train_speed(iter/s)": 0.074915 - }, - { - "epoch": 0.25267249757045673, - "grad_norm": 3.546875, - "learning_rate": 8.506628307183773e-06, - "loss": 0.08321443, - "memory(GiB)": 74.41, - "step": 2145, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.25326147775127367, - "grad_norm": 4.09375, - "learning_rate": 8.50002635875818e-06, - "loss": 0.09309025, - "memory(GiB)": 74.41, - "step": 2150, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.2538504579320906, - "grad_norm": 3.53125, - "learning_rate": 8.493412423616929e-06, - "loss": 0.08694515, - "memory(GiB)": 74.41, - "step": 2155, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.2544394381129075, - "grad_norm": 4.625, - "learning_rate": 8.486786524411096e-06, - "loss": 0.08189753, - "memory(GiB)": 74.41, - "step": 2160, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.2550284182937244, - "grad_norm": 3.78125, - "learning_rate": 8.480148683832727e-06, - "loss": 0.09474859, - "memory(GiB)": 74.41, - "step": 2165, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.25561739847454135, - "grad_norm": 3.421875, - "learning_rate": 8.47349892461477e-06, - "loss": 0.08173348, - "memory(GiB)": 74.41, - "step": 2170, - "train_speed(iter/s)": 0.074918 - }, - { - "epoch": 0.2562063786553582, - "grad_norm": 4.09375, - "learning_rate": 8.466837269530991e-06, - "loss": 0.08846778, - "memory(GiB)": 74.41, - "step": 2175, - "train_speed(iter/s)": 0.074918 - }, - { - "epoch": 0.25679535883617516, - "grad_norm": 3.078125, - "learning_rate": 8.460163741395888e-06, - "loss": 0.08622532, - "memory(GiB)": 74.41, - "step": 2180, - "train_speed(iter/s)": 0.074919 - }, - { - "epoch": 0.2573843390169921, - "grad_norm": 3.890625, - "learning_rate": 8.45347836306463e-06, - "loss": 0.10147046, - "memory(GiB)": 74.41, - "step": 2185, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.25797331919780897, - "grad_norm": 4.125, - "learning_rate": 8.446781157432969e-06, - "loss": 0.07713024, - "memory(GiB)": 74.41, - "step": 2190, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.2585622993786259, - "grad_norm": 3.84375, - "learning_rate": 8.440072147437158e-06, - "loss": 0.08459938, - "memory(GiB)": 74.41, - "step": 2195, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.25915127955944284, - "grad_norm": 3.609375, - "learning_rate": 8.433351356053878e-06, - "loss": 0.08433875, - "memory(GiB)": 74.41, - "step": 2200, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.2597402597402597, - "grad_norm": 3.828125, - "learning_rate": 8.426618806300167e-06, - "loss": 0.08870918, - "memory(GiB)": 74.41, - "step": 2205, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.26032923992107665, - "grad_norm": 3.34375, - "learning_rate": 8.41987452123332e-06, - "loss": 0.07198619, - "memory(GiB)": 74.41, - "step": 2210, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.2609182201018936, - "grad_norm": 3.921875, - "learning_rate": 8.413118523950827e-06, - "loss": 0.07893941, - "memory(GiB)": 74.41, - "step": 2215, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.26150720028271046, - "grad_norm": 3.84375, - "learning_rate": 8.406350837590296e-06, - "loss": 0.09590834, - "memory(GiB)": 74.41, - "step": 2220, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.2620961804635274, - "grad_norm": 4.53125, - "learning_rate": 8.399571485329356e-06, - "loss": 0.07775668, - "memory(GiB)": 74.41, - "step": 2225, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.26268516064434433, - "grad_norm": 4.28125, - "learning_rate": 8.392780490385599e-06, - "loss": 0.08939707, - "memory(GiB)": 74.41, - "step": 2230, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.2632741408251612, - "grad_norm": 4.0625, - "learning_rate": 8.385977876016481e-06, - "loss": 0.0896252, - "memory(GiB)": 74.41, - "step": 2235, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.26386312100597814, - "grad_norm": 2.78125, - "learning_rate": 8.37916366551926e-06, - "loss": 0.09092049, - "memory(GiB)": 74.41, - "step": 2240, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.2644521011867951, - "grad_norm": 2.6875, - "learning_rate": 8.372337882230904e-06, - "loss": 0.07872928, - "memory(GiB)": 74.41, - "step": 2245, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.26504108136761195, - "grad_norm": 5.28125, - "learning_rate": 8.365500549528012e-06, - "loss": 0.08535323, - "memory(GiB)": 74.41, - "step": 2250, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.2656300615484289, - "grad_norm": 2.953125, - "learning_rate": 8.358651690826742e-06, - "loss": 0.09457763, - "memory(GiB)": 74.41, - "step": 2255, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.2662190417292458, - "grad_norm": 4.9375, - "learning_rate": 8.351791329582724e-06, - "loss": 0.10520123, - "memory(GiB)": 74.41, - "step": 2260, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.26680802191006275, - "grad_norm": 3.34375, - "learning_rate": 8.34491948929098e-06, - "loss": 0.08100507, - "memory(GiB)": 74.41, - "step": 2265, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.26739700209087963, - "grad_norm": 4.0625, - "learning_rate": 8.338036193485846e-06, - "loss": 0.09872182, - "memory(GiB)": 74.41, - "step": 2270, - "train_speed(iter/s)": 0.074926 - }, - { - "epoch": 0.26798598227169657, - "grad_norm": 5.46875, - "learning_rate": 8.331141465740893e-06, - "loss": 0.08186264, - "memory(GiB)": 74.41, - "step": 2275, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.2685749624525135, - "grad_norm": 4.25, - "learning_rate": 8.324235329668835e-06, - "loss": 0.09497812, - "memory(GiB)": 74.41, - "step": 2280, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.2691639426333304, - "grad_norm": 5.1875, - "learning_rate": 8.317317808921466e-06, - "loss": 0.09269063, - "memory(GiB)": 74.41, - "step": 2285, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.2697529228141473, - "grad_norm": 4.5625, - "learning_rate": 8.310388927189564e-06, - "loss": 0.080677, - "memory(GiB)": 74.41, - "step": 2290, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.27034190299496424, - "grad_norm": 2.59375, - "learning_rate": 8.30344870820282e-06, - "loss": 0.07890679, - "memory(GiB)": 74.41, - "step": 2295, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.2709308831757811, - "grad_norm": 3.4375, - "learning_rate": 8.296497175729747e-06, - "loss": 0.08448692, - "memory(GiB)": 74.41, - "step": 2300, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.27151986335659806, - "grad_norm": 3.390625, - "learning_rate": 8.289534353577609e-06, - "loss": 0.07329035, - "memory(GiB)": 74.41, - "step": 2305, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.272108843537415, - "grad_norm": 7.875, - "learning_rate": 8.28256026559233e-06, - "loss": 0.07775943, - "memory(GiB)": 74.41, - "step": 2310, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.27269782371823187, - "grad_norm": 4.78125, - "learning_rate": 8.275574935658421e-06, - "loss": 0.09190539, - "memory(GiB)": 74.41, - "step": 2315, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.2732868038990488, - "grad_norm": 3.96875, - "learning_rate": 8.26857838769889e-06, - "loss": 0.08333533, - "memory(GiB)": 74.41, - "step": 2320, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.27387578407986574, - "grad_norm": 4.1875, - "learning_rate": 8.261570645675164e-06, - "loss": 0.10267622, - "memory(GiB)": 74.41, - "step": 2325, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.2744647642606826, - "grad_norm": 3.46875, - "learning_rate": 8.254551733587009e-06, - "loss": 0.07961779, - "memory(GiB)": 74.41, - "step": 2330, - "train_speed(iter/s)": 0.074927 - }, - { - "epoch": 0.27505374444149955, - "grad_norm": 4.46875, - "learning_rate": 8.247521675472447e-06, - "loss": 0.08598344, - "memory(GiB)": 74.41, - "step": 2335, - "train_speed(iter/s)": 0.074928 - }, - { - "epoch": 0.2756427246223165, - "grad_norm": 4.1875, - "learning_rate": 8.240480495407667e-06, - "loss": 0.09493701, - "memory(GiB)": 74.41, - "step": 2340, - "train_speed(iter/s)": 0.07493 - }, - { - "epoch": 0.27623170480313336, - "grad_norm": 4.3125, - "learning_rate": 8.233428217506954e-06, - "loss": 0.09267281, - "memory(GiB)": 74.41, - "step": 2345, - "train_speed(iter/s)": 0.074932 - }, - { - "epoch": 0.2768206849839503, - "grad_norm": 2.734375, - "learning_rate": 8.226364865922598e-06, - "loss": 0.09006658, - "memory(GiB)": 74.41, - "step": 2350, - "train_speed(iter/s)": 0.074933 - }, - { - "epoch": 0.2774096651647672, - "grad_norm": 3.125, - "learning_rate": 8.21929046484481e-06, - "loss": 0.07196817, - "memory(GiB)": 74.41, - "step": 2355, - "train_speed(iter/s)": 0.074934 - }, - { - "epoch": 0.2779986453455841, - "grad_norm": 3.453125, - "learning_rate": 8.212205038501648e-06, - "loss": 0.08442661, - "memory(GiB)": 74.41, - "step": 2360, - "train_speed(iter/s)": 0.074936 - }, - { - "epoch": 0.27858762552640104, - "grad_norm": 4.96875, - "learning_rate": 8.205108611158926e-06, - "loss": 0.09003518, - "memory(GiB)": 74.41, - "step": 2365, - "train_speed(iter/s)": 0.074937 - }, - { - "epoch": 0.279176605707218, - "grad_norm": 3.703125, - "learning_rate": 8.198001207120135e-06, - "loss": 0.09039379, - "memory(GiB)": 74.41, - "step": 2370, - "train_speed(iter/s)": 0.074938 - }, - { - "epoch": 0.27976558588803485, - "grad_norm": 4.0, - "learning_rate": 8.190882850726357e-06, - "loss": 0.08618115, - "memory(GiB)": 74.41, - "step": 2375, - "train_speed(iter/s)": 0.07494 - }, - { - "epoch": 0.2803545660688518, - "grad_norm": 4.34375, - "learning_rate": 8.183753566356182e-06, - "loss": 0.08356151, - "memory(GiB)": 74.41, - "step": 2380, - "train_speed(iter/s)": 0.074941 - }, - { - "epoch": 0.2809435462496687, - "grad_norm": 3.890625, - "learning_rate": 8.176613378425633e-06, - "loss": 0.07245021, - "memory(GiB)": 74.41, - "step": 2385, - "train_speed(iter/s)": 0.074942 - }, - { - "epoch": 0.2815325264304856, - "grad_norm": 4.25, - "learning_rate": 8.169462311388066e-06, - "loss": 0.07743685, - "memory(GiB)": 74.41, - "step": 2390, - "train_speed(iter/s)": 0.074945 - }, - { - "epoch": 0.28212150661130253, - "grad_norm": 3.953125, - "learning_rate": 8.162300389734094e-06, - "loss": 0.08839593, - "memory(GiB)": 74.41, - "step": 2395, - "train_speed(iter/s)": 0.074947 - }, - { - "epoch": 0.28271048679211946, - "grad_norm": 3.78125, - "learning_rate": 8.155127637991518e-06, - "loss": 0.08605137, - "memory(GiB)": 74.41, - "step": 2400, - "train_speed(iter/s)": 0.074948 - }, - { - "epoch": 0.28329946697293634, - "grad_norm": 3.609375, - "learning_rate": 8.147944080725214e-06, - "loss": 0.0692309, - "memory(GiB)": 74.41, - "step": 2405, - "train_speed(iter/s)": 0.07495 - }, - { - "epoch": 0.2838884471537533, - "grad_norm": 3.859375, - "learning_rate": 8.140749742537073e-06, - "loss": 0.08173347, - "memory(GiB)": 74.41, - "step": 2410, - "train_speed(iter/s)": 0.07495 - }, - { - "epoch": 0.2844774273345702, - "grad_norm": 3.390625, - "learning_rate": 8.133544648065905e-06, - "loss": 0.07887983, - "memory(GiB)": 74.41, - "step": 2415, - "train_speed(iter/s)": 0.074951 - }, - { - "epoch": 0.2850664075153871, - "grad_norm": 4.40625, - "learning_rate": 8.126328821987357e-06, - "loss": 0.09297481, - "memory(GiB)": 74.41, - "step": 2420, - "train_speed(iter/s)": 0.074952 - }, - { - "epoch": 0.285655387696204, - "grad_norm": 3.421875, - "learning_rate": 8.119102289013832e-06, - "loss": 0.0871433, - "memory(GiB)": 74.41, - "step": 2425, - "train_speed(iter/s)": 0.074955 - }, - { - "epoch": 0.28624436787702096, - "grad_norm": 4.28125, - "learning_rate": 8.1118650738944e-06, - "loss": 0.08925153, - "memory(GiB)": 74.41, - "step": 2430, - "train_speed(iter/s)": 0.074956 - }, - { - "epoch": 0.28683334805783783, - "grad_norm": 3.515625, - "learning_rate": 8.104617201414711e-06, - "loss": 0.08430476, - "memory(GiB)": 74.41, - "step": 2435, - "train_speed(iter/s)": 0.074957 - }, - { - "epoch": 0.28742232823865477, - "grad_norm": 3.53125, - "learning_rate": 8.09735869639692e-06, - "loss": 0.1002124, - "memory(GiB)": 74.41, - "step": 2440, - "train_speed(iter/s)": 0.074959 - }, - { - "epoch": 0.2880113084194717, - "grad_norm": 4.0625, - "learning_rate": 8.090089583699591e-06, - "loss": 0.07936279, - "memory(GiB)": 74.41, - "step": 2445, - "train_speed(iter/s)": 0.07496 - }, - { - "epoch": 0.2886002886002886, - "grad_norm": 4.65625, - "learning_rate": 8.082809888217622e-06, - "loss": 0.08244927, - "memory(GiB)": 74.41, - "step": 2450, - "train_speed(iter/s)": 0.074961 - }, - { - "epoch": 0.2891892687811055, - "grad_norm": 3.5, - "learning_rate": 8.075519634882146e-06, - "loss": 0.09104651, - "memory(GiB)": 74.41, - "step": 2455, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.28977824896192245, - "grad_norm": 3.65625, - "learning_rate": 8.068218848660461e-06, - "loss": 0.07435999, - "memory(GiB)": 74.41, - "step": 2460, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.2903672291427393, - "grad_norm": 3.75, - "learning_rate": 8.060907554555937e-06, - "loss": 0.07937492, - "memory(GiB)": 74.41, - "step": 2465, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.29095620932355626, - "grad_norm": 3.53125, - "learning_rate": 8.053585777607929e-06, - "loss": 0.08450289, - "memory(GiB)": 74.41, - "step": 2470, - "train_speed(iter/s)": 0.074963 - }, - { - "epoch": 0.2915451895043732, - "grad_norm": 3.734375, - "learning_rate": 8.046253542891689e-06, - "loss": 0.06627862, - "memory(GiB)": 74.41, - "step": 2475, - "train_speed(iter/s)": 0.074962 - }, - { - "epoch": 0.29213416968519007, - "grad_norm": 4.03125, - "learning_rate": 8.038910875518295e-06, - "loss": 0.09286552, - "memory(GiB)": 74.41, - "step": 2480, - "train_speed(iter/s)": 0.074963 - }, - { - "epoch": 0.292723149866007, - "grad_norm": 4.34375, - "learning_rate": 8.031557800634544e-06, - "loss": 0.07346339, - "memory(GiB)": 74.41, - "step": 2485, - "train_speed(iter/s)": 0.074964 - }, - { - "epoch": 0.29331213004682394, - "grad_norm": 3.703125, - "learning_rate": 8.024194343422882e-06, - "loss": 0.08958888, - "memory(GiB)": 74.41, - "step": 2490, - "train_speed(iter/s)": 0.074963 - }, - { - "epoch": 0.2939011102276408, - "grad_norm": 3.1875, - "learning_rate": 8.016820529101308e-06, - "loss": 0.08174566, - "memory(GiB)": 74.41, - "step": 2495, - "train_speed(iter/s)": 0.074964 - }, - { - "epoch": 0.29449009040845775, - "grad_norm": 3.375, - "learning_rate": 8.009436382923294e-06, - "loss": 0.09179025, - "memory(GiB)": 74.41, - "step": 2500, - "train_speed(iter/s)": 0.074965 - }, - { - "epoch": 0.2950790705892747, - "grad_norm": 3.515625, - "learning_rate": 8.002041930177696e-06, - "loss": 0.08729964, - "memory(GiB)": 74.41, - "step": 2505, - "train_speed(iter/s)": 0.074862 - }, - { - "epoch": 0.29566805077009156, - "grad_norm": 4.375, - "learning_rate": 7.994637196188666e-06, - "loss": 0.09242439, - "memory(GiB)": 74.41, - "step": 2510, - "train_speed(iter/s)": 0.074861 - }, - { - "epoch": 0.2962570309509085, - "grad_norm": 4.125, - "learning_rate": 7.987222206315568e-06, - "loss": 0.08498147, - "memory(GiB)": 74.41, - "step": 2515, - "train_speed(iter/s)": 0.074862 - }, - { - "epoch": 0.29684601113172543, - "grad_norm": 3.359375, - "learning_rate": 7.979796985952892e-06, - "loss": 0.07728409, - "memory(GiB)": 74.41, - "step": 2520, - "train_speed(iter/s)": 0.074864 - }, - { - "epoch": 0.2974349913125423, - "grad_norm": 3.953125, - "learning_rate": 7.972361560530157e-06, - "loss": 0.08274835, - "memory(GiB)": 74.41, - "step": 2525, - "train_speed(iter/s)": 0.074864 - }, - { - "epoch": 0.29802397149335924, - "grad_norm": 4.21875, - "learning_rate": 7.96491595551184e-06, - "loss": 0.09903065, - "memory(GiB)": 74.41, - "step": 2530, - "train_speed(iter/s)": 0.074865 - }, - { - "epoch": 0.2986129516741762, - "grad_norm": 3.5625, - "learning_rate": 7.95746019639728e-06, - "loss": 0.07592887, - "memory(GiB)": 74.41, - "step": 2535, - "train_speed(iter/s)": 0.074865 - }, - { - "epoch": 0.29920193185499305, - "grad_norm": 4.40625, - "learning_rate": 7.949994308720584e-06, - "loss": 0.10274631, - "memory(GiB)": 74.41, - "step": 2540, - "train_speed(iter/s)": 0.074867 - }, - { - "epoch": 0.29979091203581, - "grad_norm": 3.71875, - "learning_rate": 7.942518318050555e-06, - "loss": 0.08352513, - "memory(GiB)": 74.41, - "step": 2545, - "train_speed(iter/s)": 0.074868 - }, - { - "epoch": 0.3003798922166269, - "grad_norm": 3.125, - "learning_rate": 7.93503224999059e-06, - "loss": 0.08402941, - "memory(GiB)": 74.41, - "step": 2550, - "train_speed(iter/s)": 0.074869 - }, - { - "epoch": 0.3009688723974438, - "grad_norm": 3.0625, - "learning_rate": 7.927536130178605e-06, - "loss": 0.07813259, - "memory(GiB)": 74.41, - "step": 2555, - "train_speed(iter/s)": 0.074869 - }, - { - "epoch": 0.30155785257826073, - "grad_norm": 4.09375, - "learning_rate": 7.920029984286935e-06, - "loss": 0.07638797, - "memory(GiB)": 74.41, - "step": 2560, - "train_speed(iter/s)": 0.07487 - }, - { - "epoch": 0.30214683275907767, - "grad_norm": 3.640625, - "learning_rate": 7.912513838022253e-06, - "loss": 0.08150139, - "memory(GiB)": 74.41, - "step": 2565, - "train_speed(iter/s)": 0.074872 - }, - { - "epoch": 0.3027358129398946, - "grad_norm": 3.265625, - "learning_rate": 7.904987717125485e-06, - "loss": 0.07865617, - "memory(GiB)": 74.41, - "step": 2570, - "train_speed(iter/s)": 0.074873 - }, - { - "epoch": 0.3033247931207115, - "grad_norm": 4.03125, - "learning_rate": 7.89745164737171e-06, - "loss": 0.10058107, - "memory(GiB)": 74.41, - "step": 2575, - "train_speed(iter/s)": 0.074875 - }, - { - "epoch": 0.3039137733015284, - "grad_norm": 4.625, - "learning_rate": 7.88990565457009e-06, - "loss": 0.08737856, - "memory(GiB)": 74.41, - "step": 2580, - "train_speed(iter/s)": 0.074876 - }, - { - "epoch": 0.30450275348234535, - "grad_norm": 3.796875, - "learning_rate": 7.882349764563758e-06, - "loss": 0.08744036, - "memory(GiB)": 74.41, - "step": 2585, - "train_speed(iter/s)": 0.074879 - }, - { - "epoch": 0.3050917336631622, - "grad_norm": 4.34375, - "learning_rate": 7.874784003229751e-06, - "loss": 0.11328788, - "memory(GiB)": 74.41, - "step": 2590, - "train_speed(iter/s)": 0.07488 - }, - { - "epoch": 0.30568071384397916, - "grad_norm": 4.84375, - "learning_rate": 7.867208396478914e-06, - "loss": 0.08754985, - "memory(GiB)": 74.41, - "step": 2595, - "train_speed(iter/s)": 0.074881 - }, - { - "epoch": 0.3062696940247961, - "grad_norm": 3.09375, - "learning_rate": 7.859622970255804e-06, - "loss": 0.07025304, - "memory(GiB)": 74.41, - "step": 2600, - "train_speed(iter/s)": 0.074882 - }, - { - "epoch": 0.30685867420561297, - "grad_norm": 3.75, - "learning_rate": 7.852027750538613e-06, - "loss": 0.09671483, - "memory(GiB)": 74.41, - "step": 2605, - "train_speed(iter/s)": 0.074883 - }, - { - "epoch": 0.3074476543864299, - "grad_norm": 3.484375, - "learning_rate": 7.844422763339066e-06, - "loss": 0.08538601, - "memory(GiB)": 74.41, - "step": 2610, - "train_speed(iter/s)": 0.074883 - }, - { - "epoch": 0.30803663456724684, - "grad_norm": 2.953125, - "learning_rate": 7.836808034702348e-06, - "loss": 0.0819549, - "memory(GiB)": 74.41, - "step": 2615, - "train_speed(iter/s)": 0.074885 - }, - { - "epoch": 0.3086256147480637, - "grad_norm": 3.125, - "learning_rate": 7.829183590707001e-06, - "loss": 0.06693002, - "memory(GiB)": 74.41, - "step": 2620, - "train_speed(iter/s)": 0.074886 - }, - { - "epoch": 0.30921459492888065, - "grad_norm": 3.75, - "learning_rate": 7.82154945746484e-06, - "loss": 0.07030878, - "memory(GiB)": 74.41, - "step": 2625, - "train_speed(iter/s)": 0.074887 - }, - { - "epoch": 0.3098035751096976, - "grad_norm": 3.234375, - "learning_rate": 7.813905661120862e-06, - "loss": 0.08062052, - "memory(GiB)": 74.41, - "step": 2630, - "train_speed(iter/s)": 0.074888 - }, - { - "epoch": 0.31039255529051446, - "grad_norm": 3.734375, - "learning_rate": 7.806252227853161e-06, - "loss": 0.09003597, - "memory(GiB)": 74.41, - "step": 2635, - "train_speed(iter/s)": 0.074889 - }, - { - "epoch": 0.3109815354713314, - "grad_norm": 3.09375, - "learning_rate": 7.798589183872833e-06, - "loss": 0.08022001, - "memory(GiB)": 74.41, - "step": 2640, - "train_speed(iter/s)": 0.074889 - }, - { - "epoch": 0.3115705156521483, - "grad_norm": 4.59375, - "learning_rate": 7.79091655542389e-06, - "loss": 0.09506249, - "memory(GiB)": 74.41, - "step": 2645, - "train_speed(iter/s)": 0.07489 - }, - { - "epoch": 0.3121594958329652, - "grad_norm": 4.09375, - "learning_rate": 7.783234368783166e-06, - "loss": 0.07678704, - "memory(GiB)": 74.41, - "step": 2650, - "train_speed(iter/s)": 0.07489 - }, - { - "epoch": 0.31274847601378214, - "grad_norm": 3.21875, - "learning_rate": 7.775542650260231e-06, - "loss": 0.08948871, - "memory(GiB)": 74.41, - "step": 2655, - "train_speed(iter/s)": 0.074892 - }, - { - "epoch": 0.3133374561945991, - "grad_norm": 4.78125, - "learning_rate": 7.767841426197297e-06, - "loss": 0.09192417, - "memory(GiB)": 74.41, - "step": 2660, - "train_speed(iter/s)": 0.074893 - }, - { - "epoch": 0.31392643637541595, - "grad_norm": 3.5, - "learning_rate": 7.760130722969135e-06, - "loss": 0.07221487, - "memory(GiB)": 74.41, - "step": 2665, - "train_speed(iter/s)": 0.074894 - }, - { - "epoch": 0.3145154165562329, - "grad_norm": 4.6875, - "learning_rate": 7.752410566982973e-06, - "loss": 0.07583697, - "memory(GiB)": 74.41, - "step": 2670, - "train_speed(iter/s)": 0.074897 - }, - { - "epoch": 0.3151043967370498, - "grad_norm": 3.203125, - "learning_rate": 7.74468098467842e-06, - "loss": 0.07944269, - "memory(GiB)": 74.41, - "step": 2675, - "train_speed(iter/s)": 0.074896 - }, - { - "epoch": 0.3156933769178667, - "grad_norm": 3.078125, - "learning_rate": 7.736942002527362e-06, - "loss": 0.0761774, - "memory(GiB)": 74.41, - "step": 2680, - "train_speed(iter/s)": 0.074897 - }, - { - "epoch": 0.31628235709868363, - "grad_norm": 3.390625, - "learning_rate": 7.729193647033879e-06, - "loss": 0.07489694, - "memory(GiB)": 74.41, - "step": 2685, - "train_speed(iter/s)": 0.074899 - }, - { - "epoch": 0.31687133727950056, - "grad_norm": 3.9375, - "learning_rate": 7.72143594473415e-06, - "loss": 0.08410535, - "memory(GiB)": 74.41, - "step": 2690, - "train_speed(iter/s)": 0.074901 - }, - { - "epoch": 0.31746031746031744, - "grad_norm": 4.21875, - "learning_rate": 7.713668922196372e-06, - "loss": 0.08352896, - "memory(GiB)": 74.41, - "step": 2695, - "train_speed(iter/s)": 0.074903 - }, - { - "epoch": 0.3180492976411344, - "grad_norm": 2.765625, - "learning_rate": 7.705892606020652e-06, - "loss": 0.0845727, - "memory(GiB)": 74.41, - "step": 2700, - "train_speed(iter/s)": 0.074904 - }, - { - "epoch": 0.3186382778219513, - "grad_norm": 3.671875, - "learning_rate": 7.698107022838931e-06, - "loss": 0.09101894, - "memory(GiB)": 74.41, - "step": 2705, - "train_speed(iter/s)": 0.074905 - }, - { - "epoch": 0.3192272580027682, - "grad_norm": 3.890625, - "learning_rate": 7.690312199314887e-06, - "loss": 0.08560072, - "memory(GiB)": 74.41, - "step": 2710, - "train_speed(iter/s)": 0.074906 - }, - { - "epoch": 0.3198162381835851, - "grad_norm": 3.375, - "learning_rate": 7.682508162143843e-06, - "loss": 0.09032207, - "memory(GiB)": 74.41, - "step": 2715, - "train_speed(iter/s)": 0.074907 - }, - { - "epoch": 0.32040521836440206, - "grad_norm": 4.15625, - "learning_rate": 7.674694938052673e-06, - "loss": 0.08928435, - "memory(GiB)": 74.41, - "step": 2720, - "train_speed(iter/s)": 0.074909 - }, - { - "epoch": 0.32099419854521893, - "grad_norm": 3.34375, - "learning_rate": 7.666872553799719e-06, - "loss": 0.06355714, - "memory(GiB)": 74.41, - "step": 2725, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.32158317872603587, - "grad_norm": 3.609375, - "learning_rate": 7.659041036174693e-06, - "loss": 0.09267324, - "memory(GiB)": 74.41, - "step": 2730, - "train_speed(iter/s)": 0.07491 - }, - { - "epoch": 0.3221721589068528, - "grad_norm": 4.5, - "learning_rate": 7.651200411998585e-06, - "loss": 0.08836827, - "memory(GiB)": 74.41, - "step": 2735, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.3227611390876697, - "grad_norm": 4.1875, - "learning_rate": 7.643350708123574e-06, - "loss": 0.09088976, - "memory(GiB)": 74.41, - "step": 2740, - "train_speed(iter/s)": 0.074912 - }, - { - "epoch": 0.3233501192684866, - "grad_norm": 3.03125, - "learning_rate": 7.635491951432931e-06, - "loss": 0.08111639, - "memory(GiB)": 74.41, - "step": 2745, - "train_speed(iter/s)": 0.074913 - }, - { - "epoch": 0.32393909944930355, - "grad_norm": 3.28125, - "learning_rate": 7.62762416884094e-06, - "loss": 0.07648246, - "memory(GiB)": 74.41, - "step": 2750, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.3245280796301204, - "grad_norm": 4.1875, - "learning_rate": 7.619747387292786e-06, - "loss": 0.08834222, - "memory(GiB)": 74.41, - "step": 2755, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.32511705981093736, - "grad_norm": 3.53125, - "learning_rate": 7.611861633764479e-06, - "loss": 0.08350364, - "memory(GiB)": 74.41, - "step": 2760, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.3257060399917543, - "grad_norm": 2.640625, - "learning_rate": 7.603966935262752e-06, - "loss": 0.08396969, - "memory(GiB)": 74.41, - "step": 2765, - "train_speed(iter/s)": 0.074911 - }, - { - "epoch": 0.32629502017257117, - "grad_norm": 2.9375, - "learning_rate": 7.596063318824978e-06, - "loss": 0.06955547, - "memory(GiB)": 74.41, - "step": 2770, - "train_speed(iter/s)": 0.074912 - }, - { - "epoch": 0.3268840003533881, - "grad_norm": 2.875, - "learning_rate": 7.588150811519066e-06, - "loss": 0.08324391, - "memory(GiB)": 74.41, - "step": 2775, - "train_speed(iter/s)": 0.074914 - }, - { - "epoch": 0.32747298053420504, - "grad_norm": 4.28125, - "learning_rate": 7.580229440443378e-06, - "loss": 0.08206114, - "memory(GiB)": 74.41, - "step": 2780, - "train_speed(iter/s)": 0.074916 - }, - { - "epoch": 0.3280619607150219, - "grad_norm": 3.703125, - "learning_rate": 7.572299232726627e-06, - "loss": 0.08156523, - "memory(GiB)": 74.41, - "step": 2785, - "train_speed(iter/s)": 0.074917 - }, - { - "epoch": 0.32865094089583885, - "grad_norm": 3.5625, - "learning_rate": 7.564360215527797e-06, - "loss": 0.07584106, - "memory(GiB)": 74.41, - "step": 2790, - "train_speed(iter/s)": 0.074919 - }, - { - "epoch": 0.3292399210766558, - "grad_norm": 4.28125, - "learning_rate": 7.556412416036036e-06, - "loss": 0.08185716, - "memory(GiB)": 74.41, - "step": 2795, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.32982890125747266, - "grad_norm": 3.625, - "learning_rate": 7.5484558614705705e-06, - "loss": 0.0776104, - "memory(GiB)": 74.41, - "step": 2800, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.3304178814382896, - "grad_norm": 3.46875, - "learning_rate": 7.540490579080613e-06, - "loss": 0.07412539, - "memory(GiB)": 74.41, - "step": 2805, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.33100686161910653, - "grad_norm": 3.453125, - "learning_rate": 7.532516596145263e-06, - "loss": 0.06569082, - "memory(GiB)": 74.41, - "step": 2810, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.3315958417999234, - "grad_norm": 3.890625, - "learning_rate": 7.524533939973424e-06, - "loss": 0.08522917, - "memory(GiB)": 74.41, - "step": 2815, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.33218482198074034, - "grad_norm": 4.03125, - "learning_rate": 7.516542637903692e-06, - "loss": 0.10329256, - "memory(GiB)": 74.41, - "step": 2820, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.3327738021615573, - "grad_norm": 2.640625, - "learning_rate": 7.508542717304286e-06, - "loss": 0.06652896, - "memory(GiB)": 74.41, - "step": 2825, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.33336278234237415, - "grad_norm": 3.03125, - "learning_rate": 7.500534205572932e-06, - "loss": 0.07229707, - "memory(GiB)": 74.41, - "step": 2830, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.3339517625231911, - "grad_norm": 3.40625, - "learning_rate": 7.492517130136781e-06, - "loss": 0.07162905, - "memory(GiB)": 74.41, - "step": 2835, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.334540742704008, - "grad_norm": 3.40625, - "learning_rate": 7.484491518452315e-06, - "loss": 0.06838317, - "memory(GiB)": 74.41, - "step": 2840, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.3351297228848249, - "grad_norm": 3.75, - "learning_rate": 7.476457398005246e-06, - "loss": 0.09434291, - "memory(GiB)": 74.41, - "step": 2845, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.33571870306564183, - "grad_norm": 3.9375, - "learning_rate": 7.46841479631043e-06, - "loss": 0.08622286, - "memory(GiB)": 74.41, - "step": 2850, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.33630768324645877, - "grad_norm": 3.171875, - "learning_rate": 7.460363740911769e-06, - "loss": 0.0905494, - "memory(GiB)": 74.41, - "step": 2855, - "train_speed(iter/s)": 0.074926 - }, - { - "epoch": 0.3368966634272757, - "grad_norm": 3.71875, - "learning_rate": 7.452304259382115e-06, - "loss": 0.08356917, - "memory(GiB)": 74.41, - "step": 2860, - "train_speed(iter/s)": 0.074927 - }, - { - "epoch": 0.3374856436080926, - "grad_norm": 4.40625, - "learning_rate": 7.4442363793231775e-06, - "loss": 0.09314897, - "memory(GiB)": 74.41, - "step": 2865, - "train_speed(iter/s)": 0.074926 - }, - { - "epoch": 0.3380746237889095, - "grad_norm": 3.34375, - "learning_rate": 7.436160128365431e-06, - "loss": 0.0789938, - "memory(GiB)": 74.41, - "step": 2870, - "train_speed(iter/s)": 0.074925 - }, - { - "epoch": 0.33866360396972645, - "grad_norm": 3.34375, - "learning_rate": 7.428075534168015e-06, - "loss": 0.08869631, - "memory(GiB)": 74.41, - "step": 2875, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.3392525841505433, - "grad_norm": 3.140625, - "learning_rate": 7.419982624418647e-06, - "loss": 0.07138891, - "memory(GiB)": 74.41, - "step": 2880, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.33984156433136026, - "grad_norm": 4.40625, - "learning_rate": 7.4118814268335205e-06, - "loss": 0.09039434, - "memory(GiB)": 74.41, - "step": 2885, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.3404305445121772, - "grad_norm": 3.640625, - "learning_rate": 7.403771969157209e-06, - "loss": 0.08224541, - "memory(GiB)": 74.41, - "step": 2890, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.34101952469299407, - "grad_norm": 3.25, - "learning_rate": 7.39565427916258e-06, - "loss": 0.08286152, - "memory(GiB)": 74.41, - "step": 2895, - "train_speed(iter/s)": 0.074924 - }, - { - "epoch": 0.341608504873811, - "grad_norm": 3.140625, - "learning_rate": 7.387528384650699e-06, - "loss": 0.06858167, - "memory(GiB)": 74.41, - "step": 2900, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.34219748505462794, - "grad_norm": 3.40625, - "learning_rate": 7.379394313450714e-06, - "loss": 0.07839249, - "memory(GiB)": 74.41, - "step": 2905, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.3427864652354448, - "grad_norm": 3.734375, - "learning_rate": 7.371252093419795e-06, - "loss": 0.06531736, - "memory(GiB)": 74.41, - "step": 2910, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.34337544541626175, - "grad_norm": 2.953125, - "learning_rate": 7.363101752443007e-06, - "loss": 0.07913511, - "memory(GiB)": 74.41, - "step": 2915, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.3439644255970787, - "grad_norm": 3.015625, - "learning_rate": 7.3549433184332306e-06, - "loss": 0.08093199, - "memory(GiB)": 74.41, - "step": 2920, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.34455340577789556, - "grad_norm": 4.84375, - "learning_rate": 7.346776819331065e-06, - "loss": 0.06076463, - "memory(GiB)": 74.41, - "step": 2925, - "train_speed(iter/s)": 0.074923 - }, - { - "epoch": 0.3451423859587125, - "grad_norm": 4.15625, - "learning_rate": 7.338602283104731e-06, - "loss": 0.08175699, - "memory(GiB)": 74.41, - "step": 2930, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.34573136613952943, - "grad_norm": 3.4375, - "learning_rate": 7.330419737749967e-06, - "loss": 0.06969818, - "memory(GiB)": 74.41, - "step": 2935, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.3463203463203463, - "grad_norm": 3.546875, - "learning_rate": 7.3222292112899505e-06, - "loss": 0.08905634, - "memory(GiB)": 74.41, - "step": 2940, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.34690932650116324, - "grad_norm": 3.390625, - "learning_rate": 7.314030731775187e-06, - "loss": 0.07148146, - "memory(GiB)": 74.41, - "step": 2945, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.3474983066819802, - "grad_norm": 3.375, - "learning_rate": 7.305824327283419e-06, - "loss": 0.08839546, - "memory(GiB)": 74.41, - "step": 2950, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.34808728686279705, - "grad_norm": 4.5625, - "learning_rate": 7.297610025919533e-06, - "loss": 0.079582, - "memory(GiB)": 74.41, - "step": 2955, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.348676267043614, - "grad_norm": 5.0, - "learning_rate": 7.289387855815455e-06, - "loss": 0.07907141, - "memory(GiB)": 74.41, - "step": 2960, - "train_speed(iter/s)": 0.07492 - }, - { - "epoch": 0.3492652472244309, - "grad_norm": 3.46875, - "learning_rate": 7.281157845130067e-06, - "loss": 0.08737714, - "memory(GiB)": 74.41, - "step": 2965, - "train_speed(iter/s)": 0.074919 - }, - { - "epoch": 0.3498542274052478, - "grad_norm": 3.5, - "learning_rate": 7.272920022049097e-06, - "loss": 0.07906392, - "memory(GiB)": 74.41, - "step": 2970, - "train_speed(iter/s)": 0.074919 - }, - { - "epoch": 0.35044320758606473, - "grad_norm": 3.453125, - "learning_rate": 7.264674414785031e-06, - "loss": 0.08779007, - "memory(GiB)": 74.41, - "step": 2975, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.35103218776688166, - "grad_norm": 3.265625, - "learning_rate": 7.256421051577013e-06, - "loss": 0.06584383, - "memory(GiB)": 74.41, - "step": 2980, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.35162116794769854, - "grad_norm": 4.21875, - "learning_rate": 7.24815996069075e-06, - "loss": 0.08898224, - "memory(GiB)": 74.41, - "step": 2985, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.3522101481285155, - "grad_norm": 3.6875, - "learning_rate": 7.239891170418414e-06, - "loss": 0.08532063, - "memory(GiB)": 74.41, - "step": 2990, - "train_speed(iter/s)": 0.074921 - }, - { - "epoch": 0.3527991283093324, - "grad_norm": 3.28125, - "learning_rate": 7.231614709078545e-06, - "loss": 0.08233502, - "memory(GiB)": 74.41, - "step": 2995, - "train_speed(iter/s)": 0.074922 - }, - { - "epoch": 0.3533881084901493, - "grad_norm": 3.3125, - "learning_rate": 7.223330605015955e-06, - "loss": 0.09144391, - "memory(GiB)": 74.41, - "step": 3000, - "train_speed(iter/s)": 0.074923 + "train_speed(iter/s)": 0.018489 } ], "logging_steps": 5, - "max_steps": 8489, + "max_steps": 4244, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 500, + "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { @@ -5435,7 +755,7 @@ "attributes": {} } }, - "total_flos": 4.858899770166477e+17, + "total_flos": 4.4341415068565504e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null