{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99836867862969, "eval_steps": 100, "global_step": 306, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01631321370309951, "grad_norm": 2.53125, "learning_rate": 3.225806451612903e-06, "loss": 1.4286, "mean_token_accuracy": 0.6584272754958135, "step": 5 }, { "epoch": 0.03262642740619902, "grad_norm": 2.4375, "learning_rate": 6.451612903225806e-06, "loss": 1.4087, "mean_token_accuracy": 0.662085565849035, "step": 10 }, { "epoch": 0.048939641109298535, "grad_norm": 2.640625, "learning_rate": 9.67741935483871e-06, "loss": 1.5058, "mean_token_accuracy": 0.6418076518243186, "step": 15 }, { "epoch": 0.06525285481239804, "grad_norm": 1.7421875, "learning_rate": 1.2903225806451613e-05, "loss": 1.4118, "mean_token_accuracy": 0.6534564204767823, "step": 20 }, { "epoch": 0.08156606851549755, "grad_norm": 1.515625, "learning_rate": 1.6129032258064517e-05, "loss": 1.3321, "mean_token_accuracy": 0.6651619066075792, "step": 25 }, { "epoch": 0.09787928221859707, "grad_norm": 1.3671875, "learning_rate": 1.935483870967742e-05, "loss": 1.3164, "mean_token_accuracy": 0.6717782021288032, "step": 30 }, { "epoch": 0.11419249592169657, "grad_norm": 1.3359375, "learning_rate": 1.9989561243382313e-05, "loss": 1.1912, "mean_token_accuracy": 0.694698959999242, "step": 35 }, { "epoch": 0.13050570962479607, "grad_norm": 1.3203125, "learning_rate": 1.9947191143073185e-05, "loss": 1.2719, "mean_token_accuracy": 0.6780847732470013, "step": 40 }, { "epoch": 0.1468189233278956, "grad_norm": 1.234375, "learning_rate": 1.9872375372801627e-05, "loss": 1.1799, "mean_token_accuracy": 0.6968185226866065, "step": 45 }, { "epoch": 0.1631321370309951, "grad_norm": 1.2421875, "learning_rate": 1.9765357966059638e-05, "loss": 1.2948, "mean_token_accuracy": 0.672758400550632, "step": 50 }, { "epoch": 0.17944535073409462, "grad_norm": 1.203125, "learning_rate": 1.9626487991384194e-05, "loss": 1.217, "mean_token_accuracy": 0.6891739777463799, "step": 55 }, { "epoch": 0.19575856443719414, "grad_norm": 1.2109375, "learning_rate": 1.945621841376825e-05, "loss": 1.1489, "mean_token_accuracy": 0.7052165754765654, "step": 60 }, { "epoch": 0.21207177814029363, "grad_norm": 1.296875, "learning_rate": 1.9255104617183068e-05, "loss": 1.2015, "mean_token_accuracy": 0.6921520639873796, "step": 65 }, { "epoch": 0.22838499184339314, "grad_norm": 1.140625, "learning_rate": 1.9023802593031156e-05, "loss": 1.2204, "mean_token_accuracy": 0.6918742825268815, "step": 70 }, { "epoch": 0.24469820554649266, "grad_norm": 1.234375, "learning_rate": 1.8763066800438638e-05, "loss": 1.2876, "mean_token_accuracy": 0.6744394009627726, "step": 75 }, { "epoch": 0.26101141924959215, "grad_norm": 1.2421875, "learning_rate": 1.8473747705366427e-05, "loss": 1.2127, "mean_token_accuracy": 0.6919606949918741, "step": 80 }, { "epoch": 0.27732463295269166, "grad_norm": 1.234375, "learning_rate": 1.8156789006567018e-05, "loss": 1.2829, "mean_token_accuracy": 0.6730983288833745, "step": 85 }, { "epoch": 0.2936378466557912, "grad_norm": 1.171875, "learning_rate": 1.7813224557435313e-05, "loss": 1.2617, "mean_token_accuracy": 0.677167603530814, "step": 90 }, { "epoch": 0.3099510603588907, "grad_norm": 1.1953125, "learning_rate": 1.744417499379372e-05, "loss": 1.2741, "mean_token_accuracy": 0.6805341821135611, "step": 95 }, { "epoch": 0.3262642740619902, "grad_norm": 1.125, "learning_rate": 1.7050844078611058e-05, "loss": 1.2389, "mean_token_accuracy": 0.6857057946710582, "step": 100 }, { "epoch": 0.3425774877650897, "grad_norm": 1.2578125, "learning_rate": 1.663451477557792e-05, "loss": 1.2223, "mean_token_accuracy": 0.687469468283924, "step": 105 }, { "epoch": 0.35889070146818924, "grad_norm": 1.234375, "learning_rate": 1.6196545064345813e-05, "loss": 1.214, "mean_token_accuracy": 0.6902886780811677, "step": 110 }, { "epoch": 0.37520391517128876, "grad_norm": 1.203125, "learning_rate": 1.5738363511079776e-05, "loss": 1.2245, "mean_token_accuracy": 0.6857961919369919, "step": 115 }, { "epoch": 0.3915171288743883, "grad_norm": 1.2109375, "learning_rate": 1.5261464608772487e-05, "loss": 1.1697, "mean_token_accuracy": 0.7004586354413661, "step": 120 }, { "epoch": 0.4078303425774878, "grad_norm": 1.1875, "learning_rate": 1.476740390251875e-05, "loss": 1.1864, "mean_token_accuracy": 0.6946914374684998, "step": 125 }, { "epoch": 0.42414355628058725, "grad_norm": 1.2578125, "learning_rate": 1.4257792915650728e-05, "loss": 1.2256, "mean_token_accuracy": 0.6917642628755305, "step": 130 }, { "epoch": 0.44045676998368677, "grad_norm": 1.2109375, "learning_rate": 1.3734293893283783e-05, "loss": 1.2234, "mean_token_accuracy": 0.6803009834502376, "step": 135 }, { "epoch": 0.4567699836867863, "grad_norm": 1.296875, "learning_rate": 1.3198614380418412e-05, "loss": 1.2891, "mean_token_accuracy": 0.66906340898559, "step": 140 }, { "epoch": 0.4730831973898858, "grad_norm": 1.140625, "learning_rate": 1.2652501652283378e-05, "loss": 1.2833, "mean_token_accuracy": 0.6748017583716992, "step": 145 }, { "epoch": 0.4893964110929853, "grad_norm": 1.234375, "learning_rate": 1.2097737015087094e-05, "loss": 1.1936, "mean_token_accuracy": 0.691278874465356, "step": 150 }, { "epoch": 0.5057096247960848, "grad_norm": 1.140625, "learning_rate": 1.1536129995766995e-05, "loss": 1.1923, "mean_token_accuracy": 0.6915639418258935, "step": 155 }, { "epoch": 0.5220228384991843, "grad_norm": 1.2578125, "learning_rate": 1.0969512439688816e-05, "loss": 1.2689, "mean_token_accuracy": 0.6795721711260161, "step": 160 }, { "epoch": 0.5383360522022839, "grad_norm": 1.2109375, "learning_rate": 1.0399732535547735e-05, "loss": 1.2322, "mean_token_accuracy": 0.6911618495404431, "step": 165 }, { "epoch": 0.5546492659053833, "grad_norm": 1.140625, "learning_rate": 9.828648786961009e-06, "loss": 1.3013, "mean_token_accuracy": 0.6691213045203754, "step": 170 }, { "epoch": 0.5709624796084829, "grad_norm": 1.140625, "learning_rate": 9.25812395041548e-06, "loss": 1.2362, "mean_token_accuracy": 0.6846231446931508, "step": 175 }, { "epoch": 0.5872756933115824, "grad_norm": 1.1171875, "learning_rate": 8.690018959343071e-06, "loss": 1.1778, "mean_token_accuracy": 0.6980786468955738, "step": 180 }, { "epoch": 0.6035889070146819, "grad_norm": 1.1796875, "learning_rate": 8.126186854142752e-06, "loss": 1.1546, "mean_token_accuracy": 0.698524151500448, "step": 185 }, { "epoch": 0.6199021207177814, "grad_norm": 1.1875, "learning_rate": 7.568466737947905e-06, "loss": 1.1121, "mean_token_accuracy": 0.7087621864221246, "step": 190 }, { "epoch": 0.636215334420881, "grad_norm": 1.28125, "learning_rate": 7.018677777854158e-06, "loss": 1.2979, "mean_token_accuracy": 0.6694032774839085, "step": 195 }, { "epoch": 0.6525285481239804, "grad_norm": 1.2734375, "learning_rate": 6.478613271174453e-06, "loss": 1.3048, "mean_token_accuracy": 0.6659829648734708, "step": 200 }, { "epoch": 0.6688417618270799, "grad_norm": 1.2265625, "learning_rate": 5.950034796075948e-06, "loss": 1.2435, "mean_token_accuracy": 0.6830356432134138, "step": 205 }, { "epoch": 0.6851549755301795, "grad_norm": 1.203125, "learning_rate": 5.434666465678176e-06, "loss": 1.3219, "mean_token_accuracy": 0.6673393247330371, "step": 210 }, { "epoch": 0.7014681892332789, "grad_norm": 1.3046875, "learning_rate": 4.934189304354418e-06, "loss": 1.2301, "mean_token_accuracy": 0.6839774101528373, "step": 215 }, { "epoch": 0.7177814029363785, "grad_norm": 1.1328125, "learning_rate": 4.450235764579598e-06, "loss": 1.2208, "mean_token_accuracy": 0.6862650424423485, "step": 220 }, { "epoch": 0.734094616639478, "grad_norm": 1.265625, "learning_rate": 3.984384402209613e-06, "loss": 1.2143, "mean_token_accuracy": 0.6849734049971641, "step": 225 }, { "epoch": 0.7504078303425775, "grad_norm": 1.140625, "learning_rate": 3.538154727560259e-06, "loss": 1.1558, "mean_token_accuracy": 0.6994565541878519, "step": 230 }, { "epoch": 0.766721044045677, "grad_norm": 1.09375, "learning_rate": 3.1130022490803856e-06, "loss": 1.1774, "mean_token_accuracy": 0.6945823398256653, "step": 235 }, { "epoch": 0.7830342577487766, "grad_norm": 1.25, "learning_rate": 2.7103137257858867e-06, "loss": 1.142, "mean_token_accuracy": 0.7025941539981695, "step": 240 }, { "epoch": 0.799347471451876, "grad_norm": 1.2109375, "learning_rate": 2.3314026439400217e-06, "loss": 1.2946, "mean_token_accuracy": 0.6670428901128733, "step": 245 }, { "epoch": 0.8156606851549756, "grad_norm": 1.09375, "learning_rate": 1.9775049327342486e-06, "loss": 1.1889, "mean_token_accuracy": 0.6966172545481755, "step": 250 }, { "epoch": 0.831973898858075, "grad_norm": 1.1640625, "learning_rate": 1.649774932944075e-06, "loss": 1.1777, "mean_token_accuracy": 0.7020263962587155, "step": 255 }, { "epoch": 0.8482871125611745, "grad_norm": 1.1796875, "learning_rate": 1.3492816317093894e-06, "loss": 1.1444, "mean_token_accuracy": 0.7035694430910295, "step": 260 }, { "epoch": 0.8646003262642741, "grad_norm": 1.1171875, "learning_rate": 1.0770051757206078e-06, "loss": 1.1997, "mean_token_accuracy": 0.6922763738951797, "step": 265 }, { "epoch": 0.8809135399673735, "grad_norm": 1.140625, "learning_rate": 8.338336741838837e-07, "loss": 1.1993, "mean_token_accuracy": 0.6892363770014897, "step": 270 }, { "epoch": 0.8972267536704731, "grad_norm": 1.1328125, "learning_rate": 6.205603019934791e-07, "loss": 1.2311, "mean_token_accuracy": 0.6834054369061775, "step": 275 }, { "epoch": 0.9135399673735726, "grad_norm": 1.1328125, "learning_rate": 4.3788071256013033e-07, "loss": 1.2076, "mean_token_accuracy": 0.6909252205130498, "step": 280 }, { "epoch": 0.9298531810766721, "grad_norm": 1.1640625, "learning_rate": 2.863907687341949e-07, "loss": 1.1762, "mean_token_accuracy": 0.69716578948841, "step": 285 }, { "epoch": 0.9461663947797716, "grad_norm": 1.1484375, "learning_rate": 1.665845992249071e-07, "loss": 1.1792, "mean_token_accuracy": 0.6928531967188305, "step": 290 }, { "epoch": 0.9624796084828712, "grad_norm": 1.3046875, "learning_rate": 7.885298685522235e-08, "loss": 1.1642, "mean_token_accuracy": 0.694887794722856, "step": 295 }, { "epoch": 0.9787928221859706, "grad_norm": 1.28125, "learning_rate": 2.348209390947376e-08, "loss": 1.234, "mean_token_accuracy": 0.6835920887329172, "step": 300 }, { "epoch": 0.9951060358890701, "grad_norm": 1.203125, "learning_rate": 6.525287314851358e-10, "loss": 1.1268, "mean_token_accuracy": 0.7035911209586428, "step": 305 }, { "epoch": 0.99836867862969, "mean_token_accuracy": 0.7046485858069031, "step": 306, "total_flos": 7.883277455484518e+16, "train_loss": 1.23619465266957, "train_runtime": 1783.7305, "train_samples_per_second": 1.374, "train_steps_per_second": 0.172 } ], "logging_steps": 5, "max_steps": 306, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.883277455484518e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }