|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.99836867862969, |
|
"eval_steps": 100, |
|
"global_step": 306, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01631321370309951, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 1.4286, |
|
"mean_token_accuracy": 0.6584272754958135, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03262642740619902, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 1.4087, |
|
"mean_token_accuracy": 0.662085565849035, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.048939641109298535, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 1.5058, |
|
"mean_token_accuracy": 0.6418076518243186, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06525285481239804, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.2903225806451613e-05, |
|
"loss": 1.4118, |
|
"mean_token_accuracy": 0.6534564204767823, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08156606851549755, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 1.6129032258064517e-05, |
|
"loss": 1.3321, |
|
"mean_token_accuracy": 0.6651619066075792, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.09787928221859707, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.935483870967742e-05, |
|
"loss": 1.3164, |
|
"mean_token_accuracy": 0.6717782021288032, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11419249592169657, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.9989561243382313e-05, |
|
"loss": 1.1912, |
|
"mean_token_accuracy": 0.694698959999242, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.13050570962479607, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 1.9947191143073185e-05, |
|
"loss": 1.2719, |
|
"mean_token_accuracy": 0.6780847732470013, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1468189233278956, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.9872375372801627e-05, |
|
"loss": 1.1799, |
|
"mean_token_accuracy": 0.6968185226866065, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1631321370309951, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.9765357966059638e-05, |
|
"loss": 1.2948, |
|
"mean_token_accuracy": 0.672758400550632, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17944535073409462, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.9626487991384194e-05, |
|
"loss": 1.217, |
|
"mean_token_accuracy": 0.6891739777463799, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.19575856443719414, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.945621841376825e-05, |
|
"loss": 1.1489, |
|
"mean_token_accuracy": 0.7052165754765654, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21207177814029363, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.9255104617183068e-05, |
|
"loss": 1.2015, |
|
"mean_token_accuracy": 0.6921520639873796, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22838499184339314, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.9023802593031156e-05, |
|
"loss": 1.2204, |
|
"mean_token_accuracy": 0.6918742825268815, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.24469820554649266, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.8763066800438638e-05, |
|
"loss": 1.2876, |
|
"mean_token_accuracy": 0.6744394009627726, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.26101141924959215, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.8473747705366427e-05, |
|
"loss": 1.2127, |
|
"mean_token_accuracy": 0.6919606949918741, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27732463295269166, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.8156789006567018e-05, |
|
"loss": 1.2829, |
|
"mean_token_accuracy": 0.6730983288833745, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2936378466557912, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.7813224557435313e-05, |
|
"loss": 1.2617, |
|
"mean_token_accuracy": 0.677167603530814, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3099510603588907, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 1.744417499379372e-05, |
|
"loss": 1.2741, |
|
"mean_token_accuracy": 0.6805341821135611, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3262642740619902, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.7050844078611058e-05, |
|
"loss": 1.2389, |
|
"mean_token_accuracy": 0.6857057946710582, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3425774877650897, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.663451477557792e-05, |
|
"loss": 1.2223, |
|
"mean_token_accuracy": 0.687469468283924, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.35889070146818924, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.6196545064345813e-05, |
|
"loss": 1.214, |
|
"mean_token_accuracy": 0.6902886780811677, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.37520391517128876, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.5738363511079776e-05, |
|
"loss": 1.2245, |
|
"mean_token_accuracy": 0.6857961919369919, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3915171288743883, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.5261464608772487e-05, |
|
"loss": 1.1697, |
|
"mean_token_accuracy": 0.7004586354413661, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4078303425774878, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.476740390251875e-05, |
|
"loss": 1.1864, |
|
"mean_token_accuracy": 0.6946914374684998, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.42414355628058725, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.4257792915650728e-05, |
|
"loss": 1.2256, |
|
"mean_token_accuracy": 0.6917642628755305, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.44045676998368677, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.3734293893283783e-05, |
|
"loss": 1.2234, |
|
"mean_token_accuracy": 0.6803009834502376, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4567699836867863, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.3198614380418412e-05, |
|
"loss": 1.2891, |
|
"mean_token_accuracy": 0.66906340898559, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4730831973898858, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.2652501652283378e-05, |
|
"loss": 1.2833, |
|
"mean_token_accuracy": 0.6748017583716992, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4893964110929853, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.2097737015087094e-05, |
|
"loss": 1.1936, |
|
"mean_token_accuracy": 0.691278874465356, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5057096247960848, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.1536129995766995e-05, |
|
"loss": 1.1923, |
|
"mean_token_accuracy": 0.6915639418258935, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5220228384991843, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 1.0969512439688816e-05, |
|
"loss": 1.2689, |
|
"mean_token_accuracy": 0.6795721711260161, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5383360522022839, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.0399732535547735e-05, |
|
"loss": 1.2322, |
|
"mean_token_accuracy": 0.6911618495404431, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5546492659053833, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.828648786961009e-06, |
|
"loss": 1.3013, |
|
"mean_token_accuracy": 0.6691213045203754, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5709624796084829, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.25812395041548e-06, |
|
"loss": 1.2362, |
|
"mean_token_accuracy": 0.6846231446931508, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5872756933115824, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 8.690018959343071e-06, |
|
"loss": 1.1778, |
|
"mean_token_accuracy": 0.6980786468955738, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6035889070146819, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 8.126186854142752e-06, |
|
"loss": 1.1546, |
|
"mean_token_accuracy": 0.698524151500448, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6199021207177814, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 7.568466737947905e-06, |
|
"loss": 1.1121, |
|
"mean_token_accuracy": 0.7087621864221246, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.636215334420881, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 7.018677777854158e-06, |
|
"loss": 1.2979, |
|
"mean_token_accuracy": 0.6694032774839085, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6525285481239804, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 6.478613271174453e-06, |
|
"loss": 1.3048, |
|
"mean_token_accuracy": 0.6659829648734708, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6688417618270799, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 5.950034796075948e-06, |
|
"loss": 1.2435, |
|
"mean_token_accuracy": 0.6830356432134138, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6851549755301795, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 5.434666465678176e-06, |
|
"loss": 1.3219, |
|
"mean_token_accuracy": 0.6673393247330371, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7014681892332789, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.934189304354418e-06, |
|
"loss": 1.2301, |
|
"mean_token_accuracy": 0.6839774101528373, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7177814029363785, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.450235764579598e-06, |
|
"loss": 1.2208, |
|
"mean_token_accuracy": 0.6862650424423485, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.734094616639478, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.984384402209613e-06, |
|
"loss": 1.2143, |
|
"mean_token_accuracy": 0.6849734049971641, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7504078303425775, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.538154727560259e-06, |
|
"loss": 1.1558, |
|
"mean_token_accuracy": 0.6994565541878519, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.766721044045677, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.1130022490803856e-06, |
|
"loss": 1.1774, |
|
"mean_token_accuracy": 0.6945823398256653, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7830342577487766, |
|
"grad_norm": 1.25, |
|
"learning_rate": 2.7103137257858867e-06, |
|
"loss": 1.142, |
|
"mean_token_accuracy": 0.7025941539981695, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.799347471451876, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 2.3314026439400217e-06, |
|
"loss": 1.2946, |
|
"mean_token_accuracy": 0.6670428901128733, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8156606851549756, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.9775049327342486e-06, |
|
"loss": 1.1889, |
|
"mean_token_accuracy": 0.6966172545481755, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.831973898858075, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.649774932944075e-06, |
|
"loss": 1.1777, |
|
"mean_token_accuracy": 0.7020263962587155, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8482871125611745, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.3492816317093894e-06, |
|
"loss": 1.1444, |
|
"mean_token_accuracy": 0.7035694430910295, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8646003262642741, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.0770051757206078e-06, |
|
"loss": 1.1997, |
|
"mean_token_accuracy": 0.6922763738951797, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8809135399673735, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 8.338336741838837e-07, |
|
"loss": 1.1993, |
|
"mean_token_accuracy": 0.6892363770014897, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8972267536704731, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.205603019934791e-07, |
|
"loss": 1.2311, |
|
"mean_token_accuracy": 0.6834054369061775, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9135399673735726, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 4.3788071256013033e-07, |
|
"loss": 1.2076, |
|
"mean_token_accuracy": 0.6909252205130498, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9298531810766721, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.863907687341949e-07, |
|
"loss": 1.1762, |
|
"mean_token_accuracy": 0.69716578948841, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9461663947797716, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.665845992249071e-07, |
|
"loss": 1.1792, |
|
"mean_token_accuracy": 0.6928531967188305, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9624796084828712, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 7.885298685522235e-08, |
|
"loss": 1.1642, |
|
"mean_token_accuracy": 0.694887794722856, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9787928221859706, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.348209390947376e-08, |
|
"loss": 1.234, |
|
"mean_token_accuracy": 0.6835920887329172, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9951060358890701, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 6.525287314851358e-10, |
|
"loss": 1.1268, |
|
"mean_token_accuracy": 0.7035911209586428, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.99836867862969, |
|
"mean_token_accuracy": 0.7046485858069031, |
|
"step": 306, |
|
"total_flos": 7.883277455484518e+16, |
|
"train_loss": 1.23619465266957, |
|
"train_runtime": 1783.7305, |
|
"train_samples_per_second": 1.374, |
|
"train_steps_per_second": 0.172 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 306, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.883277455484518e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|