{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9978976874562018, "eval_steps": 500, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01868722261153936, "grad_norm": 3.5830482905634797, "learning_rate": 1.785714285714286e-05, "loss": 0.8152, "mean_token_accuracy": 0.7747266553342342, "step": 5 }, { "epoch": 0.03737444522307872, "grad_norm": 1.6625011834876648, "learning_rate": 3.571428571428572e-05, "loss": 0.7025, "mean_token_accuracy": 0.7917468748986721, "step": 10 }, { "epoch": 0.05606166783461808, "grad_norm": 1.5899441749058187, "learning_rate": 4.9998265374824964e-05, "loss": 0.629, "mean_token_accuracy": 0.8071001760661602, "step": 15 }, { "epoch": 0.07474889044615744, "grad_norm": 1.5979853248536116, "learning_rate": 4.993758157237536e-05, "loss": 0.5972, "mean_token_accuracy": 0.8143526442348957, "step": 20 }, { "epoch": 0.09343611305769679, "grad_norm": 1.2674965103641067, "learning_rate": 4.979043378581744e-05, "loss": 0.5727, "mean_token_accuracy": 0.8199357651174068, "step": 25 }, { "epoch": 0.11212333566923616, "grad_norm": 1.3685254142570102, "learning_rate": 4.9557389054153965e-05, "loss": 0.5555, "mean_token_accuracy": 0.8241050921380519, "step": 30 }, { "epoch": 0.1308105582807755, "grad_norm": 0.9457691254500147, "learning_rate": 4.923934542318854e-05, "loss": 0.5409, "mean_token_accuracy": 0.8273717932403087, "step": 35 }, { "epoch": 0.14949778089231489, "grad_norm": 0.9189569415750541, "learning_rate": 4.883752848487571e-05, "loss": 0.5272, "mean_token_accuracy": 0.8308477029204369, "step": 40 }, { "epoch": 0.16818500350385424, "grad_norm": 0.7752642460093544, "learning_rate": 4.835348665446049e-05, "loss": 0.5213, "mean_token_accuracy": 0.8322811633348465, "step": 45 }, { "epoch": 0.18687222611539359, "grad_norm": 0.8083941553455664, "learning_rate": 4.7789085203607664e-05, "loss": 0.5118, "mean_token_accuracy": 0.8346851594746113, "step": 50 }, { "epoch": 0.20555944872693296, "grad_norm": 0.9161612194082712, "learning_rate": 4.714649907251388e-05, "loss": 0.5088, "mean_token_accuracy": 0.8352511122822761, "step": 55 }, { "epoch": 0.22424667133847231, "grad_norm": 0.6723034421711911, "learning_rate": 4.6428204488701576e-05, "loss": 0.5018, "mean_token_accuracy": 0.837028643488884, "step": 60 }, { "epoch": 0.2429338939500117, "grad_norm": 0.7871127694104725, "learning_rate": 4.563696942479205e-05, "loss": 0.5061, "mean_token_accuracy": 0.8351837247610092, "step": 65 }, { "epoch": 0.261621116561551, "grad_norm": 0.761272463434446, "learning_rate": 4.477584293202868e-05, "loss": 0.4939, "mean_token_accuracy": 0.8390413090586663, "step": 70 }, { "epoch": 0.2803083391730904, "grad_norm": 0.5083377211370889, "learning_rate": 4.384814339065424e-05, "loss": 0.4914, "mean_token_accuracy": 0.8395063504576683, "step": 75 }, { "epoch": 0.29899556178462977, "grad_norm": 0.908648503032802, "learning_rate": 4.285744572241972e-05, "loss": 0.4972, "mean_token_accuracy": 0.8376469679176808, "step": 80 }, { "epoch": 0.3176827843961691, "grad_norm": 0.5938541774391267, "learning_rate": 4.180756761450171e-05, "loss": 0.4816, "mean_token_accuracy": 0.8424232035875321, "step": 85 }, { "epoch": 0.33637000700770847, "grad_norm": 0.4849394515949123, "learning_rate": 4.070255480791492e-05, "loss": 0.491, "mean_token_accuracy": 0.8394017495214939, "step": 90 }, { "epoch": 0.35505722961924785, "grad_norm": 0.5718927262308711, "learning_rate": 3.954666550711159e-05, "loss": 0.4851, "mean_token_accuracy": 0.8409382797777653, "step": 95 }, { "epoch": 0.37374445223078717, "grad_norm": 0.5859959003534665, "learning_rate": 3.8344353970845606e-05, "loss": 0.4862, "mean_token_accuracy": 0.8411718301475049, "step": 100 }, { "epoch": 0.39243167484232655, "grad_norm": 0.3782337735273932, "learning_rate": 3.710025334753495e-05, "loss": 0.4834, "mean_token_accuracy": 0.8412449143826961, "step": 105 }, { "epoch": 0.41111889745386593, "grad_norm": 0.41111283052381054, "learning_rate": 3.581915782126652e-05, "loss": 0.476, "mean_token_accuracy": 0.8432458408176899, "step": 110 }, { "epoch": 0.4298061200654053, "grad_norm": 0.3851706170856147, "learning_rate": 3.4506004137244676e-05, "loss": 0.4851, "mean_token_accuracy": 0.8405788190662861, "step": 115 }, { "epoch": 0.44849334267694463, "grad_norm": 0.5062426249148336, "learning_rate": 3.3165852577875546e-05, "loss": 0.4785, "mean_token_accuracy": 0.8426314078271389, "step": 120 }, { "epoch": 0.467180565288484, "grad_norm": 0.40499027788768055, "learning_rate": 3.180386746279663e-05, "loss": 0.4747, "mean_token_accuracy": 0.843725374341011, "step": 125 }, { "epoch": 0.4858677879000234, "grad_norm": 0.47044740848341193, "learning_rate": 3.04252972479953e-05, "loss": 0.472, "mean_token_accuracy": 0.8443724811077118, "step": 130 }, { "epoch": 0.5045550105115627, "grad_norm": 0.4272272972180237, "learning_rate": 2.90354543007051e-05, "loss": 0.4725, "mean_token_accuracy": 0.8439367160201072, "step": 135 }, { "epoch": 0.523242233123102, "grad_norm": 0.4584455682048115, "learning_rate": 2.7639694428017792e-05, "loss": 0.4777, "mean_token_accuracy": 0.842538620531559, "step": 140 }, { "epoch": 0.5419294557346415, "grad_norm": 0.4589522708203329, "learning_rate": 2.6243396238098518e-05, "loss": 0.4693, "mean_token_accuracy": 0.8448904320597649, "step": 145 }, { "epoch": 0.5606166783461808, "grad_norm": 0.5239037802900407, "learning_rate": 2.4851940413536174e-05, "loss": 0.4697, "mean_token_accuracy": 0.8447436839342117, "step": 150 }, { "epoch": 0.5793039009577201, "grad_norm": 0.4866344062046232, "learning_rate": 2.347068897669999e-05, "loss": 0.4687, "mean_token_accuracy": 0.8448469452559948, "step": 155 }, { "epoch": 0.5979911235692595, "grad_norm": 0.3163374085877721, "learning_rate": 2.2104964627003848e-05, "loss": 0.4629, "mean_token_accuracy": 0.846843034029007, "step": 160 }, { "epoch": 0.6166783461807989, "grad_norm": 0.2747336073106856, "learning_rate": 2.0760030229702972e-05, "loss": 0.4612, "mean_token_accuracy": 0.8469885870814323, "step": 165 }, { "epoch": 0.6353655687923382, "grad_norm": 0.24722982457005138, "learning_rate": 1.9441068535263564e-05, "loss": 0.4596, "mean_token_accuracy": 0.8476050347089767, "step": 170 }, { "epoch": 0.6540527914038776, "grad_norm": 0.23852151209010736, "learning_rate": 1.815316220745756e-05, "loss": 0.4636, "mean_token_accuracy": 0.8460029393434525, "step": 175 }, { "epoch": 0.6727400140154169, "grad_norm": 0.2597676847867842, "learning_rate": 1.6901274237144782e-05, "loss": 0.4669, "mean_token_accuracy": 0.8451244607567787, "step": 180 }, { "epoch": 0.6914272366269563, "grad_norm": 0.8605153163863832, "learning_rate": 1.5690228817218815e-05, "loss": 0.4668, "mean_token_accuracy": 0.8468827910721302, "step": 185 }, { "epoch": 0.7101144592384957, "grad_norm": 0.24915841679008277, "learning_rate": 1.4524692752415493e-05, "loss": 0.4591, "mean_token_accuracy": 0.8473225735127926, "step": 190 }, { "epoch": 0.728801681850035, "grad_norm": 0.29798920762515824, "learning_rate": 1.3409157475622094e-05, "loss": 0.4576, "mean_token_accuracy": 0.847739252448082, "step": 195 }, { "epoch": 0.7474889044615743, "grad_norm": 0.26110601129292016, "learning_rate": 1.2347921739987815e-05, "loss": 0.4611, "mean_token_accuracy": 0.8468295410275459, "step": 200 }, { "epoch": 0.7661761270731138, "grad_norm": 0.2785413649007615, "learning_rate": 1.1345075053532287e-05, "loss": 0.4615, "mean_token_accuracy": 0.846584790199995, "step": 205 }, { "epoch": 0.7848633496846531, "grad_norm": 0.281588896295376, "learning_rate": 1.0404481920087206e-05, "loss": 0.4532, "mean_token_accuracy": 0.8491566374897956, "step": 210 }, { "epoch": 0.8035505722961925, "grad_norm": 0.26784943568942043, "learning_rate": 9.529766947299371e-06, "loss": 0.4555, "mean_token_accuracy": 0.8485622465610504, "step": 215 }, { "epoch": 0.8222377949077319, "grad_norm": 0.22085700610769077, "learning_rate": 8.724300879081718e-06, "loss": 0.4584, "mean_token_accuracy": 0.8476461283862591, "step": 220 }, { "epoch": 0.8409250175192712, "grad_norm": 0.1833370072353519, "learning_rate": 7.991187606337009e-06, "loss": 0.452, "mean_token_accuracy": 0.8494263976812363, "step": 225 }, { "epoch": 0.8596122401308106, "grad_norm": 0.20272051001866034, "learning_rate": 7.333252206008559e-06, "loss": 0.4538, "mean_token_accuracy": 0.8487676382064819, "step": 230 }, { "epoch": 0.8782994627423499, "grad_norm": 0.19595652872474512, "learning_rate": 6.753030054550158e-06, "loss": 0.4506, "mean_token_accuracy": 0.8496683083474637, "step": 235 }, { "epoch": 0.8969866853538893, "grad_norm": 0.19319039281954486, "learning_rate": 6.25275705776658e-06, "loss": 0.4519, "mean_token_accuracy": 0.8493411011993885, "step": 240 }, { "epoch": 0.9156739079654287, "grad_norm": 0.20035376740680347, "learning_rate": 5.834361034674521e-06, "loss": 0.4557, "mean_token_accuracy": 0.8482660032808781, "step": 245 }, { "epoch": 0.934361130576968, "grad_norm": 0.2010469617138832, "learning_rate": 5.499454288586379e-06, "loss": 0.453, "mean_token_accuracy": 0.8490205124020577, "step": 250 }, { "epoch": 0.9530483531885073, "grad_norm": 0.2001591137777418, "learning_rate": 5.24932739404462e-06, "loss": 0.4488, "mean_token_accuracy": 0.8502446681261062, "step": 255 }, { "epoch": 0.9717355758000468, "grad_norm": 0.18321355893669744, "learning_rate": 5.08494422354882e-06, "loss": 0.4518, "mean_token_accuracy": 0.849391470849514, "step": 260 }, { "epoch": 0.9904227984115861, "grad_norm": 0.18951844466307075, "learning_rate": 5.006938233240212e-06, "loss": 0.4554, "mean_token_accuracy": 0.8482832841575145, "step": 265 }, { "epoch": 0.9978976874562018, "mean_token_accuracy": 0.8498711809515953, "step": 267, "total_flos": 2841831180075008.0, "train_loss": 0.4941176689519418, "train_runtime": 26749.4727, "train_samples_per_second": 1.28, "train_steps_per_second": 0.01 } ], "logging_steps": 5, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2841831180075008.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }