|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9978976874562018, |
|
"eval_steps": 500, |
|
"global_step": 267, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01868722261153936, |
|
"grad_norm": 3.5830482905634797, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.8152, |
|
"mean_token_accuracy": 0.7747266553342342, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03737444522307872, |
|
"grad_norm": 1.6625011834876648, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.7025, |
|
"mean_token_accuracy": 0.7917468748986721, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05606166783461808, |
|
"grad_norm": 1.5899441749058187, |
|
"learning_rate": 4.9998265374824964e-05, |
|
"loss": 0.629, |
|
"mean_token_accuracy": 0.8071001760661602, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.07474889044615744, |
|
"grad_norm": 1.5979853248536116, |
|
"learning_rate": 4.993758157237536e-05, |
|
"loss": 0.5972, |
|
"mean_token_accuracy": 0.8143526442348957, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09343611305769679, |
|
"grad_norm": 1.2674965103641067, |
|
"learning_rate": 4.979043378581744e-05, |
|
"loss": 0.5727, |
|
"mean_token_accuracy": 0.8199357651174068, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.11212333566923616, |
|
"grad_norm": 1.3685254142570102, |
|
"learning_rate": 4.9557389054153965e-05, |
|
"loss": 0.5555, |
|
"mean_token_accuracy": 0.8241050921380519, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1308105582807755, |
|
"grad_norm": 0.9457691254500147, |
|
"learning_rate": 4.923934542318854e-05, |
|
"loss": 0.5409, |
|
"mean_token_accuracy": 0.8273717932403087, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.14949778089231489, |
|
"grad_norm": 0.9189569415750541, |
|
"learning_rate": 4.883752848487571e-05, |
|
"loss": 0.5272, |
|
"mean_token_accuracy": 0.8308477029204369, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16818500350385424, |
|
"grad_norm": 0.7752642460093544, |
|
"learning_rate": 4.835348665446049e-05, |
|
"loss": 0.5213, |
|
"mean_token_accuracy": 0.8322811633348465, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18687222611539359, |
|
"grad_norm": 0.8083941553455664, |
|
"learning_rate": 4.7789085203607664e-05, |
|
"loss": 0.5118, |
|
"mean_token_accuracy": 0.8346851594746113, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20555944872693296, |
|
"grad_norm": 0.9161612194082712, |
|
"learning_rate": 4.714649907251388e-05, |
|
"loss": 0.5088, |
|
"mean_token_accuracy": 0.8352511122822761, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22424667133847231, |
|
"grad_norm": 0.6723034421711911, |
|
"learning_rate": 4.6428204488701576e-05, |
|
"loss": 0.5018, |
|
"mean_token_accuracy": 0.837028643488884, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2429338939500117, |
|
"grad_norm": 0.7871127694104725, |
|
"learning_rate": 4.563696942479205e-05, |
|
"loss": 0.5061, |
|
"mean_token_accuracy": 0.8351837247610092, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.261621116561551, |
|
"grad_norm": 0.761272463434446, |
|
"learning_rate": 4.477584293202868e-05, |
|
"loss": 0.4939, |
|
"mean_token_accuracy": 0.8390413090586663, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2803083391730904, |
|
"grad_norm": 0.5083377211370889, |
|
"learning_rate": 4.384814339065424e-05, |
|
"loss": 0.4914, |
|
"mean_token_accuracy": 0.8395063504576683, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.29899556178462977, |
|
"grad_norm": 0.908648503032802, |
|
"learning_rate": 4.285744572241972e-05, |
|
"loss": 0.4972, |
|
"mean_token_accuracy": 0.8376469679176808, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3176827843961691, |
|
"grad_norm": 0.5938541774391267, |
|
"learning_rate": 4.180756761450171e-05, |
|
"loss": 0.4816, |
|
"mean_token_accuracy": 0.8424232035875321, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.33637000700770847, |
|
"grad_norm": 0.4849394515949123, |
|
"learning_rate": 4.070255480791492e-05, |
|
"loss": 0.491, |
|
"mean_token_accuracy": 0.8394017495214939, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.35505722961924785, |
|
"grad_norm": 0.5718927262308711, |
|
"learning_rate": 3.954666550711159e-05, |
|
"loss": 0.4851, |
|
"mean_token_accuracy": 0.8409382797777653, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.37374445223078717, |
|
"grad_norm": 0.5859959003534665, |
|
"learning_rate": 3.8344353970845606e-05, |
|
"loss": 0.4862, |
|
"mean_token_accuracy": 0.8411718301475049, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.39243167484232655, |
|
"grad_norm": 0.3782337735273932, |
|
"learning_rate": 3.710025334753495e-05, |
|
"loss": 0.4834, |
|
"mean_token_accuracy": 0.8412449143826961, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.41111889745386593, |
|
"grad_norm": 0.41111283052381054, |
|
"learning_rate": 3.581915782126652e-05, |
|
"loss": 0.476, |
|
"mean_token_accuracy": 0.8432458408176899, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4298061200654053, |
|
"grad_norm": 0.3851706170856147, |
|
"learning_rate": 3.4506004137244676e-05, |
|
"loss": 0.4851, |
|
"mean_token_accuracy": 0.8405788190662861, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.44849334267694463, |
|
"grad_norm": 0.5062426249148336, |
|
"learning_rate": 3.3165852577875546e-05, |
|
"loss": 0.4785, |
|
"mean_token_accuracy": 0.8426314078271389, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.467180565288484, |
|
"grad_norm": 0.40499027788768055, |
|
"learning_rate": 3.180386746279663e-05, |
|
"loss": 0.4747, |
|
"mean_token_accuracy": 0.843725374341011, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4858677879000234, |
|
"grad_norm": 0.47044740848341193, |
|
"learning_rate": 3.04252972479953e-05, |
|
"loss": 0.472, |
|
"mean_token_accuracy": 0.8443724811077118, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5045550105115627, |
|
"grad_norm": 0.4272272972180237, |
|
"learning_rate": 2.90354543007051e-05, |
|
"loss": 0.4725, |
|
"mean_token_accuracy": 0.8439367160201072, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.523242233123102, |
|
"grad_norm": 0.4584455682048115, |
|
"learning_rate": 2.7639694428017792e-05, |
|
"loss": 0.4777, |
|
"mean_token_accuracy": 0.842538620531559, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5419294557346415, |
|
"grad_norm": 0.4589522708203329, |
|
"learning_rate": 2.6243396238098518e-05, |
|
"loss": 0.4693, |
|
"mean_token_accuracy": 0.8448904320597649, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5606166783461808, |
|
"grad_norm": 0.5239037802900407, |
|
"learning_rate": 2.4851940413536174e-05, |
|
"loss": 0.4697, |
|
"mean_token_accuracy": 0.8447436839342117, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5793039009577201, |
|
"grad_norm": 0.4866344062046232, |
|
"learning_rate": 2.347068897669999e-05, |
|
"loss": 0.4687, |
|
"mean_token_accuracy": 0.8448469452559948, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5979911235692595, |
|
"grad_norm": 0.3163374085877721, |
|
"learning_rate": 2.2104964627003848e-05, |
|
"loss": 0.4629, |
|
"mean_token_accuracy": 0.846843034029007, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6166783461807989, |
|
"grad_norm": 0.2747336073106856, |
|
"learning_rate": 2.0760030229702972e-05, |
|
"loss": 0.4612, |
|
"mean_token_accuracy": 0.8469885870814323, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6353655687923382, |
|
"grad_norm": 0.24722982457005138, |
|
"learning_rate": 1.9441068535263564e-05, |
|
"loss": 0.4596, |
|
"mean_token_accuracy": 0.8476050347089767, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6540527914038776, |
|
"grad_norm": 0.23852151209010736, |
|
"learning_rate": 1.815316220745756e-05, |
|
"loss": 0.4636, |
|
"mean_token_accuracy": 0.8460029393434525, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6727400140154169, |
|
"grad_norm": 0.2597676847867842, |
|
"learning_rate": 1.6901274237144782e-05, |
|
"loss": 0.4669, |
|
"mean_token_accuracy": 0.8451244607567787, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6914272366269563, |
|
"grad_norm": 0.8605153163863832, |
|
"learning_rate": 1.5690228817218815e-05, |
|
"loss": 0.4668, |
|
"mean_token_accuracy": 0.8468827910721302, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7101144592384957, |
|
"grad_norm": 0.24915841679008277, |
|
"learning_rate": 1.4524692752415493e-05, |
|
"loss": 0.4591, |
|
"mean_token_accuracy": 0.8473225735127926, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.728801681850035, |
|
"grad_norm": 0.29798920762515824, |
|
"learning_rate": 1.3409157475622094e-05, |
|
"loss": 0.4576, |
|
"mean_token_accuracy": 0.847739252448082, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7474889044615743, |
|
"grad_norm": 0.26110601129292016, |
|
"learning_rate": 1.2347921739987815e-05, |
|
"loss": 0.4611, |
|
"mean_token_accuracy": 0.8468295410275459, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7661761270731138, |
|
"grad_norm": 0.2785413649007615, |
|
"learning_rate": 1.1345075053532287e-05, |
|
"loss": 0.4615, |
|
"mean_token_accuracy": 0.846584790199995, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.7848633496846531, |
|
"grad_norm": 0.281588896295376, |
|
"learning_rate": 1.0404481920087206e-05, |
|
"loss": 0.4532, |
|
"mean_token_accuracy": 0.8491566374897956, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8035505722961925, |
|
"grad_norm": 0.26784943568942043, |
|
"learning_rate": 9.529766947299371e-06, |
|
"loss": 0.4555, |
|
"mean_token_accuracy": 0.8485622465610504, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8222377949077319, |
|
"grad_norm": 0.22085700610769077, |
|
"learning_rate": 8.724300879081718e-06, |
|
"loss": 0.4584, |
|
"mean_token_accuracy": 0.8476461283862591, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8409250175192712, |
|
"grad_norm": 0.1833370072353519, |
|
"learning_rate": 7.991187606337009e-06, |
|
"loss": 0.452, |
|
"mean_token_accuracy": 0.8494263976812363, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8596122401308106, |
|
"grad_norm": 0.20272051001866034, |
|
"learning_rate": 7.333252206008559e-06, |
|
"loss": 0.4538, |
|
"mean_token_accuracy": 0.8487676382064819, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8782994627423499, |
|
"grad_norm": 0.19595652872474512, |
|
"learning_rate": 6.753030054550158e-06, |
|
"loss": 0.4506, |
|
"mean_token_accuracy": 0.8496683083474637, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8969866853538893, |
|
"grad_norm": 0.19319039281954486, |
|
"learning_rate": 6.25275705776658e-06, |
|
"loss": 0.4519, |
|
"mean_token_accuracy": 0.8493411011993885, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9156739079654287, |
|
"grad_norm": 0.20035376740680347, |
|
"learning_rate": 5.834361034674521e-06, |
|
"loss": 0.4557, |
|
"mean_token_accuracy": 0.8482660032808781, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.934361130576968, |
|
"grad_norm": 0.2010469617138832, |
|
"learning_rate": 5.499454288586379e-06, |
|
"loss": 0.453, |
|
"mean_token_accuracy": 0.8490205124020577, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9530483531885073, |
|
"grad_norm": 0.2001591137777418, |
|
"learning_rate": 5.24932739404462e-06, |
|
"loss": 0.4488, |
|
"mean_token_accuracy": 0.8502446681261062, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9717355758000468, |
|
"grad_norm": 0.18321355893669744, |
|
"learning_rate": 5.08494422354882e-06, |
|
"loss": 0.4518, |
|
"mean_token_accuracy": 0.849391470849514, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9904227984115861, |
|
"grad_norm": 0.18951844466307075, |
|
"learning_rate": 5.006938233240212e-06, |
|
"loss": 0.4554, |
|
"mean_token_accuracy": 0.8482832841575145, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9978976874562018, |
|
"mean_token_accuracy": 0.8498711809515953, |
|
"step": 267, |
|
"total_flos": 2841831180075008.0, |
|
"train_loss": 0.4941176689519418, |
|
"train_runtime": 26749.4727, |
|
"train_samples_per_second": 1.28, |
|
"train_steps_per_second": 0.01 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 267, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2841831180075008.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|