|
{ |
|
"best_metric": 1.6244161128997803, |
|
"best_model_checkpoint": "./output/checkpoints/2024-06-11_10-58-33/checkpoint-30", |
|
"epoch": 1.0, |
|
"eval_steps": 1, |
|
"global_step": 37, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02702702702702703, |
|
"grad_norm": 3.2274646759033203, |
|
"learning_rate": 0.0001, |
|
"loss": 5.7374, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02702702702702703, |
|
"eval_loss": 5.6431050300598145, |
|
"eval_runtime": 10.9746, |
|
"eval_samples_per_second": 11.299, |
|
"eval_steps_per_second": 0.729, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": 3.1417839527130127, |
|
"learning_rate": 0.0002, |
|
"loss": 5.6423, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"eval_loss": 5.1099138259887695, |
|
"eval_runtime": 11.0219, |
|
"eval_samples_per_second": 11.25, |
|
"eval_steps_per_second": 0.726, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.08108108108108109, |
|
"grad_norm": 3.1990153789520264, |
|
"learning_rate": 0.00030000000000000003, |
|
"loss": 5.0948, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.08108108108108109, |
|
"eval_loss": 3.559605836868286, |
|
"eval_runtime": 11.0991, |
|
"eval_samples_per_second": 11.172, |
|
"eval_steps_per_second": 0.721, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"grad_norm": 3.2903366088867188, |
|
"learning_rate": 0.0004, |
|
"loss": 3.4375, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"eval_loss": 2.3610196113586426, |
|
"eval_runtime": 11.0573, |
|
"eval_samples_per_second": 11.214, |
|
"eval_steps_per_second": 0.724, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 1.878879189491272, |
|
"learning_rate": 0.0003878787878787879, |
|
"loss": 2.2693, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"eval_loss": 1.8543975353240967, |
|
"eval_runtime": 11.1541, |
|
"eval_samples_per_second": 11.117, |
|
"eval_steps_per_second": 0.717, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"grad_norm": 1.2040495872497559, |
|
"learning_rate": 0.0003757575757575758, |
|
"loss": 1.7546, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"eval_loss": 1.7222554683685303, |
|
"eval_runtime": 11.0742, |
|
"eval_samples_per_second": 11.197, |
|
"eval_steps_per_second": 0.722, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1891891891891892, |
|
"grad_norm": 1.080614447593689, |
|
"learning_rate": 0.00036363636363636367, |
|
"loss": 1.6633, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.1891891891891892, |
|
"eval_loss": 1.610931158065796, |
|
"eval_runtime": 11.0872, |
|
"eval_samples_per_second": 11.184, |
|
"eval_steps_per_second": 0.722, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 0.28874385356903076, |
|
"learning_rate": 0.00035151515151515155, |
|
"loss": 1.5122, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"eval_loss": 1.5804481506347656, |
|
"eval_runtime": 11.053, |
|
"eval_samples_per_second": 11.219, |
|
"eval_steps_per_second": 0.724, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.24324324324324326, |
|
"grad_norm": 0.32991790771484375, |
|
"learning_rate": 0.00033939393939393943, |
|
"loss": 1.4316, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.24324324324324326, |
|
"eval_loss": 1.5746935606002808, |
|
"eval_runtime": 11.152, |
|
"eval_samples_per_second": 11.119, |
|
"eval_steps_per_second": 0.717, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 0.5137693285942078, |
|
"learning_rate": 0.0003272727272727273, |
|
"loss": 1.3161, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"eval_loss": 1.651396632194519, |
|
"eval_runtime": 11.1562, |
|
"eval_samples_per_second": 11.115, |
|
"eval_steps_per_second": 0.717, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2972972972972973, |
|
"grad_norm": 0.25246673822402954, |
|
"learning_rate": 0.00031515151515151515, |
|
"loss": 1.207, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.2972972972972973, |
|
"eval_loss": 1.7246230840682983, |
|
"eval_runtime": 11.1298, |
|
"eval_samples_per_second": 11.141, |
|
"eval_steps_per_second": 0.719, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 0.2032381296157837, |
|
"learning_rate": 0.00030303030303030303, |
|
"loss": 1.158, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"eval_loss": 1.7255425453186035, |
|
"eval_runtime": 11.0733, |
|
"eval_samples_per_second": 11.198, |
|
"eval_steps_per_second": 0.722, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.35135135135135137, |
|
"grad_norm": 0.2133413404226303, |
|
"learning_rate": 0.0002909090909090909, |
|
"loss": 1.1137, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.35135135135135137, |
|
"eval_loss": 1.6880252361297607, |
|
"eval_runtime": 11.2007, |
|
"eval_samples_per_second": 11.071, |
|
"eval_steps_per_second": 0.714, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"grad_norm": 0.20175401866436005, |
|
"learning_rate": 0.0002787878787878788, |
|
"loss": 1.1059, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"eval_loss": 1.6500831842422485, |
|
"eval_runtime": 11.1367, |
|
"eval_samples_per_second": 11.134, |
|
"eval_steps_per_second": 0.718, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 0.22595511376857758, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 1.0483, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"eval_loss": 1.6288588047027588, |
|
"eval_runtime": 11.1914, |
|
"eval_samples_per_second": 11.08, |
|
"eval_steps_per_second": 0.715, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 0.17468485236167908, |
|
"learning_rate": 0.00025454545454545456, |
|
"loss": 1.0584, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"eval_loss": 1.6247642040252686, |
|
"eval_runtime": 11.1035, |
|
"eval_samples_per_second": 11.168, |
|
"eval_steps_per_second": 0.72, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.4594594594594595, |
|
"grad_norm": 0.1654416024684906, |
|
"learning_rate": 0.00024242424242424245, |
|
"loss": 1.0402, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.4594594594594595, |
|
"eval_loss": 1.6316722631454468, |
|
"eval_runtime": 11.2065, |
|
"eval_samples_per_second": 11.065, |
|
"eval_steps_per_second": 0.714, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"grad_norm": 0.10361829400062561, |
|
"learning_rate": 0.00023030303030303033, |
|
"loss": 1.0301, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"eval_loss": 1.6415338516235352, |
|
"eval_runtime": 11.18, |
|
"eval_samples_per_second": 11.091, |
|
"eval_steps_per_second": 0.716, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5135135135135135, |
|
"grad_norm": 0.09156349301338196, |
|
"learning_rate": 0.00021818181818181818, |
|
"loss": 1.0183, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.5135135135135135, |
|
"eval_loss": 1.6544169187545776, |
|
"eval_runtime": 11.1626, |
|
"eval_samples_per_second": 11.109, |
|
"eval_steps_per_second": 0.717, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 0.087005615234375, |
|
"learning_rate": 0.00020606060606060607, |
|
"loss": 1.028, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"eval_loss": 1.6620415449142456, |
|
"eval_runtime": 11.2393, |
|
"eval_samples_per_second": 11.033, |
|
"eval_steps_per_second": 0.712, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5675675675675675, |
|
"grad_norm": 0.09235216677188873, |
|
"learning_rate": 0.00019393939393939395, |
|
"loss": 0.9825, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.5675675675675675, |
|
"eval_loss": 1.6642476320266724, |
|
"eval_runtime": 11.2278, |
|
"eval_samples_per_second": 11.044, |
|
"eval_steps_per_second": 0.713, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.5945945945945946, |
|
"grad_norm": 0.0915454775094986, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.9991, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.5945945945945946, |
|
"eval_loss": 1.6625572443008423, |
|
"eval_runtime": 11.1424, |
|
"eval_samples_per_second": 11.129, |
|
"eval_steps_per_second": 0.718, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6216216216216216, |
|
"grad_norm": 0.09213992953300476, |
|
"learning_rate": 0.00016969696969696972, |
|
"loss": 1.0211, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.6216216216216216, |
|
"eval_loss": 1.6593235731124878, |
|
"eval_runtime": 11.1978, |
|
"eval_samples_per_second": 11.074, |
|
"eval_steps_per_second": 0.714, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"grad_norm": 0.0854020044207573, |
|
"learning_rate": 0.00015757575757575757, |
|
"loss": 1.0291, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"eval_loss": 1.6526458263397217, |
|
"eval_runtime": 11.2323, |
|
"eval_samples_per_second": 11.04, |
|
"eval_steps_per_second": 0.712, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 0.08045388758182526, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.9887, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"eval_loss": 1.6451815366744995, |
|
"eval_runtime": 11.1905, |
|
"eval_samples_per_second": 11.081, |
|
"eval_steps_per_second": 0.715, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.7027027027027027, |
|
"grad_norm": 0.07576093822717667, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.0044, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.7027027027027027, |
|
"eval_loss": 1.6377238035202026, |
|
"eval_runtime": 11.1714, |
|
"eval_samples_per_second": 11.1, |
|
"eval_steps_per_second": 0.716, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.7297297297297297, |
|
"grad_norm": 0.07311829924583435, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 0.9772, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.7297297297297297, |
|
"eval_loss": 1.6314424276351929, |
|
"eval_runtime": 11.1489, |
|
"eval_samples_per_second": 11.122, |
|
"eval_steps_per_second": 0.718, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.7567567567567568, |
|
"grad_norm": 0.07776332646608353, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.9902, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.7567567567567568, |
|
"eval_loss": 1.625641942024231, |
|
"eval_runtime": 11.1261, |
|
"eval_samples_per_second": 11.145, |
|
"eval_steps_per_second": 0.719, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.7837837837837838, |
|
"grad_norm": 0.07536856085062027, |
|
"learning_rate": 9.696969696969698e-05, |
|
"loss": 0.9902, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.7837837837837838, |
|
"eval_loss": 1.6233930587768555, |
|
"eval_runtime": 11.1754, |
|
"eval_samples_per_second": 11.096, |
|
"eval_steps_per_second": 0.716, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 0.07941398024559021, |
|
"learning_rate": 8.484848484848486e-05, |
|
"loss": 0.9784, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"eval_loss": 1.6244161128997803, |
|
"eval_runtime": 11.2511, |
|
"eval_samples_per_second": 11.021, |
|
"eval_steps_per_second": 0.711, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8378378378378378, |
|
"grad_norm": 0.07617861032485962, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 1.0064, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.8378378378378378, |
|
"eval_loss": 1.62636399269104, |
|
"eval_runtime": 11.1098, |
|
"eval_samples_per_second": 11.161, |
|
"eval_steps_per_second": 0.72, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 0.06959453225135803, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 0.9764, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"eval_loss": 1.6286530494689941, |
|
"eval_runtime": 11.2497, |
|
"eval_samples_per_second": 11.023, |
|
"eval_steps_per_second": 0.711, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.8918918918918919, |
|
"grad_norm": 0.07171300053596497, |
|
"learning_rate": 4.848484848484849e-05, |
|
"loss": 0.9921, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.8918918918918919, |
|
"eval_loss": 1.630918264389038, |
|
"eval_runtime": 11.1794, |
|
"eval_samples_per_second": 11.092, |
|
"eval_steps_per_second": 0.716, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"grad_norm": 0.07644116133451462, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.9716, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"eval_loss": 1.63330078125, |
|
"eval_runtime": 11.1352, |
|
"eval_samples_per_second": 11.136, |
|
"eval_steps_per_second": 0.718, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 0.07242273539304733, |
|
"learning_rate": 2.4242424242424244e-05, |
|
"loss": 0.9781, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"eval_loss": 1.634429931640625, |
|
"eval_runtime": 11.203, |
|
"eval_samples_per_second": 11.069, |
|
"eval_steps_per_second": 0.714, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"grad_norm": 0.069486603140831, |
|
"learning_rate": 1.2121212121212122e-05, |
|
"loss": 0.9592, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"eval_loss": 1.6349563598632812, |
|
"eval_runtime": 11.1077, |
|
"eval_samples_per_second": 11.163, |
|
"eval_steps_per_second": 0.72, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.07558272778987885, |
|
"learning_rate": 0.0, |
|
"loss": 0.9368, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.6352812051773071, |
|
"eval_runtime": 11.1164, |
|
"eval_samples_per_second": 11.155, |
|
"eval_steps_per_second": 0.72, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 37, |
|
"total_flos": 1.3641878835560448e+16, |
|
"train_loss": 1.5526673584371, |
|
"train_runtime": 758.4004, |
|
"train_samples_per_second": 1.552, |
|
"train_steps_per_second": 0.049 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 37, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3641878835560448e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|