|
{ |
|
"best_metric": 0.7003, |
|
"best_model_checkpoint": "runs/legis-tucano-2b-valid/checkpoint-710", |
|
"epoch": 0.9994065281899109, |
|
"eval_steps": 10, |
|
"global_step": 842, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011869436201780416, |
|
"grad_norm": 1.3277966052420052, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 1.3563, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.011869436201780416, |
|
"eval_loss": 1.2333489656448364, |
|
"eval_runtime": 7.1731, |
|
"eval_samples_per_second": 7.249, |
|
"eval_steps_per_second": 0.976, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02373887240356083, |
|
"grad_norm": 0.2988625315142482, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 1.2538, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02373887240356083, |
|
"eval_loss": 1.117323398590088, |
|
"eval_runtime": 7.1487, |
|
"eval_samples_per_second": 7.274, |
|
"eval_steps_per_second": 0.979, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03560830860534125, |
|
"grad_norm": 0.15105150205016427, |
|
"learning_rate": 7.058823529411765e-05, |
|
"loss": 1.1462, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03560830860534125, |
|
"eval_loss": 1.0231214761734009, |
|
"eval_runtime": 7.1358, |
|
"eval_samples_per_second": 7.287, |
|
"eval_steps_per_second": 0.981, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04747774480712166, |
|
"grad_norm": 0.07095151302909826, |
|
"learning_rate": 9.411764705882353e-05, |
|
"loss": 1.0635, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04747774480712166, |
|
"eval_loss": 0.9480358958244324, |
|
"eval_runtime": 7.1596, |
|
"eval_samples_per_second": 7.263, |
|
"eval_steps_per_second": 0.978, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05934718100890208, |
|
"grad_norm": 0.0510771961177552, |
|
"learning_rate": 0.00011764705882352942, |
|
"loss": 0.9848, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05934718100890208, |
|
"eval_loss": 0.8925089240074158, |
|
"eval_runtime": 7.1518, |
|
"eval_samples_per_second": 7.271, |
|
"eval_steps_per_second": 0.979, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0712166172106825, |
|
"grad_norm": 0.040539214461615646, |
|
"learning_rate": 0.0001411764705882353, |
|
"loss": 0.9513, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0712166172106825, |
|
"eval_loss": 0.8546838164329529, |
|
"eval_runtime": 7.1461, |
|
"eval_samples_per_second": 7.277, |
|
"eval_steps_per_second": 0.98, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0830860534124629, |
|
"grad_norm": 0.038791558909017074, |
|
"learning_rate": 0.0001647058823529412, |
|
"loss": 0.9142, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0830860534124629, |
|
"eval_loss": 0.8263636231422424, |
|
"eval_runtime": 7.3085, |
|
"eval_samples_per_second": 7.115, |
|
"eval_steps_per_second": 0.958, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09495548961424333, |
|
"grad_norm": 0.04169274461251293, |
|
"learning_rate": 0.00018823529411764707, |
|
"loss": 0.8854, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09495548961424333, |
|
"eval_loss": 0.8020428419113159, |
|
"eval_runtime": 7.247, |
|
"eval_samples_per_second": 7.175, |
|
"eval_steps_per_second": 0.966, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10682492581602374, |
|
"grad_norm": 0.043015312738591396, |
|
"learning_rate": 0.00019997847206287532, |
|
"loss": 0.8632, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10682492581602374, |
|
"eval_loss": 0.7831454873085022, |
|
"eval_runtime": 7.2244, |
|
"eval_samples_per_second": 7.198, |
|
"eval_steps_per_second": 0.969, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11869436201780416, |
|
"grad_norm": 0.05736189956437236, |
|
"learning_rate": 0.00019980630417613612, |
|
"loss": 0.8669, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11869436201780416, |
|
"eval_loss": 0.7694521546363831, |
|
"eval_runtime": 7.2277, |
|
"eval_samples_per_second": 7.194, |
|
"eval_steps_per_second": 0.968, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.13056379821958458, |
|
"grad_norm": 0.04913014792038086, |
|
"learning_rate": 0.0001994622648842964, |
|
"loss": 0.8314, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.13056379821958458, |
|
"eval_loss": 0.7584220170974731, |
|
"eval_runtime": 7.2488, |
|
"eval_samples_per_second": 7.174, |
|
"eval_steps_per_second": 0.966, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.142433234421365, |
|
"grad_norm": 0.04446480334718679, |
|
"learning_rate": 0.00019894694664007728, |
|
"loss": 0.8223, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.142433234421365, |
|
"eval_loss": 0.749190092086792, |
|
"eval_runtime": 7.2447, |
|
"eval_samples_per_second": 7.178, |
|
"eval_steps_per_second": 0.966, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1543026706231454, |
|
"grad_norm": 0.05216966997945006, |
|
"learning_rate": 0.00019826123684704952, |
|
"loss": 0.8141, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1543026706231454, |
|
"eval_loss": 0.7411435842514038, |
|
"eval_runtime": 7.2499, |
|
"eval_samples_per_second": 7.173, |
|
"eval_steps_per_second": 0.966, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1661721068249258, |
|
"grad_norm": 0.04233605582577254, |
|
"learning_rate": 0.00019740631633148045, |
|
"loss": 0.8023, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1661721068249258, |
|
"eval_loss": 0.7342764139175415, |
|
"eval_runtime": 7.2264, |
|
"eval_samples_per_second": 7.196, |
|
"eval_steps_per_second": 0.969, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17804154302670624, |
|
"grad_norm": 0.04704056707854844, |
|
"learning_rate": 0.00019638365730889265, |
|
"loss": 0.7895, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.17804154302670624, |
|
"eval_loss": 0.7279341220855713, |
|
"eval_runtime": 7.2397, |
|
"eval_samples_per_second": 7.183, |
|
"eval_steps_per_second": 0.967, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18991097922848665, |
|
"grad_norm": 0.044026531833322384, |
|
"learning_rate": 0.00019519502084883582, |
|
"loss": 0.7916, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.18991097922848665, |
|
"eval_loss": 0.7229446768760681, |
|
"eval_runtime": 7.2583, |
|
"eval_samples_per_second": 7.164, |
|
"eval_steps_per_second": 0.964, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20178041543026706, |
|
"grad_norm": 0.05191891103370369, |
|
"learning_rate": 0.00019384245384223767, |
|
"loss": 0.791, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.20178041543026706, |
|
"eval_loss": 0.7180152535438538, |
|
"eval_runtime": 7.2578, |
|
"eval_samples_per_second": 7.165, |
|
"eval_steps_per_second": 0.964, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21364985163204747, |
|
"grad_norm": 0.05572444654938424, |
|
"learning_rate": 0.00019232828547655615, |
|
"loss": 0.7955, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.21364985163204747, |
|
"eval_loss": 0.7139907479286194, |
|
"eval_runtime": 7.2429, |
|
"eval_samples_per_second": 7.179, |
|
"eval_steps_per_second": 0.966, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.22551928783382788, |
|
"grad_norm": 0.0516837327891167, |
|
"learning_rate": 0.00019065512322480332, |
|
"loss": 0.7776, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.22551928783382788, |
|
"eval_loss": 0.7107158899307251, |
|
"eval_runtime": 7.2537, |
|
"eval_samples_per_second": 7.169, |
|
"eval_steps_per_second": 0.965, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23738872403560832, |
|
"grad_norm": 0.050846230869752186, |
|
"learning_rate": 0.00018882584835534737, |
|
"loss": 0.785, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.23738872403560832, |
|
"eval_loss": 0.7068641185760498, |
|
"eval_runtime": 7.2578, |
|
"eval_samples_per_second": 7.165, |
|
"eval_steps_per_second": 0.964, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.24925816023738873, |
|
"grad_norm": 0.05343887989356088, |
|
"learning_rate": 0.00018684361097022568, |
|
"loss": 0.7825, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.24925816023738873, |
|
"eval_loss": 0.703826904296875, |
|
"eval_runtime": 7.2465, |
|
"eval_samples_per_second": 7.176, |
|
"eval_steps_per_second": 0.966, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.26112759643916916, |
|
"grad_norm": 0.05242956764694665, |
|
"learning_rate": 0.00018471182458051287, |
|
"loss": 0.7539, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.26112759643916916, |
|
"eval_loss": 0.7010646462440491, |
|
"eval_runtime": 7.2455, |
|
"eval_samples_per_second": 7.177, |
|
"eval_steps_per_second": 0.966, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.27299703264094954, |
|
"grad_norm": 0.051064873049660696, |
|
"learning_rate": 0.00018243416022808547, |
|
"loss": 0.7785, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.27299703264094954, |
|
"eval_loss": 0.6979869604110718, |
|
"eval_runtime": 7.2598, |
|
"eval_samples_per_second": 7.163, |
|
"eval_steps_per_second": 0.964, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28486646884273, |
|
"grad_norm": 0.046329966712940596, |
|
"learning_rate": 0.00018001454016390586, |
|
"loss": 0.7792, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.28486646884273, |
|
"eval_loss": 0.6954675912857056, |
|
"eval_runtime": 7.2628, |
|
"eval_samples_per_second": 7.16, |
|
"eval_steps_per_second": 0.964, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"grad_norm": 0.05390500282652213, |
|
"learning_rate": 0.00017745713109371139, |
|
"loss": 0.753, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.29673590504451036, |
|
"eval_loss": 0.6930329203605652, |
|
"eval_runtime": 7.2472, |
|
"eval_samples_per_second": 7.175, |
|
"eval_steps_per_second": 0.966, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3086053412462908, |
|
"grad_norm": 0.043546536548247354, |
|
"learning_rate": 0.00017476633700274068, |
|
"loss": 0.765, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3086053412462908, |
|
"eval_loss": 0.6907065510749817, |
|
"eval_runtime": 7.2525, |
|
"eval_samples_per_second": 7.17, |
|
"eval_steps_per_second": 0.965, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.32047477744807124, |
|
"grad_norm": 0.06771031218563213, |
|
"learning_rate": 0.00017194679157185255, |
|
"loss": 0.7464, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.32047477744807124, |
|
"eval_loss": 0.6883173584938049, |
|
"eval_runtime": 7.1658, |
|
"eval_samples_per_second": 7.257, |
|
"eval_steps_per_second": 0.977, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.3323442136498516, |
|
"grad_norm": 0.04731592266489432, |
|
"learning_rate": 0.00016900335019809782, |
|
"loss": 0.7649, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3323442136498516, |
|
"eval_loss": 0.6865535378456116, |
|
"eval_runtime": 7.1591, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.34421364985163205, |
|
"grad_norm": 0.05422567227576833, |
|
"learning_rate": 0.00016594108163348493, |
|
"loss": 0.764, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.34421364985163205, |
|
"eval_loss": 0.6839883327484131, |
|
"eval_runtime": 7.1803, |
|
"eval_samples_per_second": 7.242, |
|
"eval_steps_per_second": 0.975, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3560830860534125, |
|
"grad_norm": 0.04716111383242718, |
|
"learning_rate": 0.0001627652592563373, |
|
"loss": 0.7457, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3560830860534125, |
|
"eval_loss": 0.6814995408058167, |
|
"eval_runtime": 7.1545, |
|
"eval_samples_per_second": 7.268, |
|
"eval_steps_per_second": 0.978, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36795252225519287, |
|
"grad_norm": 0.04833237070306901, |
|
"learning_rate": 0.00015948135199027474, |
|
"loss": 0.7427, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.36795252225519287, |
|
"eval_loss": 0.6795143485069275, |
|
"eval_runtime": 7.164, |
|
"eval_samples_per_second": 7.258, |
|
"eval_steps_per_second": 0.977, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3798219584569733, |
|
"grad_norm": 0.04790062682337991, |
|
"learning_rate": 0.00015609501488645554, |
|
"loss": 0.7564, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3798219584569733, |
|
"eval_loss": 0.6774563193321228, |
|
"eval_runtime": 7.1529, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.979, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3916913946587537, |
|
"grad_norm": 0.04849257718202548, |
|
"learning_rate": 0.00015261207938529808, |
|
"loss": 0.7352, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3916913946587537, |
|
"eval_loss": 0.676394522190094, |
|
"eval_runtime": 7.1522, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.979, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4035608308605341, |
|
"grad_norm": 0.0449523115474861, |
|
"learning_rate": 0.00014903854327445116, |
|
"loss": 0.7471, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4035608308605341, |
|
"eval_loss": 0.6733577251434326, |
|
"eval_runtime": 7.1516, |
|
"eval_samples_per_second": 7.271, |
|
"eval_steps_per_second": 0.979, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.41543026706231456, |
|
"grad_norm": 0.044399706693004984, |
|
"learning_rate": 0.00014538056036030622, |
|
"loss": 0.7408, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.41543026706231456, |
|
"eval_loss": 0.6722955107688904, |
|
"eval_runtime": 7.1523, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.979, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.42729970326409494, |
|
"grad_norm": 0.04749195230072245, |
|
"learning_rate": 0.00014164442987083762, |
|
"loss": 0.7493, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.42729970326409494, |
|
"eval_loss": 0.6708287000656128, |
|
"eval_runtime": 7.158, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4391691394658754, |
|
"grad_norm": 0.04622607097065686, |
|
"learning_rate": 0.0001378365856080198, |
|
"loss": 0.7375, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.4391691394658754, |
|
"eval_loss": 0.6693353056907654, |
|
"eval_runtime": 7.1529, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.979, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.45103857566765576, |
|
"grad_norm": 0.04779103392401475, |
|
"learning_rate": 0.00013396358486850103, |
|
"loss": 0.7469, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.45103857566765576, |
|
"eval_loss": 0.6675475835800171, |
|
"eval_runtime": 7.1609, |
|
"eval_samples_per_second": 7.262, |
|
"eval_steps_per_second": 0.978, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4629080118694362, |
|
"grad_norm": 0.04367335948535579, |
|
"learning_rate": 0.0001300320971516136, |
|
"loss": 0.7388, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4629080118694362, |
|
"eval_loss": 0.6652542948722839, |
|
"eval_runtime": 7.1535, |
|
"eval_samples_per_second": 7.269, |
|
"eval_steps_per_second": 0.979, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.47477744807121663, |
|
"grad_norm": 0.04906513653224005, |
|
"learning_rate": 0.0001260488926741651, |
|
"loss": 0.7434, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.47477744807121663, |
|
"eval_loss": 0.6640903949737549, |
|
"eval_runtime": 7.1588, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.486646884272997, |
|
"grad_norm": 0.04507001139303972, |
|
"learning_rate": 0.00012202083071178938, |
|
"loss": 0.7271, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.486646884272997, |
|
"eval_loss": 0.6632689237594604, |
|
"eval_runtime": 7.1544, |
|
"eval_samples_per_second": 7.268, |
|
"eval_steps_per_second": 0.978, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.49851632047477745, |
|
"grad_norm": 0.054746961897722525, |
|
"learning_rate": 0.00011795484778693382, |
|
"loss": 0.731, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.49851632047477745, |
|
"eval_loss": 0.6615473031997681, |
|
"eval_runtime": 7.1789, |
|
"eval_samples_per_second": 7.243, |
|
"eval_steps_per_second": 0.975, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5103857566765578, |
|
"grad_norm": 0.04685925852904185, |
|
"learning_rate": 0.00011385794572382357, |
|
"loss": 0.7571, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5103857566765578, |
|
"eval_loss": 0.660754919052124, |
|
"eval_runtime": 7.1679, |
|
"eval_samples_per_second": 7.255, |
|
"eval_steps_per_second": 0.977, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5222551928783383, |
|
"grad_norm": 0.05188129116000765, |
|
"learning_rate": 0.00010973717959097327, |
|
"loss": 0.7355, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5222551928783383, |
|
"eval_loss": 0.6598291993141174, |
|
"eval_runtime": 7.1682, |
|
"eval_samples_per_second": 7.254, |
|
"eval_steps_per_second": 0.977, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5341246290801187, |
|
"grad_norm": 0.04549793036088958, |
|
"learning_rate": 0.00010559964555200889, |
|
"loss": 0.7379, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5341246290801187, |
|
"eval_loss": 0.658793568611145, |
|
"eval_runtime": 7.1553, |
|
"eval_samples_per_second": 7.267, |
|
"eval_steps_per_second": 0.978, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5459940652818991, |
|
"grad_norm": 0.05162604134236413, |
|
"learning_rate": 0.0001014524686457223, |
|
"loss": 0.7363, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5459940652818991, |
|
"eval_loss": 0.6571336984634399, |
|
"eval_runtime": 7.156, |
|
"eval_samples_per_second": 7.267, |
|
"eval_steps_per_second": 0.978, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5578635014836796, |
|
"grad_norm": 0.04783457254038487, |
|
"learning_rate": 9.730279051640112e-05, |
|
"loss": 0.7177, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5578635014836796, |
|
"eval_loss": 0.655822217464447, |
|
"eval_runtime": 7.1579, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.56973293768546, |
|
"grad_norm": 0.04436373760940458, |
|
"learning_rate": 9.315775711556298e-05, |
|
"loss": 0.7334, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.56973293768546, |
|
"eval_loss": 0.6540331244468689, |
|
"eval_runtime": 7.1522, |
|
"eval_samples_per_second": 7.271, |
|
"eval_steps_per_second": 0.979, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5816023738872403, |
|
"grad_norm": 0.04519231140145815, |
|
"learning_rate": 8.902450639627288e-05, |
|
"loss": 0.737, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5816023738872403, |
|
"eval_loss": 0.6534115672111511, |
|
"eval_runtime": 7.156, |
|
"eval_samples_per_second": 7.267, |
|
"eval_steps_per_second": 0.978, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5934718100890207, |
|
"grad_norm": 0.04247565082967707, |
|
"learning_rate": 8.491015602123368e-05, |
|
"loss": 0.7413, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5934718100890207, |
|
"eval_loss": 0.6529149413108826, |
|
"eval_runtime": 7.1578, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6053412462908012, |
|
"grad_norm": 0.048070529053236284, |
|
"learning_rate": 8.082179110581838e-05, |
|
"loss": 0.7141, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6053412462908012, |
|
"eval_loss": 0.651964545249939, |
|
"eval_runtime": 7.1573, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6172106824925816, |
|
"grad_norm": 0.04071752578010737, |
|
"learning_rate": 7.676645201714983e-05, |
|
"loss": 0.7248, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6172106824925816, |
|
"eval_loss": 0.6511014103889465, |
|
"eval_runtime": 7.1566, |
|
"eval_samples_per_second": 7.266, |
|
"eval_steps_per_second": 0.978, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.629080118694362, |
|
"grad_norm": 0.04168337358828772, |
|
"learning_rate": 7.27511222502395e-05, |
|
"loss": 0.725, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.629080118694362, |
|
"eval_loss": 0.6503429412841797, |
|
"eval_runtime": 7.1538, |
|
"eval_samples_per_second": 7.269, |
|
"eval_steps_per_second": 0.979, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6409495548961425, |
|
"grad_norm": 0.05260277652628936, |
|
"learning_rate": 6.878271640206281e-05, |
|
"loss": 0.7299, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6409495548961425, |
|
"eval_loss": 0.6499541401863098, |
|
"eval_runtime": 7.1571, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6528189910979229, |
|
"grad_norm": 0.04558705692362712, |
|
"learning_rate": 6.486806826428016e-05, |
|
"loss": 0.7381, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6528189910979229, |
|
"eval_loss": 0.6490299701690674, |
|
"eval_runtime": 7.1586, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6646884272997032, |
|
"grad_norm": 0.046779386203552986, |
|
"learning_rate": 6.101391905510889e-05, |
|
"loss": 0.7052, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6646884272997032, |
|
"eval_loss": 0.6482182741165161, |
|
"eval_runtime": 7.1587, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6765578635014837, |
|
"grad_norm": 0.04433666961158006, |
|
"learning_rate": 5.72269058106111e-05, |
|
"loss": 0.7267, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6765578635014837, |
|
"eval_loss": 0.6474419832229614, |
|
"eval_runtime": 7.1585, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6884272997032641, |
|
"grad_norm": 0.040969171268037584, |
|
"learning_rate": 5.351354995538859e-05, |
|
"loss": 0.7217, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6884272997032641, |
|
"eval_loss": 0.6467857956886292, |
|
"eval_runtime": 7.1605, |
|
"eval_samples_per_second": 7.262, |
|
"eval_steps_per_second": 0.978, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7002967359050445, |
|
"grad_norm": 0.04326822029385217, |
|
"learning_rate": 4.988024607236619e-05, |
|
"loss": 0.7225, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7002967359050445, |
|
"eval_loss": 0.6460662484169006, |
|
"eval_runtime": 7.1561, |
|
"eval_samples_per_second": 7.267, |
|
"eval_steps_per_second": 0.978, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.712166172106825, |
|
"grad_norm": 0.04151648905301352, |
|
"learning_rate": 4.633325089100289e-05, |
|
"loss": 0.7305, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.712166172106825, |
|
"eval_loss": 0.6456657648086548, |
|
"eval_runtime": 7.1693, |
|
"eval_samples_per_second": 7.253, |
|
"eval_steps_per_second": 0.976, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7240356083086054, |
|
"grad_norm": 0.043211271741518106, |
|
"learning_rate": 4.287867251289348e-05, |
|
"loss": 0.7202, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7240356083086054, |
|
"eval_loss": 0.6450461745262146, |
|
"eval_runtime": 7.1607, |
|
"eval_samples_per_second": 7.262, |
|
"eval_steps_per_second": 0.978, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7359050445103857, |
|
"grad_norm": 0.043394130216544674, |
|
"learning_rate": 3.952245989331466e-05, |
|
"loss": 0.7182, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7359050445103857, |
|
"eval_loss": 0.6443552374839783, |
|
"eval_runtime": 7.1601, |
|
"eval_samples_per_second": 7.262, |
|
"eval_steps_per_second": 0.978, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7477744807121661, |
|
"grad_norm": 0.0440813623198057, |
|
"learning_rate": 3.627039259682899e-05, |
|
"loss": 0.7269, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7477744807121661, |
|
"eval_loss": 0.6440042853355408, |
|
"eval_runtime": 7.1528, |
|
"eval_samples_per_second": 7.27, |
|
"eval_steps_per_second": 0.979, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7596439169139466, |
|
"grad_norm": 0.04196493736245454, |
|
"learning_rate": 3.312807084458831e-05, |
|
"loss": 0.7321, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7596439169139466, |
|
"eval_loss": 0.6436744332313538, |
|
"eval_runtime": 7.157, |
|
"eval_samples_per_second": 7.266, |
|
"eval_steps_per_second": 0.978, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.771513353115727, |
|
"grad_norm": 0.04409502511550107, |
|
"learning_rate": 3.0100905870475006e-05, |
|
"loss": 0.7127, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.771513353115727, |
|
"eval_loss": 0.6433111429214478, |
|
"eval_runtime": 7.1488, |
|
"eval_samples_per_second": 7.274, |
|
"eval_steps_per_second": 0.979, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7833827893175074, |
|
"grad_norm": 0.04613176858045456, |
|
"learning_rate": 2.7194110602689026e-05, |
|
"loss": 0.7084, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7833827893175074, |
|
"eval_loss": 0.6427409648895264, |
|
"eval_runtime": 7.1651, |
|
"eval_samples_per_second": 7.257, |
|
"eval_steps_per_second": 0.977, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7952522255192879, |
|
"grad_norm": 0.04107118214802092, |
|
"learning_rate": 2.4412690686827e-05, |
|
"loss": 0.7318, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7952522255192879, |
|
"eval_loss": 0.6424255967140198, |
|
"eval_runtime": 7.151, |
|
"eval_samples_per_second": 7.272, |
|
"eval_steps_per_second": 0.979, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8071216617210683, |
|
"grad_norm": 0.04076823498258544, |
|
"learning_rate": 2.1761435865912296e-05, |
|
"loss": 0.7111, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8071216617210683, |
|
"eval_loss": 0.6420266032218933, |
|
"eval_runtime": 7.1562, |
|
"eval_samples_per_second": 7.266, |
|
"eval_steps_per_second": 0.978, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8189910979228486, |
|
"grad_norm": 0.0402931102905212, |
|
"learning_rate": 1.9244911732219918e-05, |
|
"loss": 0.7173, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8189910979228486, |
|
"eval_loss": 0.6417113542556763, |
|
"eval_runtime": 7.1559, |
|
"eval_samples_per_second": 7.267, |
|
"eval_steps_per_second": 0.978, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8308605341246291, |
|
"grad_norm": 0.040268557762468446, |
|
"learning_rate": 1.6867451865100414e-05, |
|
"loss": 0.7187, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8308605341246291, |
|
"eval_loss": 0.6414252519607544, |
|
"eval_runtime": 7.1614, |
|
"eval_samples_per_second": 7.261, |
|
"eval_steps_per_second": 0.977, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8427299703264095, |
|
"grad_norm": 0.03910974935669944, |
|
"learning_rate": 1.4633150368341153e-05, |
|
"loss": 0.7003, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8427299703264095, |
|
"eval_loss": 0.6411222815513611, |
|
"eval_runtime": 7.158, |
|
"eval_samples_per_second": 7.265, |
|
"eval_steps_per_second": 0.978, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8545994065281899, |
|
"grad_norm": 0.041327420976484716, |
|
"learning_rate": 1.2545854819916646e-05, |
|
"loss": 0.7146, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8545994065281899, |
|
"eval_loss": 0.6409789323806763, |
|
"eval_runtime": 7.1632, |
|
"eval_samples_per_second": 7.259, |
|
"eval_steps_per_second": 0.977, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8664688427299704, |
|
"grad_norm": 0.0415664844879509, |
|
"learning_rate": 1.0609159646268506e-05, |
|
"loss": 0.7326, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8664688427299704, |
|
"eval_loss": 0.640900194644928, |
|
"eval_runtime": 7.1625, |
|
"eval_samples_per_second": 7.26, |
|
"eval_steps_per_second": 0.977, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.8783382789317508, |
|
"grad_norm": 0.040948908512356046, |
|
"learning_rate": 8.82639993252482e-06, |
|
"loss": 0.735, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8783382789317508, |
|
"eval_loss": 0.6406245827674866, |
|
"eval_runtime": 7.1641, |
|
"eval_samples_per_second": 7.258, |
|
"eval_steps_per_second": 0.977, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.8902077151335311, |
|
"grad_norm": 0.039353242405877606, |
|
"learning_rate": 7.20064567931813e-06, |
|
"loss": 0.7072, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8902077151335311, |
|
"eval_loss": 0.6405530571937561, |
|
"eval_runtime": 7.1654, |
|
"eval_samples_per_second": 7.257, |
|
"eval_steps_per_second": 0.977, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9020771513353115, |
|
"grad_norm": 0.040045103519338765, |
|
"learning_rate": 5.734696516092253e-06, |
|
"loss": 0.7242, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9020771513353115, |
|
"eval_loss": 0.6403719186782837, |
|
"eval_runtime": 7.1641, |
|
"eval_samples_per_second": 7.258, |
|
"eval_steps_per_second": 0.977, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.913946587537092, |
|
"grad_norm": 0.03936062921235335, |
|
"learning_rate": 4.431076880001439e-06, |
|
"loss": 0.725, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.913946587537092, |
|
"eval_loss": 0.6402689218521118, |
|
"eval_runtime": 7.1586, |
|
"eval_samples_per_second": 7.264, |
|
"eval_steps_per_second": 0.978, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9258160237388724, |
|
"grad_norm": 0.038795606374443216, |
|
"learning_rate": 3.292031668704398e-06, |
|
"loss": 0.7051, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9258160237388724, |
|
"eval_loss": 0.6402180790901184, |
|
"eval_runtime": 7.1753, |
|
"eval_samples_per_second": 7.247, |
|
"eval_steps_per_second": 0.976, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9376854599406528, |
|
"grad_norm": 0.04044906534894689, |
|
"learning_rate": 2.3195223745392737e-06, |
|
"loss": 0.7187, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9376854599406528, |
|
"eval_loss": 0.6401559114456177, |
|
"eval_runtime": 7.178, |
|
"eval_samples_per_second": 7.244, |
|
"eval_steps_per_second": 0.975, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9495548961424333, |
|
"grad_norm": 0.03933803718026143, |
|
"learning_rate": 1.5152237067365239e-06, |
|
"loss": 0.7185, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9495548961424333, |
|
"eval_loss": 0.6400689482688904, |
|
"eval_runtime": 7.1614, |
|
"eval_samples_per_second": 7.261, |
|
"eval_steps_per_second": 0.977, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9614243323442137, |
|
"grad_norm": 0.03894023495076019, |
|
"learning_rate": 8.805207074865873e-07, |
|
"loss": 0.7213, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9614243323442137, |
|
"eval_loss": 0.6400617957115173, |
|
"eval_runtime": 7.1732, |
|
"eval_samples_per_second": 7.249, |
|
"eval_steps_per_second": 0.976, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.973293768545994, |
|
"grad_norm": 0.03930788498385229, |
|
"learning_rate": 4.165063668285396e-07, |
|
"loss": 0.7192, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.973293768545994, |
|
"eval_loss": 0.6400607228279114, |
|
"eval_runtime": 7.1626, |
|
"eval_samples_per_second": 7.26, |
|
"eval_steps_per_second": 0.977, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.9851632047477745, |
|
"grad_norm": 0.03939532170183159, |
|
"learning_rate": 1.2397974046707283e-07, |
|
"loss": 0.7214, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9851632047477745, |
|
"eval_loss": 0.6400620341300964, |
|
"eval_runtime": 7.1644, |
|
"eval_samples_per_second": 7.258, |
|
"eval_steps_per_second": 0.977, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.9970326409495549, |
|
"grad_norm": 0.04057041649987638, |
|
"learning_rate": 3.444573758937253e-09, |
|
"loss": 0.7317, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9970326409495549, |
|
"eval_loss": 0.640075147151947, |
|
"eval_runtime": 7.1623, |
|
"eval_samples_per_second": 7.26, |
|
"eval_steps_per_second": 0.977, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9994065281899109, |
|
"step": 842, |
|
"total_flos": 6.615025481416704e+16, |
|
"train_loss": 0.6960575034103031, |
|
"train_runtime": 22091.905, |
|
"train_samples_per_second": 2.44, |
|
"train_steps_per_second": 0.038 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 842, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.615025481416704e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|