|
{ |
|
"best_metric": 3.1603705883026123, |
|
"best_model_checkpoint": "./models/full-finetuning/LLaMmlein_120M/checkpoint-58000", |
|
"epoch": 1.0, |
|
"eval_steps": 1000, |
|
"global_step": 59835, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008356313194618534, |
|
"grad_norm": 40.14425277709961, |
|
"learning_rate": 4.96e-05, |
|
"loss": 4.8477, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.016712626389237067, |
|
"grad_norm": 22.107275009155273, |
|
"learning_rate": 4.958203421252212e-05, |
|
"loss": 4.4466, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.016712626389237067, |
|
"eval_loss": 4.3118462562561035, |
|
"eval_runtime": 22.0404, |
|
"eval_samples_per_second": 202.446, |
|
"eval_steps_per_second": 25.317, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.025068939583855605, |
|
"grad_norm": 19.609399795532227, |
|
"learning_rate": 4.916069773320974e-05, |
|
"loss": 4.2986, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"grad_norm": 18.64438247680664, |
|
"learning_rate": 4.873936125389736e-05, |
|
"loss": 4.2295, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"eval_loss": 4.121812343597412, |
|
"eval_runtime": 22.1428, |
|
"eval_samples_per_second": 201.51, |
|
"eval_steps_per_second": 25.2, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.04178156597309267, |
|
"grad_norm": 19.098369598388672, |
|
"learning_rate": 4.8318024774584986e-05, |
|
"loss": 4.1165, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.05013787916771121, |
|
"grad_norm": 17.83785629272461, |
|
"learning_rate": 4.789668829527261e-05, |
|
"loss": 4.029, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.05013787916771121, |
|
"eval_loss": 4.040452480316162, |
|
"eval_runtime": 22.088, |
|
"eval_samples_per_second": 202.01, |
|
"eval_steps_per_second": 25.263, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.05849419236232974, |
|
"grad_norm": 14.394288063049316, |
|
"learning_rate": 4.747535181596023e-05, |
|
"loss": 4.0701, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"grad_norm": 17.005945205688477, |
|
"learning_rate": 4.705401533664785e-05, |
|
"loss": 4.0239, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"eval_loss": 3.948943853378296, |
|
"eval_runtime": 22.0828, |
|
"eval_samples_per_second": 202.058, |
|
"eval_steps_per_second": 25.269, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0752068187515668, |
|
"grad_norm": 20.297489166259766, |
|
"learning_rate": 4.663267885733547e-05, |
|
"loss": 3.9704, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.08356313194618534, |
|
"grad_norm": 17.280521392822266, |
|
"learning_rate": 4.621134237802309e-05, |
|
"loss": 3.9118, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.08356313194618534, |
|
"eval_loss": 3.891711950302124, |
|
"eval_runtime": 22.1454, |
|
"eval_samples_per_second": 201.487, |
|
"eval_steps_per_second": 25.197, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.09191944514080387, |
|
"grad_norm": 18.847597122192383, |
|
"learning_rate": 4.579000589871071e-05, |
|
"loss": 3.9024, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"grad_norm": 14.654472351074219, |
|
"learning_rate": 4.5368669419398335e-05, |
|
"loss": 3.8479, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"eval_loss": 3.8378305435180664, |
|
"eval_runtime": 22.1051, |
|
"eval_samples_per_second": 201.854, |
|
"eval_steps_per_second": 25.243, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.10863207153004095, |
|
"grad_norm": 18.267274856567383, |
|
"learning_rate": 4.494817561304458e-05, |
|
"loss": 3.8116, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.11698838472465949, |
|
"grad_norm": 13.313653945922852, |
|
"learning_rate": 4.45268391337322e-05, |
|
"loss": 3.8592, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.11698838472465949, |
|
"eval_loss": 3.8072171211242676, |
|
"eval_runtime": 22.2351, |
|
"eval_samples_per_second": 200.674, |
|
"eval_steps_per_second": 25.095, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.12534469791927802, |
|
"grad_norm": 16.960010528564453, |
|
"learning_rate": 4.410550265441982e-05, |
|
"loss": 3.8291, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"grad_norm": 15.129623413085938, |
|
"learning_rate": 4.3684166175107444e-05, |
|
"loss": 3.7697, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"eval_loss": 3.777130126953125, |
|
"eval_runtime": 22.2765, |
|
"eval_samples_per_second": 200.3, |
|
"eval_steps_per_second": 25.049, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.1420573243085151, |
|
"grad_norm": 18.825756072998047, |
|
"learning_rate": 4.326535771467094e-05, |
|
"loss": 3.7714, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.1504136375031336, |
|
"grad_norm": 13.93099308013916, |
|
"learning_rate": 4.284402123535856e-05, |
|
"loss": 3.7719, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.1504136375031336, |
|
"eval_loss": 3.747012138366699, |
|
"eval_runtime": 22.2951, |
|
"eval_samples_per_second": 200.134, |
|
"eval_steps_per_second": 25.028, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.15876995069775215, |
|
"grad_norm": 12.885889053344727, |
|
"learning_rate": 4.2422684756046185e-05, |
|
"loss": 3.7127, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"grad_norm": 15.362942695617676, |
|
"learning_rate": 4.200134827673381e-05, |
|
"loss": 3.7135, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"eval_loss": 3.7190206050872803, |
|
"eval_runtime": 22.2344, |
|
"eval_samples_per_second": 200.68, |
|
"eval_steps_per_second": 25.096, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.17548257708698922, |
|
"grad_norm": 12.432964324951172, |
|
"learning_rate": 4.158001179742142e-05, |
|
"loss": 3.6715, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.18383889028160774, |
|
"grad_norm": 25.97146987915039, |
|
"learning_rate": 4.115867531810904e-05, |
|
"loss": 3.6599, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.18383889028160774, |
|
"eval_loss": 3.6818652153015137, |
|
"eval_runtime": 22.2972, |
|
"eval_samples_per_second": 200.115, |
|
"eval_steps_per_second": 25.026, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.1921952034762263, |
|
"grad_norm": 21.378082275390625, |
|
"learning_rate": 4.073733883879666e-05, |
|
"loss": 3.6754, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"grad_norm": 11.59192943572998, |
|
"learning_rate": 4.031600235948428e-05, |
|
"loss": 3.669, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"eval_loss": 3.661188840866089, |
|
"eval_runtime": 22.2553, |
|
"eval_samples_per_second": 200.492, |
|
"eval_steps_per_second": 25.073, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.20890782986546336, |
|
"grad_norm": 17.75707244873047, |
|
"learning_rate": 3.9894665880171905e-05, |
|
"loss": 3.6794, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.2172641430600819, |
|
"grad_norm": 10.825678825378418, |
|
"learning_rate": 3.947332940085953e-05, |
|
"loss": 3.6113, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.2172641430600819, |
|
"eval_loss": 3.6508119106292725, |
|
"eval_runtime": 22.2982, |
|
"eval_samples_per_second": 200.106, |
|
"eval_steps_per_second": 25.024, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.22562045625470042, |
|
"grad_norm": 10.04261302947998, |
|
"learning_rate": 3.905199292154715e-05, |
|
"loss": 3.5963, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"grad_norm": 13.926618576049805, |
|
"learning_rate": 3.863065644223477e-05, |
|
"loss": 3.5997, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"eval_loss": 3.6223905086517334, |
|
"eval_runtime": 22.2717, |
|
"eval_samples_per_second": 200.344, |
|
"eval_steps_per_second": 25.054, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.2423330826439375, |
|
"grad_norm": 11.00304889678955, |
|
"learning_rate": 3.820931996292239e-05, |
|
"loss": 3.5991, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"grad_norm": 16.099769592285156, |
|
"learning_rate": 3.778798348361001e-05, |
|
"loss": 3.6042, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"eval_loss": 3.5953731536865234, |
|
"eval_runtime": 22.2814, |
|
"eval_samples_per_second": 200.257, |
|
"eval_steps_per_second": 25.043, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.2590457090331746, |
|
"grad_norm": 12.459487915039062, |
|
"learning_rate": 3.736664700429763e-05, |
|
"loss": 3.5871, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"grad_norm": 16.979909896850586, |
|
"learning_rate": 3.6946153197943875e-05, |
|
"loss": 3.5238, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"eval_loss": 3.590113401412964, |
|
"eval_runtime": 22.2293, |
|
"eval_samples_per_second": 200.726, |
|
"eval_steps_per_second": 25.102, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.2757583354224116, |
|
"grad_norm": 23.20758056640625, |
|
"learning_rate": 3.65248167186315e-05, |
|
"loss": 3.5646, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.2841146486170302, |
|
"grad_norm": 18.35931396484375, |
|
"learning_rate": 3.610348023931912e-05, |
|
"loss": 3.5445, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2841146486170302, |
|
"eval_loss": 3.563676595687866, |
|
"eval_runtime": 22.2571, |
|
"eval_samples_per_second": 200.475, |
|
"eval_steps_per_second": 25.071, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.2924709618116487, |
|
"grad_norm": 17.187950134277344, |
|
"learning_rate": 3.568214376000674e-05, |
|
"loss": 3.494, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"grad_norm": 15.331987380981445, |
|
"learning_rate": 3.5261649953652984e-05, |
|
"loss": 3.4913, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"eval_loss": 3.541306495666504, |
|
"eval_runtime": 22.2598, |
|
"eval_samples_per_second": 200.451, |
|
"eval_steps_per_second": 25.068, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.30918358820088576, |
|
"grad_norm": 16.340852737426758, |
|
"learning_rate": 3.484031347434061e-05, |
|
"loss": 3.4969, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.3175399013955043, |
|
"grad_norm": 12.265207290649414, |
|
"learning_rate": 3.441897699502823e-05, |
|
"loss": 3.4934, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.3175399013955043, |
|
"eval_loss": 3.520357847213745, |
|
"eval_runtime": 22.2447, |
|
"eval_samples_per_second": 200.587, |
|
"eval_steps_per_second": 25.085, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.32589621459012286, |
|
"grad_norm": 15.456232070922852, |
|
"learning_rate": 3.399764051571585e-05, |
|
"loss": 3.5013, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"grad_norm": 15.721699714660645, |
|
"learning_rate": 3.3576304036403474e-05, |
|
"loss": 3.4627, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"eval_loss": 3.5179378986358643, |
|
"eval_runtime": 22.2594, |
|
"eval_samples_per_second": 200.455, |
|
"eval_steps_per_second": 25.068, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.3426088409793599, |
|
"grad_norm": 12.118553161621094, |
|
"learning_rate": 3.3154967557091096e-05, |
|
"loss": 3.5006, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.35096515417397844, |
|
"grad_norm": 8.990864753723145, |
|
"learning_rate": 3.273447375073734e-05, |
|
"loss": 3.4367, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.35096515417397844, |
|
"eval_loss": 3.5118658542633057, |
|
"eval_runtime": 22.2415, |
|
"eval_samples_per_second": 200.616, |
|
"eval_steps_per_second": 25.088, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.359321467368597, |
|
"grad_norm": 9.9972562789917, |
|
"learning_rate": 3.231313727142496e-05, |
|
"loss": 3.4498, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"grad_norm": 10.996673583984375, |
|
"learning_rate": 3.189180079211258e-05, |
|
"loss": 3.4643, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"eval_loss": 3.483738899230957, |
|
"eval_runtime": 22.2582, |
|
"eval_samples_per_second": 200.465, |
|
"eval_steps_per_second": 25.069, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.37603409375783403, |
|
"grad_norm": 14.55636978149414, |
|
"learning_rate": 3.14704643128002e-05, |
|
"loss": 3.5215, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.3843904069524526, |
|
"grad_norm": 13.585105895996094, |
|
"learning_rate": 3.104912783348782e-05, |
|
"loss": 3.419, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.3843904069524526, |
|
"eval_loss": 3.47660231590271, |
|
"eval_runtime": 22.226, |
|
"eval_samples_per_second": 200.756, |
|
"eval_steps_per_second": 25.106, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.3927467201470711, |
|
"grad_norm": 11.853238105773926, |
|
"learning_rate": 3.062779135417544e-05, |
|
"loss": 3.4438, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"grad_norm": 13.06174373626709, |
|
"learning_rate": 3.020729754782169e-05, |
|
"loss": 3.4029, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"eval_loss": 3.4587268829345703, |
|
"eval_runtime": 22.2726, |
|
"eval_samples_per_second": 200.336, |
|
"eval_steps_per_second": 25.053, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.40945934653630817, |
|
"grad_norm": 16.874757766723633, |
|
"learning_rate": 2.9786803741467938e-05, |
|
"loss": 3.3971, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.4178156597309267, |
|
"grad_norm": 11.108474731445312, |
|
"learning_rate": 2.936546726215556e-05, |
|
"loss": 3.3574, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.4178156597309267, |
|
"eval_loss": 3.446179151535034, |
|
"eval_runtime": 22.3522, |
|
"eval_samples_per_second": 199.622, |
|
"eval_steps_per_second": 24.964, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.42617197292554526, |
|
"grad_norm": 12.936110496520996, |
|
"learning_rate": 2.8944130782843183e-05, |
|
"loss": 3.3829, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"grad_norm": 12.90854549407959, |
|
"learning_rate": 2.8522794303530802e-05, |
|
"loss": 3.4156, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"eval_loss": 3.44026517868042, |
|
"eval_runtime": 22.2661, |
|
"eval_samples_per_second": 200.394, |
|
"eval_steps_per_second": 25.061, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.4428845993147823, |
|
"grad_norm": 10.326555252075195, |
|
"learning_rate": 2.8101457824218424e-05, |
|
"loss": 3.3607, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.45124091250940085, |
|
"grad_norm": 12.372066497802734, |
|
"learning_rate": 2.7681806690823293e-05, |
|
"loss": 3.3836, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.45124091250940085, |
|
"eval_loss": 3.4253649711608887, |
|
"eval_runtime": 22.2507, |
|
"eval_samples_per_second": 200.533, |
|
"eval_steps_per_second": 25.078, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.4595972257040194, |
|
"grad_norm": 9.778299331665039, |
|
"learning_rate": 2.7260470211510912e-05, |
|
"loss": 3.3671, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"grad_norm": 20.047178268432617, |
|
"learning_rate": 2.6839133732198535e-05, |
|
"loss": 3.3395, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"eval_loss": 3.41679048538208, |
|
"eval_runtime": 22.2707, |
|
"eval_samples_per_second": 200.353, |
|
"eval_steps_per_second": 25.055, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.47630985209325644, |
|
"grad_norm": 9.312335968017578, |
|
"learning_rate": 2.6417797252886157e-05, |
|
"loss": 3.3616, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.484666165287875, |
|
"grad_norm": 10.994682312011719, |
|
"learning_rate": 2.5996460773573776e-05, |
|
"loss": 3.3719, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.484666165287875, |
|
"eval_loss": 3.4018924236297607, |
|
"eval_runtime": 22.2565, |
|
"eval_samples_per_second": 200.481, |
|
"eval_steps_per_second": 25.071, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.49302247848249353, |
|
"grad_norm": 13.464505195617676, |
|
"learning_rate": 2.5575124294261398e-05, |
|
"loss": 3.3312, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"grad_norm": 12.18619441986084, |
|
"learning_rate": 2.515378781494902e-05, |
|
"loss": 3.386, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"eval_loss": 3.3899354934692383, |
|
"eval_runtime": 22.2658, |
|
"eval_samples_per_second": 200.397, |
|
"eval_steps_per_second": 25.061, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.5097351048717306, |
|
"grad_norm": 14.552848815917969, |
|
"learning_rate": 2.4732451335636643e-05, |
|
"loss": 3.3377, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.5180914180663492, |
|
"grad_norm": 15.032088279724121, |
|
"learning_rate": 2.4311114856324262e-05, |
|
"loss": 3.3131, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.5180914180663492, |
|
"eval_loss": 3.378127336502075, |
|
"eval_runtime": 22.2423, |
|
"eval_samples_per_second": 200.609, |
|
"eval_steps_per_second": 25.087, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.5264477312609677, |
|
"grad_norm": 14.666757583618164, |
|
"learning_rate": 2.388977837701188e-05, |
|
"loss": 3.3457, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"grad_norm": 11.800482749938965, |
|
"learning_rate": 2.3468441897699503e-05, |
|
"loss": 3.3192, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"eval_loss": 3.3670458793640137, |
|
"eval_runtime": 22.256, |
|
"eval_samples_per_second": 200.485, |
|
"eval_steps_per_second": 25.072, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.5431603576502048, |
|
"grad_norm": 10.835103034973145, |
|
"learning_rate": 2.3047105418387125e-05, |
|
"loss": 3.3235, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.5515166708448233, |
|
"grad_norm": 12.06092357635498, |
|
"learning_rate": 2.2625768939074744e-05, |
|
"loss": 3.2969, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.5515166708448233, |
|
"eval_loss": 3.356658935546875, |
|
"eval_runtime": 22.2404, |
|
"eval_samples_per_second": 200.626, |
|
"eval_steps_per_second": 25.089, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.5598729840394417, |
|
"grad_norm": 15.398877143859863, |
|
"learning_rate": 2.2204432459762367e-05, |
|
"loss": 3.3181, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"grad_norm": 10.425477027893066, |
|
"learning_rate": 2.178309598044999e-05, |
|
"loss": 3.3202, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"eval_loss": 3.34324312210083, |
|
"eval_runtime": 22.2237, |
|
"eval_samples_per_second": 200.777, |
|
"eval_steps_per_second": 25.108, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.5765856104286788, |
|
"grad_norm": 13.118115425109863, |
|
"learning_rate": 2.136175950113761e-05, |
|
"loss": 3.3028, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.5849419236232974, |
|
"grad_norm": 8.235157012939453, |
|
"learning_rate": 2.0941265694783854e-05, |
|
"loss": 3.2403, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.5849419236232974, |
|
"eval_loss": 3.3430681228637695, |
|
"eval_runtime": 22.2974, |
|
"eval_samples_per_second": 200.113, |
|
"eval_steps_per_second": 25.025, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.5932982368179159, |
|
"grad_norm": 15.389208793640137, |
|
"learning_rate": 2.0519929215471476e-05, |
|
"loss": 3.3105, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"grad_norm": 12.708732604980469, |
|
"learning_rate": 2.0098592736159098e-05, |
|
"loss": 3.2775, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"eval_loss": 3.3276991844177246, |
|
"eval_runtime": 22.2643, |
|
"eval_samples_per_second": 200.411, |
|
"eval_steps_per_second": 25.063, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.610010863207153, |
|
"grad_norm": 13.642451286315918, |
|
"learning_rate": 1.9677256256846717e-05, |
|
"loss": 3.2902, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.6183671764017715, |
|
"grad_norm": 12.606600761413574, |
|
"learning_rate": 1.9256762450492966e-05, |
|
"loss": 3.271, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.6183671764017715, |
|
"eval_loss": 3.3122496604919434, |
|
"eval_runtime": 22.2804, |
|
"eval_samples_per_second": 200.266, |
|
"eval_steps_per_second": 25.044, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.6267234895963901, |
|
"grad_norm": 11.484159469604492, |
|
"learning_rate": 1.8835425971180585e-05, |
|
"loss": 3.2833, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"grad_norm": 12.317131996154785, |
|
"learning_rate": 1.8414089491868204e-05, |
|
"loss": 3.2848, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"eval_loss": 3.3035213947296143, |
|
"eval_runtime": 22.2937, |
|
"eval_samples_per_second": 200.147, |
|
"eval_steps_per_second": 25.03, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.6434361159856271, |
|
"grad_norm": 11.45077896118164, |
|
"learning_rate": 1.7992753012555827e-05, |
|
"loss": 3.202, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.6517924291802457, |
|
"grad_norm": 12.859657287597656, |
|
"learning_rate": 1.7572259206202076e-05, |
|
"loss": 3.2376, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.6517924291802457, |
|
"eval_loss": 3.2956559658050537, |
|
"eval_runtime": 22.3804, |
|
"eval_samples_per_second": 199.371, |
|
"eval_steps_per_second": 24.933, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.6601487423748642, |
|
"grad_norm": 14.472012519836426, |
|
"learning_rate": 1.7150922726889695e-05, |
|
"loss": 3.1924, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"grad_norm": 13.051079750061035, |
|
"learning_rate": 1.673042892053594e-05, |
|
"loss": 3.2598, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"eval_loss": 3.2878499031066895, |
|
"eval_runtime": 22.2464, |
|
"eval_samples_per_second": 200.572, |
|
"eval_steps_per_second": 25.083, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.6768613687641013, |
|
"grad_norm": 15.44560718536377, |
|
"learning_rate": 1.6309092441223563e-05, |
|
"loss": 3.1978, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.6852176819587198, |
|
"grad_norm": 16.988996505737305, |
|
"learning_rate": 1.588775596191118e-05, |
|
"loss": 3.2247, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.6852176819587198, |
|
"eval_loss": 3.279550313949585, |
|
"eval_runtime": 22.2386, |
|
"eval_samples_per_second": 200.642, |
|
"eval_steps_per_second": 25.091, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.6935739951533384, |
|
"grad_norm": 8.293917655944824, |
|
"learning_rate": 1.5466419482598804e-05, |
|
"loss": 3.1682, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"grad_norm": 10.755880355834961, |
|
"learning_rate": 1.5045925676245051e-05, |
|
"loss": 3.1849, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"eval_loss": 3.2791192531585693, |
|
"eval_runtime": 22.2554, |
|
"eval_samples_per_second": 200.491, |
|
"eval_steps_per_second": 25.073, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.7102866215425754, |
|
"grad_norm": 17.822643280029297, |
|
"learning_rate": 1.462458919693267e-05, |
|
"loss": 3.1714, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.718642934737194, |
|
"grad_norm": 18.230485916137695, |
|
"learning_rate": 1.4203252717620291e-05, |
|
"loss": 3.2112, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.718642934737194, |
|
"eval_loss": 3.260193109512329, |
|
"eval_runtime": 22.2518, |
|
"eval_samples_per_second": 200.523, |
|
"eval_steps_per_second": 25.077, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.7269992479318125, |
|
"grad_norm": 13.363430976867676, |
|
"learning_rate": 1.3781916238307913e-05, |
|
"loss": 3.1655, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"grad_norm": 11.570181846618652, |
|
"learning_rate": 1.3360579758995534e-05, |
|
"loss": 3.174, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"eval_loss": 3.2490386962890625, |
|
"eval_runtime": 22.2683, |
|
"eval_samples_per_second": 200.374, |
|
"eval_steps_per_second": 25.058, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.7437118743210496, |
|
"grad_norm": 19.80602264404297, |
|
"learning_rate": 1.2939243279683155e-05, |
|
"loss": 3.1987, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.7520681875156681, |
|
"grad_norm": 10.821731567382812, |
|
"learning_rate": 1.2518749473329402e-05, |
|
"loss": 3.1799, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.7520681875156681, |
|
"eval_loss": 3.240847587585449, |
|
"eval_runtime": 22.2794, |
|
"eval_samples_per_second": 200.275, |
|
"eval_steps_per_second": 25.046, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 0.7604245007102867, |
|
"grad_norm": 16.301612854003906, |
|
"learning_rate": 1.2097412994017023e-05, |
|
"loss": 3.2029, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"grad_norm": 14.699359893798828, |
|
"learning_rate": 1.1676076514704643e-05, |
|
"loss": 3.1752, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"eval_loss": 3.233914852142334, |
|
"eval_runtime": 22.269, |
|
"eval_samples_per_second": 200.369, |
|
"eval_steps_per_second": 25.057, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.7771371270995237, |
|
"grad_norm": 15.696563720703125, |
|
"learning_rate": 1.1254740035392266e-05, |
|
"loss": 3.132, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 0.7854934402941423, |
|
"grad_norm": 13.062487602233887, |
|
"learning_rate": 1.0833403556079886e-05, |
|
"loss": 3.131, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.7854934402941423, |
|
"eval_loss": 3.2280752658843994, |
|
"eval_runtime": 22.2955, |
|
"eval_samples_per_second": 200.13, |
|
"eval_steps_per_second": 25.027, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 0.7938497534887607, |
|
"grad_norm": 18.67305564880371, |
|
"learning_rate": 1.0412909749726132e-05, |
|
"loss": 3.1571, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"grad_norm": 10.377827644348145, |
|
"learning_rate": 9.992415943372378e-06, |
|
"loss": 3.181, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"eval_loss": 3.2206084728240967, |
|
"eval_runtime": 22.2943, |
|
"eval_samples_per_second": 200.141, |
|
"eval_steps_per_second": 25.029, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.8105623798779978, |
|
"grad_norm": 12.836233139038086, |
|
"learning_rate": 9.571922137018624e-06, |
|
"loss": 3.139, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 0.8189186930726163, |
|
"grad_norm": 11.736408233642578, |
|
"learning_rate": 9.150585657706244e-06, |
|
"loss": 3.0932, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.8189186930726163, |
|
"eval_loss": 3.2135069370269775, |
|
"eval_runtime": 22.2506, |
|
"eval_samples_per_second": 200.534, |
|
"eval_steps_per_second": 25.078, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 0.8272750062672349, |
|
"grad_norm": 16.016298294067383, |
|
"learning_rate": 8.729249178393865e-06, |
|
"loss": 3.1634, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"grad_norm": 10.488819122314453, |
|
"learning_rate": 8.307912699081487e-06, |
|
"loss": 3.1376, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"eval_loss": 3.2051162719726562, |
|
"eval_runtime": 22.294, |
|
"eval_samples_per_second": 200.144, |
|
"eval_steps_per_second": 25.029, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.8439876326564719, |
|
"grad_norm": 16.168071746826172, |
|
"learning_rate": 7.886576219769108e-06, |
|
"loss": 3.1121, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 0.8523439458510905, |
|
"grad_norm": 19.903099060058594, |
|
"learning_rate": 7.465239740456729e-06, |
|
"loss": 3.1084, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.8523439458510905, |
|
"eval_loss": 3.198310375213623, |
|
"eval_runtime": 22.3049, |
|
"eval_samples_per_second": 200.046, |
|
"eval_steps_per_second": 25.017, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 0.860700259045709, |
|
"grad_norm": 12.082676887512207, |
|
"learning_rate": 7.043903261144351e-06, |
|
"loss": 3.0957, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"grad_norm": 11.764552116394043, |
|
"learning_rate": 6.622566781831971e-06, |
|
"loss": 3.099, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"eval_loss": 3.193253993988037, |
|
"eval_runtime": 22.2469, |
|
"eval_samples_per_second": 200.567, |
|
"eval_steps_per_second": 25.082, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.8774128854349461, |
|
"grad_norm": 12.482972145080566, |
|
"learning_rate": 6.201230302519592e-06, |
|
"loss": 3.0779, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 0.8857691986295646, |
|
"grad_norm": 14.11436939239502, |
|
"learning_rate": 5.7798938232072135e-06, |
|
"loss": 3.1278, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.8857691986295646, |
|
"eval_loss": 3.1867904663085938, |
|
"eval_runtime": 22.2664, |
|
"eval_samples_per_second": 200.392, |
|
"eval_steps_per_second": 25.06, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 0.8941255118241832, |
|
"grad_norm": 19.69700813293457, |
|
"learning_rate": 5.358557343894835e-06, |
|
"loss": 3.0968, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"grad_norm": 14.537339210510254, |
|
"learning_rate": 4.937220864582456e-06, |
|
"loss": 3.1436, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"eval_loss": 3.180774688720703, |
|
"eval_runtime": 22.293, |
|
"eval_samples_per_second": 200.152, |
|
"eval_steps_per_second": 25.03, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.9108381382134202, |
|
"grad_norm": 16.117996215820312, |
|
"learning_rate": 4.515884385270077e-06, |
|
"loss": 3.1288, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 0.9191944514080388, |
|
"grad_norm": 12.458276748657227, |
|
"learning_rate": 4.094547905957698e-06, |
|
"loss": 3.0763, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.9191944514080388, |
|
"eval_loss": 3.175370216369629, |
|
"eval_runtime": 22.253, |
|
"eval_samples_per_second": 200.513, |
|
"eval_steps_per_second": 25.075, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 0.9275507646026573, |
|
"grad_norm": 14.115385055541992, |
|
"learning_rate": 3.6732114266453192e-06, |
|
"loss": 3.0642, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"grad_norm": 19.65464210510254, |
|
"learning_rate": 3.2518749473329403e-06, |
|
"loss": 3.1248, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"eval_loss": 3.1690962314605713, |
|
"eval_runtime": 22.3258, |
|
"eval_samples_per_second": 199.858, |
|
"eval_steps_per_second": 24.993, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.9442633909918944, |
|
"grad_norm": 11.953753471374512, |
|
"learning_rate": 2.831381140979186e-06, |
|
"loss": 3.1361, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 0.9526197041865129, |
|
"grad_norm": 10.821110725402832, |
|
"learning_rate": 2.4108873346254323e-06, |
|
"loss": 3.0418, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.9526197041865129, |
|
"eval_loss": 3.164776563644409, |
|
"eval_runtime": 22.2464, |
|
"eval_samples_per_second": 200.572, |
|
"eval_steps_per_second": 25.083, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 0.9609760173811315, |
|
"grad_norm": 11.476717948913574, |
|
"learning_rate": 1.9895508553130533e-06, |
|
"loss": 3.0504, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"grad_norm": 10.973363876342773, |
|
"learning_rate": 1.5682143760006742e-06, |
|
"loss": 3.0755, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"eval_loss": 3.1603705883026123, |
|
"eval_runtime": 22.272, |
|
"eval_samples_per_second": 200.341, |
|
"eval_steps_per_second": 25.054, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.9776886437703685, |
|
"grad_norm": 10.836787223815918, |
|
"learning_rate": 1.1468778966882954e-06, |
|
"loss": 3.1001, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 0.9860449569649871, |
|
"grad_norm": 13.901703834533691, |
|
"learning_rate": 7.255414173759165e-07, |
|
"loss": 3.0633, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.9860449569649871, |
|
"eval_loss": 3.1587648391723633, |
|
"eval_runtime": 22.3994, |
|
"eval_samples_per_second": 199.202, |
|
"eval_steps_per_second": 24.911, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 0.9944012701596056, |
|
"grad_norm": 19.022567749023438, |
|
"learning_rate": 3.0420493806353753e-07, |
|
"loss": 3.0751, |
|
"step": 59500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 59835, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.943723113325527e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|