innovation-hacking2's picture
Upload folder using huggingface_hub
ad5fb8e verified
{
"best_metric": 3.1603705883026123,
"best_model_checkpoint": "./models/full-finetuning/LLaMmlein_120M/checkpoint-58000",
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 59835,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008356313194618534,
"grad_norm": 40.14425277709961,
"learning_rate": 4.96e-05,
"loss": 4.8477,
"step": 500
},
{
"epoch": 0.016712626389237067,
"grad_norm": 22.107275009155273,
"learning_rate": 4.958203421252212e-05,
"loss": 4.4466,
"step": 1000
},
{
"epoch": 0.016712626389237067,
"eval_loss": 4.3118462562561035,
"eval_runtime": 22.0404,
"eval_samples_per_second": 202.446,
"eval_steps_per_second": 25.317,
"step": 1000
},
{
"epoch": 0.025068939583855605,
"grad_norm": 19.609399795532227,
"learning_rate": 4.916069773320974e-05,
"loss": 4.2986,
"step": 1500
},
{
"epoch": 0.033425252778474135,
"grad_norm": 18.64438247680664,
"learning_rate": 4.873936125389736e-05,
"loss": 4.2295,
"step": 2000
},
{
"epoch": 0.033425252778474135,
"eval_loss": 4.121812343597412,
"eval_runtime": 22.1428,
"eval_samples_per_second": 201.51,
"eval_steps_per_second": 25.2,
"step": 2000
},
{
"epoch": 0.04178156597309267,
"grad_norm": 19.098369598388672,
"learning_rate": 4.8318024774584986e-05,
"loss": 4.1165,
"step": 2500
},
{
"epoch": 0.05013787916771121,
"grad_norm": 17.83785629272461,
"learning_rate": 4.789668829527261e-05,
"loss": 4.029,
"step": 3000
},
{
"epoch": 0.05013787916771121,
"eval_loss": 4.040452480316162,
"eval_runtime": 22.088,
"eval_samples_per_second": 202.01,
"eval_steps_per_second": 25.263,
"step": 3000
},
{
"epoch": 0.05849419236232974,
"grad_norm": 14.394288063049316,
"learning_rate": 4.747535181596023e-05,
"loss": 4.0701,
"step": 3500
},
{
"epoch": 0.06685050555694827,
"grad_norm": 17.005945205688477,
"learning_rate": 4.705401533664785e-05,
"loss": 4.0239,
"step": 4000
},
{
"epoch": 0.06685050555694827,
"eval_loss": 3.948943853378296,
"eval_runtime": 22.0828,
"eval_samples_per_second": 202.058,
"eval_steps_per_second": 25.269,
"step": 4000
},
{
"epoch": 0.0752068187515668,
"grad_norm": 20.297489166259766,
"learning_rate": 4.663267885733547e-05,
"loss": 3.9704,
"step": 4500
},
{
"epoch": 0.08356313194618534,
"grad_norm": 17.280521392822266,
"learning_rate": 4.621134237802309e-05,
"loss": 3.9118,
"step": 5000
},
{
"epoch": 0.08356313194618534,
"eval_loss": 3.891711950302124,
"eval_runtime": 22.1454,
"eval_samples_per_second": 201.487,
"eval_steps_per_second": 25.197,
"step": 5000
},
{
"epoch": 0.09191944514080387,
"grad_norm": 18.847597122192383,
"learning_rate": 4.579000589871071e-05,
"loss": 3.9024,
"step": 5500
},
{
"epoch": 0.10027575833542242,
"grad_norm": 14.654472351074219,
"learning_rate": 4.5368669419398335e-05,
"loss": 3.8479,
"step": 6000
},
{
"epoch": 0.10027575833542242,
"eval_loss": 3.8378305435180664,
"eval_runtime": 22.1051,
"eval_samples_per_second": 201.854,
"eval_steps_per_second": 25.243,
"step": 6000
},
{
"epoch": 0.10863207153004095,
"grad_norm": 18.267274856567383,
"learning_rate": 4.494817561304458e-05,
"loss": 3.8116,
"step": 6500
},
{
"epoch": 0.11698838472465949,
"grad_norm": 13.313653945922852,
"learning_rate": 4.45268391337322e-05,
"loss": 3.8592,
"step": 7000
},
{
"epoch": 0.11698838472465949,
"eval_loss": 3.8072171211242676,
"eval_runtime": 22.2351,
"eval_samples_per_second": 200.674,
"eval_steps_per_second": 25.095,
"step": 7000
},
{
"epoch": 0.12534469791927802,
"grad_norm": 16.960010528564453,
"learning_rate": 4.410550265441982e-05,
"loss": 3.8291,
"step": 7500
},
{
"epoch": 0.13370101111389654,
"grad_norm": 15.129623413085938,
"learning_rate": 4.3684166175107444e-05,
"loss": 3.7697,
"step": 8000
},
{
"epoch": 0.13370101111389654,
"eval_loss": 3.777130126953125,
"eval_runtime": 22.2765,
"eval_samples_per_second": 200.3,
"eval_steps_per_second": 25.049,
"step": 8000
},
{
"epoch": 0.1420573243085151,
"grad_norm": 18.825756072998047,
"learning_rate": 4.326535771467094e-05,
"loss": 3.7714,
"step": 8500
},
{
"epoch": 0.1504136375031336,
"grad_norm": 13.93099308013916,
"learning_rate": 4.284402123535856e-05,
"loss": 3.7719,
"step": 9000
},
{
"epoch": 0.1504136375031336,
"eval_loss": 3.747012138366699,
"eval_runtime": 22.2951,
"eval_samples_per_second": 200.134,
"eval_steps_per_second": 25.028,
"step": 9000
},
{
"epoch": 0.15876995069775215,
"grad_norm": 12.885889053344727,
"learning_rate": 4.2422684756046185e-05,
"loss": 3.7127,
"step": 9500
},
{
"epoch": 0.16712626389237067,
"grad_norm": 15.362942695617676,
"learning_rate": 4.200134827673381e-05,
"loss": 3.7135,
"step": 10000
},
{
"epoch": 0.16712626389237067,
"eval_loss": 3.7190206050872803,
"eval_runtime": 22.2344,
"eval_samples_per_second": 200.68,
"eval_steps_per_second": 25.096,
"step": 10000
},
{
"epoch": 0.17548257708698922,
"grad_norm": 12.432964324951172,
"learning_rate": 4.158001179742142e-05,
"loss": 3.6715,
"step": 10500
},
{
"epoch": 0.18383889028160774,
"grad_norm": 25.97146987915039,
"learning_rate": 4.115867531810904e-05,
"loss": 3.6599,
"step": 11000
},
{
"epoch": 0.18383889028160774,
"eval_loss": 3.6818652153015137,
"eval_runtime": 22.2972,
"eval_samples_per_second": 200.115,
"eval_steps_per_second": 25.026,
"step": 11000
},
{
"epoch": 0.1921952034762263,
"grad_norm": 21.378082275390625,
"learning_rate": 4.073733883879666e-05,
"loss": 3.6754,
"step": 11500
},
{
"epoch": 0.20055151667084484,
"grad_norm": 11.59192943572998,
"learning_rate": 4.031600235948428e-05,
"loss": 3.669,
"step": 12000
},
{
"epoch": 0.20055151667084484,
"eval_loss": 3.661188840866089,
"eval_runtime": 22.2553,
"eval_samples_per_second": 200.492,
"eval_steps_per_second": 25.073,
"step": 12000
},
{
"epoch": 0.20890782986546336,
"grad_norm": 17.75707244873047,
"learning_rate": 3.9894665880171905e-05,
"loss": 3.6794,
"step": 12500
},
{
"epoch": 0.2172641430600819,
"grad_norm": 10.825678825378418,
"learning_rate": 3.947332940085953e-05,
"loss": 3.6113,
"step": 13000
},
{
"epoch": 0.2172641430600819,
"eval_loss": 3.6508119106292725,
"eval_runtime": 22.2982,
"eval_samples_per_second": 200.106,
"eval_steps_per_second": 25.024,
"step": 13000
},
{
"epoch": 0.22562045625470042,
"grad_norm": 10.04261302947998,
"learning_rate": 3.905199292154715e-05,
"loss": 3.5963,
"step": 13500
},
{
"epoch": 0.23397676944931897,
"grad_norm": 13.926618576049805,
"learning_rate": 3.863065644223477e-05,
"loss": 3.5997,
"step": 14000
},
{
"epoch": 0.23397676944931897,
"eval_loss": 3.6223905086517334,
"eval_runtime": 22.2717,
"eval_samples_per_second": 200.344,
"eval_steps_per_second": 25.054,
"step": 14000
},
{
"epoch": 0.2423330826439375,
"grad_norm": 11.00304889678955,
"learning_rate": 3.820931996292239e-05,
"loss": 3.5991,
"step": 14500
},
{
"epoch": 0.25068939583855604,
"grad_norm": 16.099769592285156,
"learning_rate": 3.778798348361001e-05,
"loss": 3.6042,
"step": 15000
},
{
"epoch": 0.25068939583855604,
"eval_loss": 3.5953731536865234,
"eval_runtime": 22.2814,
"eval_samples_per_second": 200.257,
"eval_steps_per_second": 25.043,
"step": 15000
},
{
"epoch": 0.2590457090331746,
"grad_norm": 12.459487915039062,
"learning_rate": 3.736664700429763e-05,
"loss": 3.5871,
"step": 15500
},
{
"epoch": 0.2674020222277931,
"grad_norm": 16.979909896850586,
"learning_rate": 3.6946153197943875e-05,
"loss": 3.5238,
"step": 16000
},
{
"epoch": 0.2674020222277931,
"eval_loss": 3.590113401412964,
"eval_runtime": 22.2293,
"eval_samples_per_second": 200.726,
"eval_steps_per_second": 25.102,
"step": 16000
},
{
"epoch": 0.2757583354224116,
"grad_norm": 23.20758056640625,
"learning_rate": 3.65248167186315e-05,
"loss": 3.5646,
"step": 16500
},
{
"epoch": 0.2841146486170302,
"grad_norm": 18.35931396484375,
"learning_rate": 3.610348023931912e-05,
"loss": 3.5445,
"step": 17000
},
{
"epoch": 0.2841146486170302,
"eval_loss": 3.563676595687866,
"eval_runtime": 22.2571,
"eval_samples_per_second": 200.475,
"eval_steps_per_second": 25.071,
"step": 17000
},
{
"epoch": 0.2924709618116487,
"grad_norm": 17.187950134277344,
"learning_rate": 3.568214376000674e-05,
"loss": 3.494,
"step": 17500
},
{
"epoch": 0.3008272750062672,
"grad_norm": 15.331987380981445,
"learning_rate": 3.5261649953652984e-05,
"loss": 3.4913,
"step": 18000
},
{
"epoch": 0.3008272750062672,
"eval_loss": 3.541306495666504,
"eval_runtime": 22.2598,
"eval_samples_per_second": 200.451,
"eval_steps_per_second": 25.068,
"step": 18000
},
{
"epoch": 0.30918358820088576,
"grad_norm": 16.340852737426758,
"learning_rate": 3.484031347434061e-05,
"loss": 3.4969,
"step": 18500
},
{
"epoch": 0.3175399013955043,
"grad_norm": 12.265207290649414,
"learning_rate": 3.441897699502823e-05,
"loss": 3.4934,
"step": 19000
},
{
"epoch": 0.3175399013955043,
"eval_loss": 3.520357847213745,
"eval_runtime": 22.2447,
"eval_samples_per_second": 200.587,
"eval_steps_per_second": 25.085,
"step": 19000
},
{
"epoch": 0.32589621459012286,
"grad_norm": 15.456232070922852,
"learning_rate": 3.399764051571585e-05,
"loss": 3.5013,
"step": 19500
},
{
"epoch": 0.33425252778474135,
"grad_norm": 15.721699714660645,
"learning_rate": 3.3576304036403474e-05,
"loss": 3.4627,
"step": 20000
},
{
"epoch": 0.33425252778474135,
"eval_loss": 3.5179378986358643,
"eval_runtime": 22.2594,
"eval_samples_per_second": 200.455,
"eval_steps_per_second": 25.068,
"step": 20000
},
{
"epoch": 0.3426088409793599,
"grad_norm": 12.118553161621094,
"learning_rate": 3.3154967557091096e-05,
"loss": 3.5006,
"step": 20500
},
{
"epoch": 0.35096515417397844,
"grad_norm": 8.990864753723145,
"learning_rate": 3.273447375073734e-05,
"loss": 3.4367,
"step": 21000
},
{
"epoch": 0.35096515417397844,
"eval_loss": 3.5118658542633057,
"eval_runtime": 22.2415,
"eval_samples_per_second": 200.616,
"eval_steps_per_second": 25.088,
"step": 21000
},
{
"epoch": 0.359321467368597,
"grad_norm": 9.9972562789917,
"learning_rate": 3.231313727142496e-05,
"loss": 3.4498,
"step": 21500
},
{
"epoch": 0.3676777805632155,
"grad_norm": 10.996673583984375,
"learning_rate": 3.189180079211258e-05,
"loss": 3.4643,
"step": 22000
},
{
"epoch": 0.3676777805632155,
"eval_loss": 3.483738899230957,
"eval_runtime": 22.2582,
"eval_samples_per_second": 200.465,
"eval_steps_per_second": 25.069,
"step": 22000
},
{
"epoch": 0.37603409375783403,
"grad_norm": 14.55636978149414,
"learning_rate": 3.14704643128002e-05,
"loss": 3.5215,
"step": 22500
},
{
"epoch": 0.3843904069524526,
"grad_norm": 13.585105895996094,
"learning_rate": 3.104912783348782e-05,
"loss": 3.419,
"step": 23000
},
{
"epoch": 0.3843904069524526,
"eval_loss": 3.47660231590271,
"eval_runtime": 22.226,
"eval_samples_per_second": 200.756,
"eval_steps_per_second": 25.106,
"step": 23000
},
{
"epoch": 0.3927467201470711,
"grad_norm": 11.853238105773926,
"learning_rate": 3.062779135417544e-05,
"loss": 3.4438,
"step": 23500
},
{
"epoch": 0.4011030333416897,
"grad_norm": 13.06174373626709,
"learning_rate": 3.020729754782169e-05,
"loss": 3.4029,
"step": 24000
},
{
"epoch": 0.4011030333416897,
"eval_loss": 3.4587268829345703,
"eval_runtime": 22.2726,
"eval_samples_per_second": 200.336,
"eval_steps_per_second": 25.053,
"step": 24000
},
{
"epoch": 0.40945934653630817,
"grad_norm": 16.874757766723633,
"learning_rate": 2.9786803741467938e-05,
"loss": 3.3971,
"step": 24500
},
{
"epoch": 0.4178156597309267,
"grad_norm": 11.108474731445312,
"learning_rate": 2.936546726215556e-05,
"loss": 3.3574,
"step": 25000
},
{
"epoch": 0.4178156597309267,
"eval_loss": 3.446179151535034,
"eval_runtime": 22.3522,
"eval_samples_per_second": 199.622,
"eval_steps_per_second": 24.964,
"step": 25000
},
{
"epoch": 0.42617197292554526,
"grad_norm": 12.936110496520996,
"learning_rate": 2.8944130782843183e-05,
"loss": 3.3829,
"step": 25500
},
{
"epoch": 0.4345282861201638,
"grad_norm": 12.90854549407959,
"learning_rate": 2.8522794303530802e-05,
"loss": 3.4156,
"step": 26000
},
{
"epoch": 0.4345282861201638,
"eval_loss": 3.44026517868042,
"eval_runtime": 22.2661,
"eval_samples_per_second": 200.394,
"eval_steps_per_second": 25.061,
"step": 26000
},
{
"epoch": 0.4428845993147823,
"grad_norm": 10.326555252075195,
"learning_rate": 2.8101457824218424e-05,
"loss": 3.3607,
"step": 26500
},
{
"epoch": 0.45124091250940085,
"grad_norm": 12.372066497802734,
"learning_rate": 2.7681806690823293e-05,
"loss": 3.3836,
"step": 27000
},
{
"epoch": 0.45124091250940085,
"eval_loss": 3.4253649711608887,
"eval_runtime": 22.2507,
"eval_samples_per_second": 200.533,
"eval_steps_per_second": 25.078,
"step": 27000
},
{
"epoch": 0.4595972257040194,
"grad_norm": 9.778299331665039,
"learning_rate": 2.7260470211510912e-05,
"loss": 3.3671,
"step": 27500
},
{
"epoch": 0.46795353889863794,
"grad_norm": 20.047178268432617,
"learning_rate": 2.6839133732198535e-05,
"loss": 3.3395,
"step": 28000
},
{
"epoch": 0.46795353889863794,
"eval_loss": 3.41679048538208,
"eval_runtime": 22.2707,
"eval_samples_per_second": 200.353,
"eval_steps_per_second": 25.055,
"step": 28000
},
{
"epoch": 0.47630985209325644,
"grad_norm": 9.312335968017578,
"learning_rate": 2.6417797252886157e-05,
"loss": 3.3616,
"step": 28500
},
{
"epoch": 0.484666165287875,
"grad_norm": 10.994682312011719,
"learning_rate": 2.5996460773573776e-05,
"loss": 3.3719,
"step": 29000
},
{
"epoch": 0.484666165287875,
"eval_loss": 3.4018924236297607,
"eval_runtime": 22.2565,
"eval_samples_per_second": 200.481,
"eval_steps_per_second": 25.071,
"step": 29000
},
{
"epoch": 0.49302247848249353,
"grad_norm": 13.464505195617676,
"learning_rate": 2.5575124294261398e-05,
"loss": 3.3312,
"step": 29500
},
{
"epoch": 0.5013787916771121,
"grad_norm": 12.18619441986084,
"learning_rate": 2.515378781494902e-05,
"loss": 3.386,
"step": 30000
},
{
"epoch": 0.5013787916771121,
"eval_loss": 3.3899354934692383,
"eval_runtime": 22.2658,
"eval_samples_per_second": 200.397,
"eval_steps_per_second": 25.061,
"step": 30000
},
{
"epoch": 0.5097351048717306,
"grad_norm": 14.552848815917969,
"learning_rate": 2.4732451335636643e-05,
"loss": 3.3377,
"step": 30500
},
{
"epoch": 0.5180914180663492,
"grad_norm": 15.032088279724121,
"learning_rate": 2.4311114856324262e-05,
"loss": 3.3131,
"step": 31000
},
{
"epoch": 0.5180914180663492,
"eval_loss": 3.378127336502075,
"eval_runtime": 22.2423,
"eval_samples_per_second": 200.609,
"eval_steps_per_second": 25.087,
"step": 31000
},
{
"epoch": 0.5264477312609677,
"grad_norm": 14.666757583618164,
"learning_rate": 2.388977837701188e-05,
"loss": 3.3457,
"step": 31500
},
{
"epoch": 0.5348040444555862,
"grad_norm": 11.800482749938965,
"learning_rate": 2.3468441897699503e-05,
"loss": 3.3192,
"step": 32000
},
{
"epoch": 0.5348040444555862,
"eval_loss": 3.3670458793640137,
"eval_runtime": 22.256,
"eval_samples_per_second": 200.485,
"eval_steps_per_second": 25.072,
"step": 32000
},
{
"epoch": 0.5431603576502048,
"grad_norm": 10.835103034973145,
"learning_rate": 2.3047105418387125e-05,
"loss": 3.3235,
"step": 32500
},
{
"epoch": 0.5515166708448233,
"grad_norm": 12.06092357635498,
"learning_rate": 2.2625768939074744e-05,
"loss": 3.2969,
"step": 33000
},
{
"epoch": 0.5515166708448233,
"eval_loss": 3.356658935546875,
"eval_runtime": 22.2404,
"eval_samples_per_second": 200.626,
"eval_steps_per_second": 25.089,
"step": 33000
},
{
"epoch": 0.5598729840394417,
"grad_norm": 15.398877143859863,
"learning_rate": 2.2204432459762367e-05,
"loss": 3.3181,
"step": 33500
},
{
"epoch": 0.5682292972340603,
"grad_norm": 10.425477027893066,
"learning_rate": 2.178309598044999e-05,
"loss": 3.3202,
"step": 34000
},
{
"epoch": 0.5682292972340603,
"eval_loss": 3.34324312210083,
"eval_runtime": 22.2237,
"eval_samples_per_second": 200.777,
"eval_steps_per_second": 25.108,
"step": 34000
},
{
"epoch": 0.5765856104286788,
"grad_norm": 13.118115425109863,
"learning_rate": 2.136175950113761e-05,
"loss": 3.3028,
"step": 34500
},
{
"epoch": 0.5849419236232974,
"grad_norm": 8.235157012939453,
"learning_rate": 2.0941265694783854e-05,
"loss": 3.2403,
"step": 35000
},
{
"epoch": 0.5849419236232974,
"eval_loss": 3.3430681228637695,
"eval_runtime": 22.2974,
"eval_samples_per_second": 200.113,
"eval_steps_per_second": 25.025,
"step": 35000
},
{
"epoch": 0.5932982368179159,
"grad_norm": 15.389208793640137,
"learning_rate": 2.0519929215471476e-05,
"loss": 3.3105,
"step": 35500
},
{
"epoch": 0.6016545500125344,
"grad_norm": 12.708732604980469,
"learning_rate": 2.0098592736159098e-05,
"loss": 3.2775,
"step": 36000
},
{
"epoch": 0.6016545500125344,
"eval_loss": 3.3276991844177246,
"eval_runtime": 22.2643,
"eval_samples_per_second": 200.411,
"eval_steps_per_second": 25.063,
"step": 36000
},
{
"epoch": 0.610010863207153,
"grad_norm": 13.642451286315918,
"learning_rate": 1.9677256256846717e-05,
"loss": 3.2902,
"step": 36500
},
{
"epoch": 0.6183671764017715,
"grad_norm": 12.606600761413574,
"learning_rate": 1.9256762450492966e-05,
"loss": 3.271,
"step": 37000
},
{
"epoch": 0.6183671764017715,
"eval_loss": 3.3122496604919434,
"eval_runtime": 22.2804,
"eval_samples_per_second": 200.266,
"eval_steps_per_second": 25.044,
"step": 37000
},
{
"epoch": 0.6267234895963901,
"grad_norm": 11.484159469604492,
"learning_rate": 1.8835425971180585e-05,
"loss": 3.2833,
"step": 37500
},
{
"epoch": 0.6350798027910086,
"grad_norm": 12.317131996154785,
"learning_rate": 1.8414089491868204e-05,
"loss": 3.2848,
"step": 38000
},
{
"epoch": 0.6350798027910086,
"eval_loss": 3.3035213947296143,
"eval_runtime": 22.2937,
"eval_samples_per_second": 200.147,
"eval_steps_per_second": 25.03,
"step": 38000
},
{
"epoch": 0.6434361159856271,
"grad_norm": 11.45077896118164,
"learning_rate": 1.7992753012555827e-05,
"loss": 3.202,
"step": 38500
},
{
"epoch": 0.6517924291802457,
"grad_norm": 12.859657287597656,
"learning_rate": 1.7572259206202076e-05,
"loss": 3.2376,
"step": 39000
},
{
"epoch": 0.6517924291802457,
"eval_loss": 3.2956559658050537,
"eval_runtime": 22.3804,
"eval_samples_per_second": 199.371,
"eval_steps_per_second": 24.933,
"step": 39000
},
{
"epoch": 0.6601487423748642,
"grad_norm": 14.472012519836426,
"learning_rate": 1.7150922726889695e-05,
"loss": 3.1924,
"step": 39500
},
{
"epoch": 0.6685050555694827,
"grad_norm": 13.051079750061035,
"learning_rate": 1.673042892053594e-05,
"loss": 3.2598,
"step": 40000
},
{
"epoch": 0.6685050555694827,
"eval_loss": 3.2878499031066895,
"eval_runtime": 22.2464,
"eval_samples_per_second": 200.572,
"eval_steps_per_second": 25.083,
"step": 40000
},
{
"epoch": 0.6768613687641013,
"grad_norm": 15.44560718536377,
"learning_rate": 1.6309092441223563e-05,
"loss": 3.1978,
"step": 40500
},
{
"epoch": 0.6852176819587198,
"grad_norm": 16.988996505737305,
"learning_rate": 1.588775596191118e-05,
"loss": 3.2247,
"step": 41000
},
{
"epoch": 0.6852176819587198,
"eval_loss": 3.279550313949585,
"eval_runtime": 22.2386,
"eval_samples_per_second": 200.642,
"eval_steps_per_second": 25.091,
"step": 41000
},
{
"epoch": 0.6935739951533384,
"grad_norm": 8.293917655944824,
"learning_rate": 1.5466419482598804e-05,
"loss": 3.1682,
"step": 41500
},
{
"epoch": 0.7019303083479569,
"grad_norm": 10.755880355834961,
"learning_rate": 1.5045925676245051e-05,
"loss": 3.1849,
"step": 42000
},
{
"epoch": 0.7019303083479569,
"eval_loss": 3.2791192531585693,
"eval_runtime": 22.2554,
"eval_samples_per_second": 200.491,
"eval_steps_per_second": 25.073,
"step": 42000
},
{
"epoch": 0.7102866215425754,
"grad_norm": 17.822643280029297,
"learning_rate": 1.462458919693267e-05,
"loss": 3.1714,
"step": 42500
},
{
"epoch": 0.718642934737194,
"grad_norm": 18.230485916137695,
"learning_rate": 1.4203252717620291e-05,
"loss": 3.2112,
"step": 43000
},
{
"epoch": 0.718642934737194,
"eval_loss": 3.260193109512329,
"eval_runtime": 22.2518,
"eval_samples_per_second": 200.523,
"eval_steps_per_second": 25.077,
"step": 43000
},
{
"epoch": 0.7269992479318125,
"grad_norm": 13.363430976867676,
"learning_rate": 1.3781916238307913e-05,
"loss": 3.1655,
"step": 43500
},
{
"epoch": 0.735355561126431,
"grad_norm": 11.570181846618652,
"learning_rate": 1.3360579758995534e-05,
"loss": 3.174,
"step": 44000
},
{
"epoch": 0.735355561126431,
"eval_loss": 3.2490386962890625,
"eval_runtime": 22.2683,
"eval_samples_per_second": 200.374,
"eval_steps_per_second": 25.058,
"step": 44000
},
{
"epoch": 0.7437118743210496,
"grad_norm": 19.80602264404297,
"learning_rate": 1.2939243279683155e-05,
"loss": 3.1987,
"step": 44500
},
{
"epoch": 0.7520681875156681,
"grad_norm": 10.821731567382812,
"learning_rate": 1.2518749473329402e-05,
"loss": 3.1799,
"step": 45000
},
{
"epoch": 0.7520681875156681,
"eval_loss": 3.240847587585449,
"eval_runtime": 22.2794,
"eval_samples_per_second": 200.275,
"eval_steps_per_second": 25.046,
"step": 45000
},
{
"epoch": 0.7604245007102867,
"grad_norm": 16.301612854003906,
"learning_rate": 1.2097412994017023e-05,
"loss": 3.2029,
"step": 45500
},
{
"epoch": 0.7687808139049052,
"grad_norm": 14.699359893798828,
"learning_rate": 1.1676076514704643e-05,
"loss": 3.1752,
"step": 46000
},
{
"epoch": 0.7687808139049052,
"eval_loss": 3.233914852142334,
"eval_runtime": 22.269,
"eval_samples_per_second": 200.369,
"eval_steps_per_second": 25.057,
"step": 46000
},
{
"epoch": 0.7771371270995237,
"grad_norm": 15.696563720703125,
"learning_rate": 1.1254740035392266e-05,
"loss": 3.132,
"step": 46500
},
{
"epoch": 0.7854934402941423,
"grad_norm": 13.062487602233887,
"learning_rate": 1.0833403556079886e-05,
"loss": 3.131,
"step": 47000
},
{
"epoch": 0.7854934402941423,
"eval_loss": 3.2280752658843994,
"eval_runtime": 22.2955,
"eval_samples_per_second": 200.13,
"eval_steps_per_second": 25.027,
"step": 47000
},
{
"epoch": 0.7938497534887607,
"grad_norm": 18.67305564880371,
"learning_rate": 1.0412909749726132e-05,
"loss": 3.1571,
"step": 47500
},
{
"epoch": 0.8022060666833793,
"grad_norm": 10.377827644348145,
"learning_rate": 9.992415943372378e-06,
"loss": 3.181,
"step": 48000
},
{
"epoch": 0.8022060666833793,
"eval_loss": 3.2206084728240967,
"eval_runtime": 22.2943,
"eval_samples_per_second": 200.141,
"eval_steps_per_second": 25.029,
"step": 48000
},
{
"epoch": 0.8105623798779978,
"grad_norm": 12.836233139038086,
"learning_rate": 9.571922137018624e-06,
"loss": 3.139,
"step": 48500
},
{
"epoch": 0.8189186930726163,
"grad_norm": 11.736408233642578,
"learning_rate": 9.150585657706244e-06,
"loss": 3.0932,
"step": 49000
},
{
"epoch": 0.8189186930726163,
"eval_loss": 3.2135069370269775,
"eval_runtime": 22.2506,
"eval_samples_per_second": 200.534,
"eval_steps_per_second": 25.078,
"step": 49000
},
{
"epoch": 0.8272750062672349,
"grad_norm": 16.016298294067383,
"learning_rate": 8.729249178393865e-06,
"loss": 3.1634,
"step": 49500
},
{
"epoch": 0.8356313194618534,
"grad_norm": 10.488819122314453,
"learning_rate": 8.307912699081487e-06,
"loss": 3.1376,
"step": 50000
},
{
"epoch": 0.8356313194618534,
"eval_loss": 3.2051162719726562,
"eval_runtime": 22.294,
"eval_samples_per_second": 200.144,
"eval_steps_per_second": 25.029,
"step": 50000
},
{
"epoch": 0.8439876326564719,
"grad_norm": 16.168071746826172,
"learning_rate": 7.886576219769108e-06,
"loss": 3.1121,
"step": 50500
},
{
"epoch": 0.8523439458510905,
"grad_norm": 19.903099060058594,
"learning_rate": 7.465239740456729e-06,
"loss": 3.1084,
"step": 51000
},
{
"epoch": 0.8523439458510905,
"eval_loss": 3.198310375213623,
"eval_runtime": 22.3049,
"eval_samples_per_second": 200.046,
"eval_steps_per_second": 25.017,
"step": 51000
},
{
"epoch": 0.860700259045709,
"grad_norm": 12.082676887512207,
"learning_rate": 7.043903261144351e-06,
"loss": 3.0957,
"step": 51500
},
{
"epoch": 0.8690565722403276,
"grad_norm": 11.764552116394043,
"learning_rate": 6.622566781831971e-06,
"loss": 3.099,
"step": 52000
},
{
"epoch": 0.8690565722403276,
"eval_loss": 3.193253993988037,
"eval_runtime": 22.2469,
"eval_samples_per_second": 200.567,
"eval_steps_per_second": 25.082,
"step": 52000
},
{
"epoch": 0.8774128854349461,
"grad_norm": 12.482972145080566,
"learning_rate": 6.201230302519592e-06,
"loss": 3.0779,
"step": 52500
},
{
"epoch": 0.8857691986295646,
"grad_norm": 14.11436939239502,
"learning_rate": 5.7798938232072135e-06,
"loss": 3.1278,
"step": 53000
},
{
"epoch": 0.8857691986295646,
"eval_loss": 3.1867904663085938,
"eval_runtime": 22.2664,
"eval_samples_per_second": 200.392,
"eval_steps_per_second": 25.06,
"step": 53000
},
{
"epoch": 0.8941255118241832,
"grad_norm": 19.69700813293457,
"learning_rate": 5.358557343894835e-06,
"loss": 3.0968,
"step": 53500
},
{
"epoch": 0.9024818250188017,
"grad_norm": 14.537339210510254,
"learning_rate": 4.937220864582456e-06,
"loss": 3.1436,
"step": 54000
},
{
"epoch": 0.9024818250188017,
"eval_loss": 3.180774688720703,
"eval_runtime": 22.293,
"eval_samples_per_second": 200.152,
"eval_steps_per_second": 25.03,
"step": 54000
},
{
"epoch": 0.9108381382134202,
"grad_norm": 16.117996215820312,
"learning_rate": 4.515884385270077e-06,
"loss": 3.1288,
"step": 54500
},
{
"epoch": 0.9191944514080388,
"grad_norm": 12.458276748657227,
"learning_rate": 4.094547905957698e-06,
"loss": 3.0763,
"step": 55000
},
{
"epoch": 0.9191944514080388,
"eval_loss": 3.175370216369629,
"eval_runtime": 22.253,
"eval_samples_per_second": 200.513,
"eval_steps_per_second": 25.075,
"step": 55000
},
{
"epoch": 0.9275507646026573,
"grad_norm": 14.115385055541992,
"learning_rate": 3.6732114266453192e-06,
"loss": 3.0642,
"step": 55500
},
{
"epoch": 0.9359070777972759,
"grad_norm": 19.65464210510254,
"learning_rate": 3.2518749473329403e-06,
"loss": 3.1248,
"step": 56000
},
{
"epoch": 0.9359070777972759,
"eval_loss": 3.1690962314605713,
"eval_runtime": 22.3258,
"eval_samples_per_second": 199.858,
"eval_steps_per_second": 24.993,
"step": 56000
},
{
"epoch": 0.9442633909918944,
"grad_norm": 11.953753471374512,
"learning_rate": 2.831381140979186e-06,
"loss": 3.1361,
"step": 56500
},
{
"epoch": 0.9526197041865129,
"grad_norm": 10.821110725402832,
"learning_rate": 2.4108873346254323e-06,
"loss": 3.0418,
"step": 57000
},
{
"epoch": 0.9526197041865129,
"eval_loss": 3.164776563644409,
"eval_runtime": 22.2464,
"eval_samples_per_second": 200.572,
"eval_steps_per_second": 25.083,
"step": 57000
},
{
"epoch": 0.9609760173811315,
"grad_norm": 11.476717948913574,
"learning_rate": 1.9895508553130533e-06,
"loss": 3.0504,
"step": 57500
},
{
"epoch": 0.96933233057575,
"grad_norm": 10.973363876342773,
"learning_rate": 1.5682143760006742e-06,
"loss": 3.0755,
"step": 58000
},
{
"epoch": 0.96933233057575,
"eval_loss": 3.1603705883026123,
"eval_runtime": 22.272,
"eval_samples_per_second": 200.341,
"eval_steps_per_second": 25.054,
"step": 58000
},
{
"epoch": 0.9776886437703685,
"grad_norm": 10.836787223815918,
"learning_rate": 1.1468778966882954e-06,
"loss": 3.1001,
"step": 58500
},
{
"epoch": 0.9860449569649871,
"grad_norm": 13.901703834533691,
"learning_rate": 7.255414173759165e-07,
"loss": 3.0633,
"step": 59000
},
{
"epoch": 0.9860449569649871,
"eval_loss": 3.1587648391723633,
"eval_runtime": 22.3994,
"eval_samples_per_second": 199.202,
"eval_steps_per_second": 24.911,
"step": 59000
},
{
"epoch": 0.9944012701596056,
"grad_norm": 19.022567749023438,
"learning_rate": 3.0420493806353753e-07,
"loss": 3.0751,
"step": 59500
}
],
"logging_steps": 500,
"max_steps": 59835,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.943723113325527e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}