{ "best_metric": 3.1603705883026123, "best_model_checkpoint": "./models/full-finetuning/LLaMmlein_120M/checkpoint-58000", "epoch": 1.0, "eval_steps": 1000, "global_step": 59835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008356313194618534, "grad_norm": 40.14425277709961, "learning_rate": 4.96e-05, "loss": 4.8477, "step": 500 }, { "epoch": 0.016712626389237067, "grad_norm": 22.107275009155273, "learning_rate": 4.958203421252212e-05, "loss": 4.4466, "step": 1000 }, { "epoch": 0.016712626389237067, "eval_loss": 4.3118462562561035, "eval_runtime": 22.0404, "eval_samples_per_second": 202.446, "eval_steps_per_second": 25.317, "step": 1000 }, { "epoch": 0.025068939583855605, "grad_norm": 19.609399795532227, "learning_rate": 4.916069773320974e-05, "loss": 4.2986, "step": 1500 }, { "epoch": 0.033425252778474135, "grad_norm": 18.64438247680664, "learning_rate": 4.873936125389736e-05, "loss": 4.2295, "step": 2000 }, { "epoch": 0.033425252778474135, "eval_loss": 4.121812343597412, "eval_runtime": 22.1428, "eval_samples_per_second": 201.51, "eval_steps_per_second": 25.2, "step": 2000 }, { "epoch": 0.04178156597309267, "grad_norm": 19.098369598388672, "learning_rate": 4.8318024774584986e-05, "loss": 4.1165, "step": 2500 }, { "epoch": 0.05013787916771121, "grad_norm": 17.83785629272461, "learning_rate": 4.789668829527261e-05, "loss": 4.029, "step": 3000 }, { "epoch": 0.05013787916771121, "eval_loss": 4.040452480316162, "eval_runtime": 22.088, "eval_samples_per_second": 202.01, "eval_steps_per_second": 25.263, "step": 3000 }, { "epoch": 0.05849419236232974, "grad_norm": 14.394288063049316, "learning_rate": 4.747535181596023e-05, "loss": 4.0701, "step": 3500 }, { "epoch": 0.06685050555694827, "grad_norm": 17.005945205688477, "learning_rate": 4.705401533664785e-05, "loss": 4.0239, "step": 4000 }, { "epoch": 0.06685050555694827, "eval_loss": 3.948943853378296, "eval_runtime": 22.0828, "eval_samples_per_second": 202.058, "eval_steps_per_second": 25.269, "step": 4000 }, { "epoch": 0.0752068187515668, "grad_norm": 20.297489166259766, "learning_rate": 4.663267885733547e-05, "loss": 3.9704, "step": 4500 }, { "epoch": 0.08356313194618534, "grad_norm": 17.280521392822266, "learning_rate": 4.621134237802309e-05, "loss": 3.9118, "step": 5000 }, { "epoch": 0.08356313194618534, "eval_loss": 3.891711950302124, "eval_runtime": 22.1454, "eval_samples_per_second": 201.487, "eval_steps_per_second": 25.197, "step": 5000 }, { "epoch": 0.09191944514080387, "grad_norm": 18.847597122192383, "learning_rate": 4.579000589871071e-05, "loss": 3.9024, "step": 5500 }, { "epoch": 0.10027575833542242, "grad_norm": 14.654472351074219, "learning_rate": 4.5368669419398335e-05, "loss": 3.8479, "step": 6000 }, { "epoch": 0.10027575833542242, "eval_loss": 3.8378305435180664, "eval_runtime": 22.1051, "eval_samples_per_second": 201.854, "eval_steps_per_second": 25.243, "step": 6000 }, { "epoch": 0.10863207153004095, "grad_norm": 18.267274856567383, "learning_rate": 4.494817561304458e-05, "loss": 3.8116, "step": 6500 }, { "epoch": 0.11698838472465949, "grad_norm": 13.313653945922852, "learning_rate": 4.45268391337322e-05, "loss": 3.8592, "step": 7000 }, { "epoch": 0.11698838472465949, "eval_loss": 3.8072171211242676, "eval_runtime": 22.2351, "eval_samples_per_second": 200.674, "eval_steps_per_second": 25.095, "step": 7000 }, { "epoch": 0.12534469791927802, "grad_norm": 16.960010528564453, "learning_rate": 4.410550265441982e-05, "loss": 3.8291, "step": 7500 }, { "epoch": 0.13370101111389654, "grad_norm": 15.129623413085938, "learning_rate": 4.3684166175107444e-05, "loss": 3.7697, "step": 8000 }, { "epoch": 0.13370101111389654, "eval_loss": 3.777130126953125, "eval_runtime": 22.2765, "eval_samples_per_second": 200.3, "eval_steps_per_second": 25.049, "step": 8000 }, { "epoch": 0.1420573243085151, "grad_norm": 18.825756072998047, "learning_rate": 4.326535771467094e-05, "loss": 3.7714, "step": 8500 }, { "epoch": 0.1504136375031336, "grad_norm": 13.93099308013916, "learning_rate": 4.284402123535856e-05, "loss": 3.7719, "step": 9000 }, { "epoch": 0.1504136375031336, "eval_loss": 3.747012138366699, "eval_runtime": 22.2951, "eval_samples_per_second": 200.134, "eval_steps_per_second": 25.028, "step": 9000 }, { "epoch": 0.15876995069775215, "grad_norm": 12.885889053344727, "learning_rate": 4.2422684756046185e-05, "loss": 3.7127, "step": 9500 }, { "epoch": 0.16712626389237067, "grad_norm": 15.362942695617676, "learning_rate": 4.200134827673381e-05, "loss": 3.7135, "step": 10000 }, { "epoch": 0.16712626389237067, "eval_loss": 3.7190206050872803, "eval_runtime": 22.2344, "eval_samples_per_second": 200.68, "eval_steps_per_second": 25.096, "step": 10000 }, { "epoch": 0.17548257708698922, "grad_norm": 12.432964324951172, "learning_rate": 4.158001179742142e-05, "loss": 3.6715, "step": 10500 }, { "epoch": 0.18383889028160774, "grad_norm": 25.97146987915039, "learning_rate": 4.115867531810904e-05, "loss": 3.6599, "step": 11000 }, { "epoch": 0.18383889028160774, "eval_loss": 3.6818652153015137, "eval_runtime": 22.2972, "eval_samples_per_second": 200.115, "eval_steps_per_second": 25.026, "step": 11000 }, { "epoch": 0.1921952034762263, "grad_norm": 21.378082275390625, "learning_rate": 4.073733883879666e-05, "loss": 3.6754, "step": 11500 }, { "epoch": 0.20055151667084484, "grad_norm": 11.59192943572998, "learning_rate": 4.031600235948428e-05, "loss": 3.669, "step": 12000 }, { "epoch": 0.20055151667084484, "eval_loss": 3.661188840866089, "eval_runtime": 22.2553, "eval_samples_per_second": 200.492, "eval_steps_per_second": 25.073, "step": 12000 }, { "epoch": 0.20890782986546336, "grad_norm": 17.75707244873047, "learning_rate": 3.9894665880171905e-05, "loss": 3.6794, "step": 12500 }, { "epoch": 0.2172641430600819, "grad_norm": 10.825678825378418, "learning_rate": 3.947332940085953e-05, "loss": 3.6113, "step": 13000 }, { "epoch": 0.2172641430600819, "eval_loss": 3.6508119106292725, "eval_runtime": 22.2982, "eval_samples_per_second": 200.106, "eval_steps_per_second": 25.024, "step": 13000 }, { "epoch": 0.22562045625470042, "grad_norm": 10.04261302947998, "learning_rate": 3.905199292154715e-05, "loss": 3.5963, "step": 13500 }, { "epoch": 0.23397676944931897, "grad_norm": 13.926618576049805, "learning_rate": 3.863065644223477e-05, "loss": 3.5997, "step": 14000 }, { "epoch": 0.23397676944931897, "eval_loss": 3.6223905086517334, "eval_runtime": 22.2717, "eval_samples_per_second": 200.344, "eval_steps_per_second": 25.054, "step": 14000 }, { "epoch": 0.2423330826439375, "grad_norm": 11.00304889678955, "learning_rate": 3.820931996292239e-05, "loss": 3.5991, "step": 14500 }, { "epoch": 0.25068939583855604, "grad_norm": 16.099769592285156, "learning_rate": 3.778798348361001e-05, "loss": 3.6042, "step": 15000 }, { "epoch": 0.25068939583855604, "eval_loss": 3.5953731536865234, "eval_runtime": 22.2814, "eval_samples_per_second": 200.257, "eval_steps_per_second": 25.043, "step": 15000 }, { "epoch": 0.2590457090331746, "grad_norm": 12.459487915039062, "learning_rate": 3.736664700429763e-05, "loss": 3.5871, "step": 15500 }, { "epoch": 0.2674020222277931, "grad_norm": 16.979909896850586, "learning_rate": 3.6946153197943875e-05, "loss": 3.5238, "step": 16000 }, { "epoch": 0.2674020222277931, "eval_loss": 3.590113401412964, "eval_runtime": 22.2293, "eval_samples_per_second": 200.726, "eval_steps_per_second": 25.102, "step": 16000 }, { "epoch": 0.2757583354224116, "grad_norm": 23.20758056640625, "learning_rate": 3.65248167186315e-05, "loss": 3.5646, "step": 16500 }, { "epoch": 0.2841146486170302, "grad_norm": 18.35931396484375, "learning_rate": 3.610348023931912e-05, "loss": 3.5445, "step": 17000 }, { "epoch": 0.2841146486170302, "eval_loss": 3.563676595687866, "eval_runtime": 22.2571, "eval_samples_per_second": 200.475, "eval_steps_per_second": 25.071, "step": 17000 }, { "epoch": 0.2924709618116487, "grad_norm": 17.187950134277344, "learning_rate": 3.568214376000674e-05, "loss": 3.494, "step": 17500 }, { "epoch": 0.3008272750062672, "grad_norm": 15.331987380981445, "learning_rate": 3.5261649953652984e-05, "loss": 3.4913, "step": 18000 }, { "epoch": 0.3008272750062672, "eval_loss": 3.541306495666504, "eval_runtime": 22.2598, "eval_samples_per_second": 200.451, "eval_steps_per_second": 25.068, "step": 18000 }, { "epoch": 0.30918358820088576, "grad_norm": 16.340852737426758, "learning_rate": 3.484031347434061e-05, "loss": 3.4969, "step": 18500 }, { "epoch": 0.3175399013955043, "grad_norm": 12.265207290649414, "learning_rate": 3.441897699502823e-05, "loss": 3.4934, "step": 19000 }, { "epoch": 0.3175399013955043, "eval_loss": 3.520357847213745, "eval_runtime": 22.2447, "eval_samples_per_second": 200.587, "eval_steps_per_second": 25.085, "step": 19000 }, { "epoch": 0.32589621459012286, "grad_norm": 15.456232070922852, "learning_rate": 3.399764051571585e-05, "loss": 3.5013, "step": 19500 }, { "epoch": 0.33425252778474135, "grad_norm": 15.721699714660645, "learning_rate": 3.3576304036403474e-05, "loss": 3.4627, "step": 20000 }, { "epoch": 0.33425252778474135, "eval_loss": 3.5179378986358643, "eval_runtime": 22.2594, "eval_samples_per_second": 200.455, "eval_steps_per_second": 25.068, "step": 20000 }, { "epoch": 0.3426088409793599, "grad_norm": 12.118553161621094, "learning_rate": 3.3154967557091096e-05, "loss": 3.5006, "step": 20500 }, { "epoch": 0.35096515417397844, "grad_norm": 8.990864753723145, "learning_rate": 3.273447375073734e-05, "loss": 3.4367, "step": 21000 }, { "epoch": 0.35096515417397844, "eval_loss": 3.5118658542633057, "eval_runtime": 22.2415, "eval_samples_per_second": 200.616, "eval_steps_per_second": 25.088, "step": 21000 }, { "epoch": 0.359321467368597, "grad_norm": 9.9972562789917, "learning_rate": 3.231313727142496e-05, "loss": 3.4498, "step": 21500 }, { "epoch": 0.3676777805632155, "grad_norm": 10.996673583984375, "learning_rate": 3.189180079211258e-05, "loss": 3.4643, "step": 22000 }, { "epoch": 0.3676777805632155, "eval_loss": 3.483738899230957, "eval_runtime": 22.2582, "eval_samples_per_second": 200.465, "eval_steps_per_second": 25.069, "step": 22000 }, { "epoch": 0.37603409375783403, "grad_norm": 14.55636978149414, "learning_rate": 3.14704643128002e-05, "loss": 3.5215, "step": 22500 }, { "epoch": 0.3843904069524526, "grad_norm": 13.585105895996094, "learning_rate": 3.104912783348782e-05, "loss": 3.419, "step": 23000 }, { "epoch": 0.3843904069524526, "eval_loss": 3.47660231590271, "eval_runtime": 22.226, "eval_samples_per_second": 200.756, "eval_steps_per_second": 25.106, "step": 23000 }, { "epoch": 0.3927467201470711, "grad_norm": 11.853238105773926, "learning_rate": 3.062779135417544e-05, "loss": 3.4438, "step": 23500 }, { "epoch": 0.4011030333416897, "grad_norm": 13.06174373626709, "learning_rate": 3.020729754782169e-05, "loss": 3.4029, "step": 24000 }, { "epoch": 0.4011030333416897, "eval_loss": 3.4587268829345703, "eval_runtime": 22.2726, "eval_samples_per_second": 200.336, "eval_steps_per_second": 25.053, "step": 24000 }, { "epoch": 0.40945934653630817, "grad_norm": 16.874757766723633, "learning_rate": 2.9786803741467938e-05, "loss": 3.3971, "step": 24500 }, { "epoch": 0.4178156597309267, "grad_norm": 11.108474731445312, "learning_rate": 2.936546726215556e-05, "loss": 3.3574, "step": 25000 }, { "epoch": 0.4178156597309267, "eval_loss": 3.446179151535034, "eval_runtime": 22.3522, "eval_samples_per_second": 199.622, "eval_steps_per_second": 24.964, "step": 25000 }, { "epoch": 0.42617197292554526, "grad_norm": 12.936110496520996, "learning_rate": 2.8944130782843183e-05, "loss": 3.3829, "step": 25500 }, { "epoch": 0.4345282861201638, "grad_norm": 12.90854549407959, "learning_rate": 2.8522794303530802e-05, "loss": 3.4156, "step": 26000 }, { "epoch": 0.4345282861201638, "eval_loss": 3.44026517868042, "eval_runtime": 22.2661, "eval_samples_per_second": 200.394, "eval_steps_per_second": 25.061, "step": 26000 }, { "epoch": 0.4428845993147823, "grad_norm": 10.326555252075195, "learning_rate": 2.8101457824218424e-05, "loss": 3.3607, "step": 26500 }, { "epoch": 0.45124091250940085, "grad_norm": 12.372066497802734, "learning_rate": 2.7681806690823293e-05, "loss": 3.3836, "step": 27000 }, { "epoch": 0.45124091250940085, "eval_loss": 3.4253649711608887, "eval_runtime": 22.2507, "eval_samples_per_second": 200.533, "eval_steps_per_second": 25.078, "step": 27000 }, { "epoch": 0.4595972257040194, "grad_norm": 9.778299331665039, "learning_rate": 2.7260470211510912e-05, "loss": 3.3671, "step": 27500 }, { "epoch": 0.46795353889863794, "grad_norm": 20.047178268432617, "learning_rate": 2.6839133732198535e-05, "loss": 3.3395, "step": 28000 }, { "epoch": 0.46795353889863794, "eval_loss": 3.41679048538208, "eval_runtime": 22.2707, "eval_samples_per_second": 200.353, "eval_steps_per_second": 25.055, "step": 28000 }, { "epoch": 0.47630985209325644, "grad_norm": 9.312335968017578, "learning_rate": 2.6417797252886157e-05, "loss": 3.3616, "step": 28500 }, { "epoch": 0.484666165287875, "grad_norm": 10.994682312011719, "learning_rate": 2.5996460773573776e-05, "loss": 3.3719, "step": 29000 }, { "epoch": 0.484666165287875, "eval_loss": 3.4018924236297607, "eval_runtime": 22.2565, "eval_samples_per_second": 200.481, "eval_steps_per_second": 25.071, "step": 29000 }, { "epoch": 0.49302247848249353, "grad_norm": 13.464505195617676, "learning_rate": 2.5575124294261398e-05, "loss": 3.3312, "step": 29500 }, { "epoch": 0.5013787916771121, "grad_norm": 12.18619441986084, "learning_rate": 2.515378781494902e-05, "loss": 3.386, "step": 30000 }, { "epoch": 0.5013787916771121, "eval_loss": 3.3899354934692383, "eval_runtime": 22.2658, "eval_samples_per_second": 200.397, "eval_steps_per_second": 25.061, "step": 30000 }, { "epoch": 0.5097351048717306, "grad_norm": 14.552848815917969, "learning_rate": 2.4732451335636643e-05, "loss": 3.3377, "step": 30500 }, { "epoch": 0.5180914180663492, "grad_norm": 15.032088279724121, "learning_rate": 2.4311114856324262e-05, "loss": 3.3131, "step": 31000 }, { "epoch": 0.5180914180663492, "eval_loss": 3.378127336502075, "eval_runtime": 22.2423, "eval_samples_per_second": 200.609, "eval_steps_per_second": 25.087, "step": 31000 }, { "epoch": 0.5264477312609677, "grad_norm": 14.666757583618164, "learning_rate": 2.388977837701188e-05, "loss": 3.3457, "step": 31500 }, { "epoch": 0.5348040444555862, "grad_norm": 11.800482749938965, "learning_rate": 2.3468441897699503e-05, "loss": 3.3192, "step": 32000 }, { "epoch": 0.5348040444555862, "eval_loss": 3.3670458793640137, "eval_runtime": 22.256, "eval_samples_per_second": 200.485, "eval_steps_per_second": 25.072, "step": 32000 }, { "epoch": 0.5431603576502048, "grad_norm": 10.835103034973145, "learning_rate": 2.3047105418387125e-05, "loss": 3.3235, "step": 32500 }, { "epoch": 0.5515166708448233, "grad_norm": 12.06092357635498, "learning_rate": 2.2625768939074744e-05, "loss": 3.2969, "step": 33000 }, { "epoch": 0.5515166708448233, "eval_loss": 3.356658935546875, "eval_runtime": 22.2404, "eval_samples_per_second": 200.626, "eval_steps_per_second": 25.089, "step": 33000 }, { "epoch": 0.5598729840394417, "grad_norm": 15.398877143859863, "learning_rate": 2.2204432459762367e-05, "loss": 3.3181, "step": 33500 }, { "epoch": 0.5682292972340603, "grad_norm": 10.425477027893066, "learning_rate": 2.178309598044999e-05, "loss": 3.3202, "step": 34000 }, { "epoch": 0.5682292972340603, "eval_loss": 3.34324312210083, "eval_runtime": 22.2237, "eval_samples_per_second": 200.777, "eval_steps_per_second": 25.108, "step": 34000 }, { "epoch": 0.5765856104286788, "grad_norm": 13.118115425109863, "learning_rate": 2.136175950113761e-05, "loss": 3.3028, "step": 34500 }, { "epoch": 0.5849419236232974, "grad_norm": 8.235157012939453, "learning_rate": 2.0941265694783854e-05, "loss": 3.2403, "step": 35000 }, { "epoch": 0.5849419236232974, "eval_loss": 3.3430681228637695, "eval_runtime": 22.2974, "eval_samples_per_second": 200.113, "eval_steps_per_second": 25.025, "step": 35000 }, { "epoch": 0.5932982368179159, "grad_norm": 15.389208793640137, "learning_rate": 2.0519929215471476e-05, "loss": 3.3105, "step": 35500 }, { "epoch": 0.6016545500125344, "grad_norm": 12.708732604980469, "learning_rate": 2.0098592736159098e-05, "loss": 3.2775, "step": 36000 }, { "epoch": 0.6016545500125344, "eval_loss": 3.3276991844177246, "eval_runtime": 22.2643, "eval_samples_per_second": 200.411, "eval_steps_per_second": 25.063, "step": 36000 }, { "epoch": 0.610010863207153, "grad_norm": 13.642451286315918, "learning_rate": 1.9677256256846717e-05, "loss": 3.2902, "step": 36500 }, { "epoch": 0.6183671764017715, "grad_norm": 12.606600761413574, "learning_rate": 1.9256762450492966e-05, "loss": 3.271, "step": 37000 }, { "epoch": 0.6183671764017715, "eval_loss": 3.3122496604919434, "eval_runtime": 22.2804, "eval_samples_per_second": 200.266, "eval_steps_per_second": 25.044, "step": 37000 }, { "epoch": 0.6267234895963901, "grad_norm": 11.484159469604492, "learning_rate": 1.8835425971180585e-05, "loss": 3.2833, "step": 37500 }, { "epoch": 0.6350798027910086, "grad_norm": 12.317131996154785, "learning_rate": 1.8414089491868204e-05, "loss": 3.2848, "step": 38000 }, { "epoch": 0.6350798027910086, "eval_loss": 3.3035213947296143, "eval_runtime": 22.2937, "eval_samples_per_second": 200.147, "eval_steps_per_second": 25.03, "step": 38000 }, { "epoch": 0.6434361159856271, "grad_norm": 11.45077896118164, "learning_rate": 1.7992753012555827e-05, "loss": 3.202, "step": 38500 }, { "epoch": 0.6517924291802457, "grad_norm": 12.859657287597656, "learning_rate": 1.7572259206202076e-05, "loss": 3.2376, "step": 39000 }, { "epoch": 0.6517924291802457, "eval_loss": 3.2956559658050537, "eval_runtime": 22.3804, "eval_samples_per_second": 199.371, "eval_steps_per_second": 24.933, "step": 39000 }, { "epoch": 0.6601487423748642, "grad_norm": 14.472012519836426, "learning_rate": 1.7150922726889695e-05, "loss": 3.1924, "step": 39500 }, { "epoch": 0.6685050555694827, "grad_norm": 13.051079750061035, "learning_rate": 1.673042892053594e-05, "loss": 3.2598, "step": 40000 }, { "epoch": 0.6685050555694827, "eval_loss": 3.2878499031066895, "eval_runtime": 22.2464, "eval_samples_per_second": 200.572, "eval_steps_per_second": 25.083, "step": 40000 }, { "epoch": 0.6768613687641013, "grad_norm": 15.44560718536377, "learning_rate": 1.6309092441223563e-05, "loss": 3.1978, "step": 40500 }, { "epoch": 0.6852176819587198, "grad_norm": 16.988996505737305, "learning_rate": 1.588775596191118e-05, "loss": 3.2247, "step": 41000 }, { "epoch": 0.6852176819587198, "eval_loss": 3.279550313949585, "eval_runtime": 22.2386, "eval_samples_per_second": 200.642, "eval_steps_per_second": 25.091, "step": 41000 }, { "epoch": 0.6935739951533384, "grad_norm": 8.293917655944824, "learning_rate": 1.5466419482598804e-05, "loss": 3.1682, "step": 41500 }, { "epoch": 0.7019303083479569, "grad_norm": 10.755880355834961, "learning_rate": 1.5045925676245051e-05, "loss": 3.1849, "step": 42000 }, { "epoch": 0.7019303083479569, "eval_loss": 3.2791192531585693, "eval_runtime": 22.2554, "eval_samples_per_second": 200.491, "eval_steps_per_second": 25.073, "step": 42000 }, { "epoch": 0.7102866215425754, "grad_norm": 17.822643280029297, "learning_rate": 1.462458919693267e-05, "loss": 3.1714, "step": 42500 }, { "epoch": 0.718642934737194, "grad_norm": 18.230485916137695, "learning_rate": 1.4203252717620291e-05, "loss": 3.2112, "step": 43000 }, { "epoch": 0.718642934737194, "eval_loss": 3.260193109512329, "eval_runtime": 22.2518, "eval_samples_per_second": 200.523, "eval_steps_per_second": 25.077, "step": 43000 }, { "epoch": 0.7269992479318125, "grad_norm": 13.363430976867676, "learning_rate": 1.3781916238307913e-05, "loss": 3.1655, "step": 43500 }, { "epoch": 0.735355561126431, "grad_norm": 11.570181846618652, "learning_rate": 1.3360579758995534e-05, "loss": 3.174, "step": 44000 }, { "epoch": 0.735355561126431, "eval_loss": 3.2490386962890625, "eval_runtime": 22.2683, "eval_samples_per_second": 200.374, "eval_steps_per_second": 25.058, "step": 44000 }, { "epoch": 0.7437118743210496, "grad_norm": 19.80602264404297, "learning_rate": 1.2939243279683155e-05, "loss": 3.1987, "step": 44500 }, { "epoch": 0.7520681875156681, "grad_norm": 10.821731567382812, "learning_rate": 1.2518749473329402e-05, "loss": 3.1799, "step": 45000 }, { "epoch": 0.7520681875156681, "eval_loss": 3.240847587585449, "eval_runtime": 22.2794, "eval_samples_per_second": 200.275, "eval_steps_per_second": 25.046, "step": 45000 }, { "epoch": 0.7604245007102867, "grad_norm": 16.301612854003906, "learning_rate": 1.2097412994017023e-05, "loss": 3.2029, "step": 45500 }, { "epoch": 0.7687808139049052, "grad_norm": 14.699359893798828, "learning_rate": 1.1676076514704643e-05, "loss": 3.1752, "step": 46000 }, { "epoch": 0.7687808139049052, "eval_loss": 3.233914852142334, "eval_runtime": 22.269, "eval_samples_per_second": 200.369, "eval_steps_per_second": 25.057, "step": 46000 }, { "epoch": 0.7771371270995237, "grad_norm": 15.696563720703125, "learning_rate": 1.1254740035392266e-05, "loss": 3.132, "step": 46500 }, { "epoch": 0.7854934402941423, "grad_norm": 13.062487602233887, "learning_rate": 1.0833403556079886e-05, "loss": 3.131, "step": 47000 }, { "epoch": 0.7854934402941423, "eval_loss": 3.2280752658843994, "eval_runtime": 22.2955, "eval_samples_per_second": 200.13, "eval_steps_per_second": 25.027, "step": 47000 }, { "epoch": 0.7938497534887607, "grad_norm": 18.67305564880371, "learning_rate": 1.0412909749726132e-05, "loss": 3.1571, "step": 47500 }, { "epoch": 0.8022060666833793, "grad_norm": 10.377827644348145, "learning_rate": 9.992415943372378e-06, "loss": 3.181, "step": 48000 }, { "epoch": 0.8022060666833793, "eval_loss": 3.2206084728240967, "eval_runtime": 22.2943, "eval_samples_per_second": 200.141, "eval_steps_per_second": 25.029, "step": 48000 }, { "epoch": 0.8105623798779978, "grad_norm": 12.836233139038086, "learning_rate": 9.571922137018624e-06, "loss": 3.139, "step": 48500 }, { "epoch": 0.8189186930726163, "grad_norm": 11.736408233642578, "learning_rate": 9.150585657706244e-06, "loss": 3.0932, "step": 49000 }, { "epoch": 0.8189186930726163, "eval_loss": 3.2135069370269775, "eval_runtime": 22.2506, "eval_samples_per_second": 200.534, "eval_steps_per_second": 25.078, "step": 49000 }, { "epoch": 0.8272750062672349, "grad_norm": 16.016298294067383, "learning_rate": 8.729249178393865e-06, "loss": 3.1634, "step": 49500 }, { "epoch": 0.8356313194618534, "grad_norm": 10.488819122314453, "learning_rate": 8.307912699081487e-06, "loss": 3.1376, "step": 50000 }, { "epoch": 0.8356313194618534, "eval_loss": 3.2051162719726562, "eval_runtime": 22.294, "eval_samples_per_second": 200.144, "eval_steps_per_second": 25.029, "step": 50000 }, { "epoch": 0.8439876326564719, "grad_norm": 16.168071746826172, "learning_rate": 7.886576219769108e-06, "loss": 3.1121, "step": 50500 }, { "epoch": 0.8523439458510905, "grad_norm": 19.903099060058594, "learning_rate": 7.465239740456729e-06, "loss": 3.1084, "step": 51000 }, { "epoch": 0.8523439458510905, "eval_loss": 3.198310375213623, "eval_runtime": 22.3049, "eval_samples_per_second": 200.046, "eval_steps_per_second": 25.017, "step": 51000 }, { "epoch": 0.860700259045709, "grad_norm": 12.082676887512207, "learning_rate": 7.043903261144351e-06, "loss": 3.0957, "step": 51500 }, { "epoch": 0.8690565722403276, "grad_norm": 11.764552116394043, "learning_rate": 6.622566781831971e-06, "loss": 3.099, "step": 52000 }, { "epoch": 0.8690565722403276, "eval_loss": 3.193253993988037, "eval_runtime": 22.2469, "eval_samples_per_second": 200.567, "eval_steps_per_second": 25.082, "step": 52000 }, { "epoch": 0.8774128854349461, "grad_norm": 12.482972145080566, "learning_rate": 6.201230302519592e-06, "loss": 3.0779, "step": 52500 }, { "epoch": 0.8857691986295646, "grad_norm": 14.11436939239502, "learning_rate": 5.7798938232072135e-06, "loss": 3.1278, "step": 53000 }, { "epoch": 0.8857691986295646, "eval_loss": 3.1867904663085938, "eval_runtime": 22.2664, "eval_samples_per_second": 200.392, "eval_steps_per_second": 25.06, "step": 53000 }, { "epoch": 0.8941255118241832, "grad_norm": 19.69700813293457, "learning_rate": 5.358557343894835e-06, "loss": 3.0968, "step": 53500 }, { "epoch": 0.9024818250188017, "grad_norm": 14.537339210510254, "learning_rate": 4.937220864582456e-06, "loss": 3.1436, "step": 54000 }, { "epoch": 0.9024818250188017, "eval_loss": 3.180774688720703, "eval_runtime": 22.293, "eval_samples_per_second": 200.152, "eval_steps_per_second": 25.03, "step": 54000 }, { "epoch": 0.9108381382134202, "grad_norm": 16.117996215820312, "learning_rate": 4.515884385270077e-06, "loss": 3.1288, "step": 54500 }, { "epoch": 0.9191944514080388, "grad_norm": 12.458276748657227, "learning_rate": 4.094547905957698e-06, "loss": 3.0763, "step": 55000 }, { "epoch": 0.9191944514080388, "eval_loss": 3.175370216369629, "eval_runtime": 22.253, "eval_samples_per_second": 200.513, "eval_steps_per_second": 25.075, "step": 55000 }, { "epoch": 0.9275507646026573, "grad_norm": 14.115385055541992, "learning_rate": 3.6732114266453192e-06, "loss": 3.0642, "step": 55500 }, { "epoch": 0.9359070777972759, "grad_norm": 19.65464210510254, "learning_rate": 3.2518749473329403e-06, "loss": 3.1248, "step": 56000 }, { "epoch": 0.9359070777972759, "eval_loss": 3.1690962314605713, "eval_runtime": 22.3258, "eval_samples_per_second": 199.858, "eval_steps_per_second": 24.993, "step": 56000 }, { "epoch": 0.9442633909918944, "grad_norm": 11.953753471374512, "learning_rate": 2.831381140979186e-06, "loss": 3.1361, "step": 56500 }, { "epoch": 0.9526197041865129, "grad_norm": 10.821110725402832, "learning_rate": 2.4108873346254323e-06, "loss": 3.0418, "step": 57000 }, { "epoch": 0.9526197041865129, "eval_loss": 3.164776563644409, "eval_runtime": 22.2464, "eval_samples_per_second": 200.572, "eval_steps_per_second": 25.083, "step": 57000 }, { "epoch": 0.9609760173811315, "grad_norm": 11.476717948913574, "learning_rate": 1.9895508553130533e-06, "loss": 3.0504, "step": 57500 }, { "epoch": 0.96933233057575, "grad_norm": 10.973363876342773, "learning_rate": 1.5682143760006742e-06, "loss": 3.0755, "step": 58000 }, { "epoch": 0.96933233057575, "eval_loss": 3.1603705883026123, "eval_runtime": 22.272, "eval_samples_per_second": 200.341, "eval_steps_per_second": 25.054, "step": 58000 }, { "epoch": 0.9776886437703685, "grad_norm": 10.836787223815918, "learning_rate": 1.1468778966882954e-06, "loss": 3.1001, "step": 58500 }, { "epoch": 0.9860449569649871, "grad_norm": 13.901703834533691, "learning_rate": 7.255414173759165e-07, "loss": 3.0633, "step": 59000 }, { "epoch": 0.9860449569649871, "eval_loss": 3.1587648391723633, "eval_runtime": 22.3994, "eval_samples_per_second": 199.202, "eval_steps_per_second": 24.911, "step": 59000 }, { "epoch": 0.9944012701596056, "grad_norm": 19.022567749023438, "learning_rate": 3.0420493806353753e-07, "loss": 3.0751, "step": 59500 } ], "logging_steps": 500, "max_steps": 59835, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.943723113325527e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }