doiee's picture
Upload model checkpoint
7c820ec verified
{
"best_metric": 0.23812708258628845,
"best_model_checkpoint": "./model_outputs/checkpoint-9800",
"epoch": 1.9338753280580934,
"eval_steps": 100,
"global_step": 9800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019732817649032106,
"grad_norm": 0.579439103603363,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.8467,
"step": 100
},
{
"epoch": 0.019732817649032106,
"eval_loss": 1.6029163599014282,
"eval_runtime": 0.1393,
"eval_samples_per_second": 35.888,
"eval_steps_per_second": 7.178,
"step": 100
},
{
"epoch": 0.03946563529806421,
"grad_norm": 1.7087633609771729,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.4062,
"step": 200
},
{
"epoch": 0.03946563529806421,
"eval_loss": 1.2955422401428223,
"eval_runtime": 0.0895,
"eval_samples_per_second": 55.895,
"eval_steps_per_second": 11.179,
"step": 200
},
{
"epoch": 0.05919845294709632,
"grad_norm": 2.1446690559387207,
"learning_rate": 5e-05,
"loss": 1.2561,
"step": 300
},
{
"epoch": 0.05919845294709632,
"eval_loss": 1.14933443069458,
"eval_runtime": 0.0896,
"eval_samples_per_second": 55.787,
"eval_steps_per_second": 11.157,
"step": 300
},
{
"epoch": 0.07893127059612842,
"grad_norm": 1.9442692995071411,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1758,
"step": 400
},
{
"epoch": 0.07893127059612842,
"eval_loss": 1.070350170135498,
"eval_runtime": 0.0899,
"eval_samples_per_second": 55.607,
"eval_steps_per_second": 11.121,
"step": 400
},
{
"epoch": 0.09866408824516053,
"grad_norm": 1.6005765199661255,
"learning_rate": 8.333333333333334e-05,
"loss": 1.1199,
"step": 500
},
{
"epoch": 0.09866408824516053,
"eval_loss": 1.0037094354629517,
"eval_runtime": 0.0918,
"eval_samples_per_second": 54.446,
"eval_steps_per_second": 10.889,
"step": 500
},
{
"epoch": 0.11839690589419263,
"grad_norm": 1.4335505962371826,
"learning_rate": 0.0001,
"loss": 1.0773,
"step": 600
},
{
"epoch": 0.11839690589419263,
"eval_loss": 0.9590722918510437,
"eval_runtime": 0.0897,
"eval_samples_per_second": 55.743,
"eval_steps_per_second": 11.149,
"step": 600
},
{
"epoch": 0.13812972354322473,
"grad_norm": 1.535965085029602,
"learning_rate": 0.00011666666666666668,
"loss": 1.0446,
"step": 700
},
{
"epoch": 0.13812972354322473,
"eval_loss": 0.9230692982673645,
"eval_runtime": 0.0893,
"eval_samples_per_second": 55.988,
"eval_steps_per_second": 11.198,
"step": 700
},
{
"epoch": 0.15786254119225684,
"grad_norm": 1.1867926120758057,
"learning_rate": 0.00013333333333333334,
"loss": 1.0234,
"step": 800
},
{
"epoch": 0.15786254119225684,
"eval_loss": 0.8973449468612671,
"eval_runtime": 0.0898,
"eval_samples_per_second": 55.673,
"eval_steps_per_second": 11.135,
"step": 800
},
{
"epoch": 0.17759535884128894,
"grad_norm": 0.9903791546821594,
"learning_rate": 0.00015000000000000001,
"loss": 1.0057,
"step": 900
},
{
"epoch": 0.17759535884128894,
"eval_loss": 0.8900602459907532,
"eval_runtime": 0.0911,
"eval_samples_per_second": 54.888,
"eval_steps_per_second": 10.978,
"step": 900
},
{
"epoch": 0.19732817649032106,
"grad_norm": 1.0672552585601807,
"learning_rate": 0.0001666666666666667,
"loss": 0.9416,
"step": 1000
},
{
"epoch": 0.19732817649032106,
"eval_loss": 0.7900083065032959,
"eval_runtime": 0.0909,
"eval_samples_per_second": 54.981,
"eval_steps_per_second": 10.996,
"step": 1000
},
{
"epoch": 0.21706099413935315,
"grad_norm": 1.1501110792160034,
"learning_rate": 0.00018333333333333334,
"loss": 0.7217,
"step": 1100
},
{
"epoch": 0.21706099413935315,
"eval_loss": 0.5064653754234314,
"eval_runtime": 0.089,
"eval_samples_per_second": 56.189,
"eval_steps_per_second": 11.238,
"step": 1100
},
{
"epoch": 0.23679381178838527,
"grad_norm": 0.7229278683662415,
"learning_rate": 0.0002,
"loss": 0.5166,
"step": 1200
},
{
"epoch": 0.23679381178838527,
"eval_loss": 0.42082151770591736,
"eval_runtime": 0.0889,
"eval_samples_per_second": 56.242,
"eval_steps_per_second": 11.248,
"step": 1200
},
{
"epoch": 0.2565266294374174,
"grad_norm": 0.753103494644165,
"learning_rate": 0.00019993817941631932,
"loss": 0.4558,
"step": 1300
},
{
"epoch": 0.2565266294374174,
"eval_loss": 0.39703065156936646,
"eval_runtime": 0.09,
"eval_samples_per_second": 55.574,
"eval_steps_per_second": 11.115,
"step": 1300
},
{
"epoch": 0.27625944708644945,
"grad_norm": 0.4141121506690979,
"learning_rate": 0.00019975279410096856,
"loss": 0.4347,
"step": 1400
},
{
"epoch": 0.27625944708644945,
"eval_loss": 0.3718082904815674,
"eval_runtime": 0.0891,
"eval_samples_per_second": 56.096,
"eval_steps_per_second": 11.219,
"step": 1400
},
{
"epoch": 0.29599226473548157,
"grad_norm": 0.6031398773193359,
"learning_rate": 0.00019944407326651575,
"loss": 0.4217,
"step": 1500
},
{
"epoch": 0.29599226473548157,
"eval_loss": 0.3526321351528168,
"eval_runtime": 0.0904,
"eval_samples_per_second": 55.331,
"eval_steps_per_second": 11.066,
"step": 1500
},
{
"epoch": 0.3157250823845137,
"grad_norm": 0.5605940222740173,
"learning_rate": 0.0001990123986190045,
"loss": 0.4128,
"step": 1600
},
{
"epoch": 0.3157250823845137,
"eval_loss": 0.35304129123687744,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.473,
"eval_steps_per_second": 11.095,
"step": 1600
},
{
"epoch": 0.3354579000335458,
"grad_norm": 0.5206743478775024,
"learning_rate": 0.00019845830388600822,
"loss": 0.4054,
"step": 1700
},
{
"epoch": 0.3354579000335458,
"eval_loss": 0.34606409072875977,
"eval_runtime": 0.0913,
"eval_samples_per_second": 54.794,
"eval_steps_per_second": 10.959,
"step": 1700
},
{
"epoch": 0.3551907176825779,
"grad_norm": 0.46870893239974976,
"learning_rate": 0.000197782474156723,
"loss": 0.4004,
"step": 1800
},
{
"epoch": 0.3551907176825779,
"eval_loss": 0.33528828620910645,
"eval_runtime": 0.0919,
"eval_samples_per_second": 54.394,
"eval_steps_per_second": 10.879,
"step": 1800
},
{
"epoch": 0.37492353533161,
"grad_norm": 0.4280480444431305,
"learning_rate": 0.0001969857450349156,
"loss": 0.398,
"step": 1900
},
{
"epoch": 0.37492353533161,
"eval_loss": 0.3284319043159485,
"eval_runtime": 0.089,
"eval_samples_per_second": 56.157,
"eval_steps_per_second": 11.231,
"step": 1900
},
{
"epoch": 0.3946563529806421,
"grad_norm": 0.37423691153526306,
"learning_rate": 0.00019606910160577286,
"loss": 0.3932,
"step": 2000
},
{
"epoch": 0.3946563529806421,
"eval_loss": 0.32890015840530396,
"eval_runtime": 0.0894,
"eval_samples_per_second": 55.933,
"eval_steps_per_second": 11.187,
"step": 2000
},
{
"epoch": 0.41438917062967423,
"grad_norm": 0.5279616117477417,
"learning_rate": 0.00019503367721793112,
"loss": 0.3902,
"step": 2100
},
{
"epoch": 0.41438917062967423,
"eval_loss": 0.3163706958293915,
"eval_runtime": 0.0913,
"eval_samples_per_second": 54.779,
"eval_steps_per_second": 10.956,
"step": 2100
},
{
"epoch": 0.4341219882787063,
"grad_norm": 0.5170876383781433,
"learning_rate": 0.00019388075208219072,
"loss": 0.3854,
"step": 2200
},
{
"epoch": 0.4341219882787063,
"eval_loss": 0.3095114827156067,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.269,
"eval_steps_per_second": 11.054,
"step": 2200
},
{
"epoch": 0.4538548059277384,
"grad_norm": 0.5783583521842957,
"learning_rate": 0.00019261175168864823,
"loss": 0.3845,
"step": 2300
},
{
"epoch": 0.4538548059277384,
"eval_loss": 0.31021368503570557,
"eval_runtime": 0.0899,
"eval_samples_per_second": 55.639,
"eval_steps_per_second": 11.128,
"step": 2300
},
{
"epoch": 0.47358762357677053,
"grad_norm": 0.3570936322212219,
"learning_rate": 0.00019122824504420402,
"loss": 0.3796,
"step": 2400
},
{
"epoch": 0.47358762357677053,
"eval_loss": 0.30086052417755127,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.499,
"eval_steps_per_second": 11.1,
"step": 2400
},
{
"epoch": 0.49332044122580265,
"grad_norm": 0.3303937613964081,
"learning_rate": 0.0001897319427326239,
"loss": 0.381,
"step": 2500
},
{
"epoch": 0.49332044122580265,
"eval_loss": 0.3088415861129761,
"eval_runtime": 0.0896,
"eval_samples_per_second": 55.778,
"eval_steps_per_second": 11.156,
"step": 2500
},
{
"epoch": 0.5130532588748348,
"grad_norm": 0.39968788623809814,
"learning_rate": 0.00018812469479955306,
"loss": 0.3758,
"step": 2600
},
{
"epoch": 0.5130532588748348,
"eval_loss": 0.3069296181201935,
"eval_runtime": 0.0906,
"eval_samples_per_second": 55.215,
"eval_steps_per_second": 11.043,
"step": 2600
},
{
"epoch": 0.5327860765238669,
"grad_norm": 0.4014946520328522,
"learning_rate": 0.00018640848846509836,
"loss": 0.3728,
"step": 2700
},
{
"epoch": 0.5327860765238669,
"eval_loss": 0.302837997674942,
"eval_runtime": 0.0907,
"eval_samples_per_second": 55.152,
"eval_steps_per_second": 11.03,
"step": 2700
},
{
"epoch": 0.5525188941728989,
"grad_norm": 0.3048448860645294,
"learning_rate": 0.00018458544566680613,
"loss": 0.374,
"step": 2800
},
{
"epoch": 0.5525188941728989,
"eval_loss": 0.30044040083885193,
"eval_runtime": 0.0918,
"eval_samples_per_second": 54.44,
"eval_steps_per_second": 10.888,
"step": 2800
},
{
"epoch": 0.572251711821931,
"grad_norm": 0.36768218874931335,
"learning_rate": 0.00018265782043607362,
"loss": 0.3694,
"step": 2900
},
{
"epoch": 0.572251711821931,
"eval_loss": 0.2935519516468048,
"eval_runtime": 0.0918,
"eval_samples_per_second": 54.473,
"eval_steps_per_second": 10.895,
"step": 2900
},
{
"epoch": 0.5919845294709631,
"grad_norm": 0.3077068030834198,
"learning_rate": 0.00018062799611123843,
"loss": 0.3649,
"step": 3000
},
{
"epoch": 0.5919845294709631,
"eval_loss": 0.2956543564796448,
"eval_runtime": 0.0897,
"eval_samples_per_second": 55.755,
"eval_steps_per_second": 11.151,
"step": 3000
},
{
"epoch": 0.6117173471199953,
"grad_norm": 0.3608091175556183,
"learning_rate": 0.00017849848239079126,
"loss": 0.3613,
"step": 3100
},
{
"epoch": 0.6117173471199953,
"eval_loss": 0.29713118076324463,
"eval_runtime": 0.0892,
"eval_samples_per_second": 56.045,
"eval_steps_per_second": 11.209,
"step": 3100
},
{
"epoch": 0.6314501647690274,
"grad_norm": 0.35845324397087097,
"learning_rate": 0.00017627191223035512,
"loss": 0.3646,
"step": 3200
},
{
"epoch": 0.6314501647690274,
"eval_loss": 0.28599464893341064,
"eval_runtime": 0.0897,
"eval_samples_per_second": 55.735,
"eval_steps_per_second": 11.147,
"step": 3200
},
{
"epoch": 0.6511829824180595,
"grad_norm": 0.3350367546081543,
"learning_rate": 0.00017395103858726846,
"loss": 0.3619,
"step": 3300
},
{
"epoch": 0.6511829824180595,
"eval_loss": 0.2815980315208435,
"eval_runtime": 0.0892,
"eval_samples_per_second": 56.042,
"eval_steps_per_second": 11.208,
"step": 3300
},
{
"epoch": 0.6709158000670916,
"grad_norm": 0.30465880036354065,
"learning_rate": 0.00017153873101679668,
"loss": 0.3625,
"step": 3400
},
{
"epoch": 0.6709158000670916,
"eval_loss": 0.2854335308074951,
"eval_runtime": 0.09,
"eval_samples_per_second": 55.528,
"eval_steps_per_second": 11.106,
"step": 3400
},
{
"epoch": 0.6906486177161237,
"grad_norm": 0.31288811564445496,
"learning_rate": 0.00016903797212418015,
"loss": 0.3552,
"step": 3500
},
{
"epoch": 0.6906486177161237,
"eval_loss": 0.28378957509994507,
"eval_runtime": 0.0889,
"eval_samples_per_second": 56.248,
"eval_steps_per_second": 11.25,
"step": 3500
},
{
"epoch": 0.7103814353651557,
"grad_norm": 0.49065399169921875,
"learning_rate": 0.0001664518538769067,
"loss": 0.3545,
"step": 3600
},
{
"epoch": 0.7103814353651557,
"eval_loss": 0.28242945671081543,
"eval_runtime": 0.0893,
"eval_samples_per_second": 56.004,
"eval_steps_per_second": 11.201,
"step": 3600
},
{
"epoch": 0.7301142530141879,
"grad_norm": 0.34333252906799316,
"learning_rate": 0.00016378357378176654,
"loss": 0.3531,
"step": 3700
},
{
"epoch": 0.7301142530141879,
"eval_loss": 0.28695201873779297,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.475,
"eval_steps_per_second": 11.095,
"step": 3700
},
{
"epoch": 0.74984707066322,
"grad_norm": 0.42468976974487305,
"learning_rate": 0.0001610364309314178,
"loss": 0.3528,
"step": 3800
},
{
"epoch": 0.74984707066322,
"eval_loss": 0.282599538564682,
"eval_runtime": 0.0888,
"eval_samples_per_second": 56.288,
"eval_steps_per_second": 11.258,
"step": 3800
},
{
"epoch": 0.7695798883122521,
"grad_norm": 0.28823015093803406,
"learning_rate": 0.00015821382192534968,
"loss": 0.3515,
"step": 3900
},
{
"epoch": 0.7695798883122521,
"eval_loss": 0.28318002820014954,
"eval_runtime": 0.089,
"eval_samples_per_second": 56.211,
"eval_steps_per_second": 11.242,
"step": 3900
},
{
"epoch": 0.7893127059612842,
"grad_norm": 0.3027402460575104,
"learning_rate": 0.0001553192366702874,
"loss": 0.3515,
"step": 4000
},
{
"epoch": 0.7893127059612842,
"eval_loss": 0.2733287513256073,
"eval_runtime": 0.0893,
"eval_samples_per_second": 55.972,
"eval_steps_per_second": 11.194,
"step": 4000
},
{
"epoch": 0.8090455236103163,
"grad_norm": 0.29556697607040405,
"learning_rate": 0.00015235625406523058,
"loss": 0.3485,
"step": 4100
},
{
"epoch": 0.8090455236103163,
"eval_loss": 0.2786538004875183,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.253,
"eval_steps_per_second": 11.051,
"step": 4100
},
{
"epoch": 0.8287783412593485,
"grad_norm": 0.36687153577804565,
"learning_rate": 0.0001493285375764608,
"loss": 0.3486,
"step": 4200
},
{
"epoch": 0.8287783412593485,
"eval_loss": 0.281529039144516,
"eval_runtime": 0.089,
"eval_samples_per_second": 56.211,
"eval_steps_per_second": 11.242,
"step": 4200
},
{
"epoch": 0.8485111589083806,
"grad_norm": 0.3069268763065338,
"learning_rate": 0.00014623983070798918,
"loss": 0.3466,
"step": 4300
},
{
"epoch": 0.8485111589083806,
"eval_loss": 0.2738308012485504,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.242,
"eval_steps_per_second": 11.048,
"step": 4300
},
{
"epoch": 0.8682439765574126,
"grad_norm": 0.362088680267334,
"learning_rate": 0.00014309395237304426,
"loss": 0.3441,
"step": 4400
},
{
"epoch": 0.8682439765574126,
"eval_loss": 0.2713426649570465,
"eval_runtime": 0.09,
"eval_samples_per_second": 55.556,
"eval_steps_per_second": 11.111,
"step": 4400
},
{
"epoch": 0.8879767942064447,
"grad_norm": 0.35860228538513184,
"learning_rate": 0.00013989479217232315,
"loss": 0.3459,
"step": 4500
},
{
"epoch": 0.8879767942064447,
"eval_loss": 0.27472037076950073,
"eval_runtime": 0.0888,
"eval_samples_per_second": 56.322,
"eval_steps_per_second": 11.264,
"step": 4500
},
{
"epoch": 0.9077096118554768,
"grad_norm": 0.39065611362457275,
"learning_rate": 0.00013664630558484379,
"loss": 0.3365,
"step": 4600
},
{
"epoch": 0.9077096118554768,
"eval_loss": 0.2625993490219116,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.261,
"eval_steps_per_second": 11.052,
"step": 4600
},
{
"epoch": 0.927442429504509,
"grad_norm": 0.3311096131801605,
"learning_rate": 0.00013335250907734448,
"loss": 0.3433,
"step": 4700
},
{
"epoch": 0.927442429504509,
"eval_loss": 0.26789966225624084,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.469,
"eval_steps_per_second": 11.094,
"step": 4700
},
{
"epoch": 0.9471752471535411,
"grad_norm": 0.29878920316696167,
"learning_rate": 0.00013001747513827764,
"loss": 0.3421,
"step": 4800
},
{
"epoch": 0.9471752471535411,
"eval_loss": 0.26495981216430664,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.48,
"eval_steps_per_second": 11.096,
"step": 4800
},
{
"epoch": 0.9669080648025732,
"grad_norm": 0.27687743306159973,
"learning_rate": 0.00012664532724253745,
"loss": 0.3412,
"step": 4900
},
{
"epoch": 0.9669080648025732,
"eval_loss": 0.26512324810028076,
"eval_runtime": 0.0891,
"eval_samples_per_second": 56.121,
"eval_steps_per_second": 11.224,
"step": 4900
},
{
"epoch": 0.9866408824516053,
"grad_norm": 0.2745589017868042,
"learning_rate": 0.00012324023475314725,
"loss": 0.3389,
"step": 5000
},
{
"epoch": 0.9866408824516053,
"eval_loss": 0.2671201825141907,
"eval_runtime": 0.0893,
"eval_samples_per_second": 55.975,
"eval_steps_per_second": 11.195,
"step": 5000
},
{
"epoch": 1.0064328985535844,
"grad_norm": 0.3213329017162323,
"learning_rate": 0.00011980640776621077,
"loss": 0.3342,
"step": 5100
},
{
"epoch": 1.0064328985535844,
"eval_loss": 0.26046106219291687,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.499,
"eval_steps_per_second": 11.1,
"step": 5100
},
{
"epoch": 1.0261657162026165,
"grad_norm": 0.27474504709243774,
"learning_rate": 0.0001163480919054998,
"loss": 0.3256,
"step": 5200
},
{
"epoch": 1.0261657162026165,
"eval_loss": 0.25896698236465454,
"eval_runtime": 0.0888,
"eval_samples_per_second": 56.29,
"eval_steps_per_second": 11.258,
"step": 5200
},
{
"epoch": 1.0458985338516487,
"grad_norm": 0.30879899859428406,
"learning_rate": 0.00011286956307311555,
"loss": 0.3221,
"step": 5300
},
{
"epoch": 1.0458985338516487,
"eval_loss": 0.26479166746139526,
"eval_runtime": 0.0899,
"eval_samples_per_second": 55.608,
"eval_steps_per_second": 11.122,
"step": 5300
},
{
"epoch": 1.0656313515006808,
"grad_norm": 0.31644341349601746,
"learning_rate": 0.00010937512216271338,
"loss": 0.3213,
"step": 5400
},
{
"epoch": 1.0656313515006808,
"eval_loss": 0.2630908489227295,
"eval_runtime": 0.0912,
"eval_samples_per_second": 54.85,
"eval_steps_per_second": 10.97,
"step": 5400
},
{
"epoch": 1.085364169149713,
"grad_norm": 0.27920085191726685,
"learning_rate": 0.00010586908974182767,
"loss": 0.3236,
"step": 5500
},
{
"epoch": 1.085364169149713,
"eval_loss": 0.25873327255249023,
"eval_runtime": 0.0918,
"eval_samples_per_second": 54.458,
"eval_steps_per_second": 10.892,
"step": 5500
},
{
"epoch": 1.105096986798745,
"grad_norm": 0.41224974393844604,
"learning_rate": 0.0001023558007098717,
"loss": 0.3201,
"step": 5600
},
{
"epoch": 1.105096986798745,
"eval_loss": 0.2597818672657013,
"eval_runtime": 0.0924,
"eval_samples_per_second": 54.094,
"eval_steps_per_second": 10.819,
"step": 5600
},
{
"epoch": 1.1248298044477771,
"grad_norm": 0.28382521867752075,
"learning_rate": 9.88395989384173e-05,
"loss": 0.3227,
"step": 5700
},
{
"epoch": 1.1248298044477771,
"eval_loss": 0.2594955563545227,
"eval_runtime": 0.0907,
"eval_samples_per_second": 55.14,
"eval_steps_per_second": 11.028,
"step": 5700
},
{
"epoch": 1.1445626220968093,
"grad_norm": 0.258071631193161,
"learning_rate": 9.532483190038153e-05,
"loss": 0.3188,
"step": 5800
},
{
"epoch": 1.1445626220968093,
"eval_loss": 0.2607273459434509,
"eval_runtime": 0.0902,
"eval_samples_per_second": 55.429,
"eval_steps_per_second": 11.086,
"step": 5800
},
{
"epoch": 1.1642954397458414,
"grad_norm": 0.4713532030582428,
"learning_rate": 9.181584529476025e-05,
"loss": 0.3199,
"step": 5900
},
{
"epoch": 1.1642954397458414,
"eval_loss": 0.2583589255809784,
"eval_runtime": 0.0897,
"eval_samples_per_second": 55.713,
"eval_steps_per_second": 11.143,
"step": 5900
},
{
"epoch": 1.1840282573948735,
"grad_norm": 0.2917826771736145,
"learning_rate": 8.831697767355519e-05,
"loss": 0.3185,
"step": 6000
},
{
"epoch": 1.1840282573948735,
"eval_loss": 0.256054550409317,
"eval_runtime": 0.091,
"eval_samples_per_second": 54.935,
"eval_steps_per_second": 10.987,
"step": 6000
},
{
"epoch": 1.2037610750439056,
"grad_norm": 0.27514272928237915,
"learning_rate": 8.483255507753762e-05,
"loss": 0.3167,
"step": 6100
},
{
"epoch": 1.2037610750439056,
"eval_loss": 0.2567542791366577,
"eval_runtime": 0.0902,
"eval_samples_per_second": 55.458,
"eval_steps_per_second": 11.092,
"step": 6100
},
{
"epoch": 1.2234938926929377,
"grad_norm": 0.28672000765800476,
"learning_rate": 8.136688568748113e-05,
"loss": 0.3131,
"step": 6200
},
{
"epoch": 1.2234938926929377,
"eval_loss": 0.2553618848323822,
"eval_runtime": 0.0906,
"eval_samples_per_second": 55.177,
"eval_steps_per_second": 11.035,
"step": 6200
},
{
"epoch": 1.2432267103419696,
"grad_norm": 0.30310142040252686,
"learning_rate": 7.792425449747635e-05,
"loss": 0.3189,
"step": 6300
},
{
"epoch": 1.2432267103419696,
"eval_loss": 0.2524791657924652,
"eval_runtime": 0.0904,
"eval_samples_per_second": 55.281,
"eval_steps_per_second": 11.056,
"step": 6300
},
{
"epoch": 1.262959527991002,
"grad_norm": 0.2990928888320923,
"learning_rate": 7.450891801691468e-05,
"loss": 0.3163,
"step": 6400
},
{
"epoch": 1.262959527991002,
"eval_loss": 0.25296735763549805,
"eval_runtime": 0.0903,
"eval_samples_per_second": 55.377,
"eval_steps_per_second": 11.075,
"step": 6400
},
{
"epoch": 1.2826923456400339,
"grad_norm": 0.28423842787742615,
"learning_rate": 7.112509900768989e-05,
"loss": 0.3171,
"step": 6500
},
{
"epoch": 1.2826923456400339,
"eval_loss": 0.25522494316101074,
"eval_runtime": 0.09,
"eval_samples_per_second": 55.554,
"eval_steps_per_second": 11.111,
"step": 6500
},
{
"epoch": 1.302425163289066,
"grad_norm": 0.2940766513347626,
"learning_rate": 6.777698126312647e-05,
"loss": 0.3122,
"step": 6600
},
{
"epoch": 1.302425163289066,
"eval_loss": 0.2504900395870209,
"eval_runtime": 0.0898,
"eval_samples_per_second": 55.653,
"eval_steps_per_second": 11.131,
"step": 6600
},
{
"epoch": 1.322157980938098,
"grad_norm": 0.29685819149017334,
"learning_rate": 6.446870443508839e-05,
"loss": 0.3094,
"step": 6700
},
{
"epoch": 1.322157980938098,
"eval_loss": 0.2528066039085388,
"eval_runtime": 0.0914,
"eval_samples_per_second": 54.709,
"eval_steps_per_second": 10.942,
"step": 6700
},
{
"epoch": 1.3418907985871302,
"grad_norm": 0.28402870893478394,
"learning_rate": 6.120435891566542e-05,
"loss": 0.3143,
"step": 6800
},
{
"epoch": 1.3418907985871302,
"eval_loss": 0.2532269358634949,
"eval_runtime": 0.0909,
"eval_samples_per_second": 55.009,
"eval_steps_per_second": 11.002,
"step": 6800
},
{
"epoch": 1.3616236162361623,
"grad_norm": 0.29260462522506714,
"learning_rate": 5.7987980779764463e-05,
"loss": 0.3106,
"step": 6900
},
{
"epoch": 1.3616236162361623,
"eval_loss": 0.24992766976356506,
"eval_runtime": 0.0908,
"eval_samples_per_second": 55.057,
"eval_steps_per_second": 11.011,
"step": 6900
},
{
"epoch": 1.3813564338851945,
"grad_norm": 0.27314430475234985,
"learning_rate": 5.482354679485948e-05,
"loss": 0.3162,
"step": 7000
},
{
"epoch": 1.3813564338851945,
"eval_loss": 0.24991869926452637,
"eval_runtime": 0.091,
"eval_samples_per_second": 54.919,
"eval_steps_per_second": 10.984,
"step": 7000
},
{
"epoch": 1.4010892515342266,
"grad_norm": 0.28783899545669556,
"learning_rate": 5.17149695040698e-05,
"loss": 0.3093,
"step": 7100
},
{
"epoch": 1.4010892515342266,
"eval_loss": 0.2506985068321228,
"eval_runtime": 0.0908,
"eval_samples_per_second": 55.089,
"eval_steps_per_second": 11.018,
"step": 7100
},
{
"epoch": 1.4208220691832587,
"grad_norm": 0.3105429708957672,
"learning_rate": 4.866609238864609e-05,
"loss": 0.3105,
"step": 7200
},
{
"epoch": 1.4208220691832587,
"eval_loss": 0.2487274706363678,
"eval_runtime": 0.0911,
"eval_samples_per_second": 54.856,
"eval_steps_per_second": 10.971,
"step": 7200
},
{
"epoch": 1.4405548868322908,
"grad_norm": 0.2590714693069458,
"learning_rate": 4.568068511584529e-05,
"loss": 0.3092,
"step": 7300
},
{
"epoch": 1.4405548868322908,
"eval_loss": 0.2481408566236496,
"eval_runtime": 0.0906,
"eval_samples_per_second": 55.214,
"eval_steps_per_second": 11.043,
"step": 7300
},
{
"epoch": 1.460287704481323,
"grad_norm": 0.27170825004577637,
"learning_rate": 4.2762438878069955e-05,
"loss": 0.3113,
"step": 7400
},
{
"epoch": 1.460287704481323,
"eval_loss": 0.24591541290283203,
"eval_runtime": 0.0904,
"eval_samples_per_second": 55.31,
"eval_steps_per_second": 11.062,
"step": 7400
},
{
"epoch": 1.480020522130355,
"grad_norm": 0.2719477117061615,
"learning_rate": 3.991496182903498e-05,
"loss": 0.3077,
"step": 7500
},
{
"epoch": 1.480020522130355,
"eval_loss": 0.24259333312511444,
"eval_runtime": 0.091,
"eval_samples_per_second": 54.966,
"eval_steps_per_second": 10.993,
"step": 7500
},
{
"epoch": 1.499753339779387,
"grad_norm": 0.28030896186828613,
"learning_rate": 3.714177462260412e-05,
"loss": 0.3073,
"step": 7600
},
{
"epoch": 1.499753339779387,
"eval_loss": 0.2429008185863495,
"eval_runtime": 0.093,
"eval_samples_per_second": 53.774,
"eval_steps_per_second": 10.755,
"step": 7600
},
{
"epoch": 1.5194861574284193,
"grad_norm": 0.2803615629673004,
"learning_rate": 3.444630605981256e-05,
"loss": 0.3042,
"step": 7700
},
{
"epoch": 1.5194861574284193,
"eval_loss": 0.2448611706495285,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.251,
"eval_steps_per_second": 11.05,
"step": 7700
},
{
"epoch": 1.5392189750774512,
"grad_norm": 0.326019823551178,
"learning_rate": 3.183188884945714e-05,
"loss": 0.3075,
"step": 7800
},
{
"epoch": 1.5392189750774512,
"eval_loss": 0.24277858436107635,
"eval_runtime": 0.0897,
"eval_samples_per_second": 55.757,
"eval_steps_per_second": 11.151,
"step": 7800
},
{
"epoch": 1.5589517927264835,
"grad_norm": 0.3696662187576294,
"learning_rate": 2.930175548749645e-05,
"loss": 0.3047,
"step": 7900
},
{
"epoch": 1.5589517927264835,
"eval_loss": 0.2429245412349701,
"eval_runtime": 0.0909,
"eval_samples_per_second": 55.022,
"eval_steps_per_second": 11.004,
"step": 7900
},
{
"epoch": 1.5786846103755154,
"grad_norm": 0.2868829667568207,
"learning_rate": 2.6859034260355042e-05,
"loss": 0.307,
"step": 8000
},
{
"epoch": 1.5786846103755154,
"eval_loss": 0.2427954375743866,
"eval_runtime": 0.0901,
"eval_samples_per_second": 55.51,
"eval_steps_per_second": 11.102,
"step": 8000
},
{
"epoch": 1.5984174280245478,
"grad_norm": 0.28634509444236755,
"learning_rate": 2.4506745377073535e-05,
"loss": 0.303,
"step": 8100
},
{
"epoch": 1.5984174280245478,
"eval_loss": 0.24240228533744812,
"eval_runtime": 0.0911,
"eval_samples_per_second": 54.88,
"eval_steps_per_second": 10.976,
"step": 8100
},
{
"epoch": 1.6181502456735797,
"grad_norm": 0.2582091987133026,
"learning_rate": 2.224779723508692e-05,
"loss": 0.3063,
"step": 8200
},
{
"epoch": 1.6181502456735797,
"eval_loss": 0.24210545420646667,
"eval_runtime": 0.0895,
"eval_samples_per_second": 55.845,
"eval_steps_per_second": 11.169,
"step": 8200
},
{
"epoch": 1.6378830633226118,
"grad_norm": 0.29064637422561646,
"learning_rate": 2.0084982824248034e-05,
"loss": 0.3041,
"step": 8300
},
{
"epoch": 1.6378830633226118,
"eval_loss": 0.24005956947803497,
"eval_runtime": 0.0916,
"eval_samples_per_second": 54.615,
"eval_steps_per_second": 10.923,
"step": 8300
},
{
"epoch": 1.657615880971644,
"grad_norm": 0.29442399740219116,
"learning_rate": 1.802097627354231e-05,
"loss": 0.3046,
"step": 8400
},
{
"epoch": 1.657615880971644,
"eval_loss": 0.238916277885437,
"eval_runtime": 0.0907,
"eval_samples_per_second": 55.107,
"eval_steps_per_second": 11.021,
"step": 8400
},
{
"epoch": 1.677348698620676,
"grad_norm": 0.2660856246948242,
"learning_rate": 1.605832954476346e-05,
"loss": 0.3054,
"step": 8500
},
{
"epoch": 1.677348698620676,
"eval_loss": 0.2391015589237213,
"eval_runtime": 0.0907,
"eval_samples_per_second": 55.141,
"eval_steps_per_second": 11.028,
"step": 8500
},
{
"epoch": 1.6970815162697082,
"grad_norm": 0.29835525155067444,
"learning_rate": 1.4199469277238143e-05,
"loss": 0.3043,
"step": 8600
},
{
"epoch": 1.6970815162697082,
"eval_loss": 0.23956787586212158,
"eval_runtime": 0.0899,
"eval_samples_per_second": 55.608,
"eval_steps_per_second": 11.122,
"step": 8600
},
{
"epoch": 1.7168143339187403,
"grad_norm": 0.33219653367996216,
"learning_rate": 1.2446693787500697e-05,
"loss": 0.3033,
"step": 8700
},
{
"epoch": 1.7168143339187403,
"eval_loss": 0.23964008688926697,
"eval_runtime": 0.0905,
"eval_samples_per_second": 55.26,
"eval_steps_per_second": 11.052,
"step": 8700
},
{
"epoch": 1.7365471515677724,
"grad_norm": 0.26195675134658813,
"learning_rate": 1.0802170227627873e-05,
"loss": 0.3062,
"step": 8800
},
{
"epoch": 1.7365471515677724,
"eval_loss": 0.2384120523929596,
"eval_runtime": 0.0895,
"eval_samples_per_second": 55.837,
"eval_steps_per_second": 11.167,
"step": 8800
},
{
"epoch": 1.7562799692168045,
"grad_norm": 0.2874230146408081,
"learning_rate": 9.26793190574664e-06,
"loss": 0.3079,
"step": 8900
},
{
"epoch": 1.7562799692168045,
"eval_loss": 0.23803596198558807,
"eval_runtime": 0.0892,
"eval_samples_per_second": 56.06,
"eval_steps_per_second": 11.212,
"step": 8900
},
{
"epoch": 1.7760127868658366,
"grad_norm": 0.31188592314720154,
"learning_rate": 7.845875772028289e-06,
"loss": 0.3026,
"step": 9000
},
{
"epoch": 1.7760127868658366,
"eval_loss": 0.23819437623023987,
"eval_runtime": 0.0916,
"eval_samples_per_second": 54.586,
"eval_steps_per_second": 10.917,
"step": 9000
},
{
"epoch": 1.7957456045148685,
"grad_norm": 0.25863921642303467,
"learning_rate": 6.537760073277066e-06,
"loss": 0.3028,
"step": 9100
},
{
"epoch": 1.7957456045148685,
"eval_loss": 0.23811522126197815,
"eval_runtime": 0.0907,
"eval_samples_per_second": 55.122,
"eval_steps_per_second": 11.024,
"step": 9100
},
{
"epoch": 1.8154784221639009,
"grad_norm": 0.2967861592769623,
"learning_rate": 5.345202179013353e-06,
"loss": 0.3036,
"step": 9200
},
{
"epoch": 1.8154784221639009,
"eval_loss": 0.23839135468006134,
"eval_runtime": 0.0891,
"eval_samples_per_second": 56.087,
"eval_steps_per_second": 11.217,
"step": 9200
},
{
"epoch": 1.8352112398129328,
"grad_norm": 0.26816633343696594,
"learning_rate": 4.269676581739079e-06,
"loss": 0.3033,
"step": 9300
},
{
"epoch": 1.8352112398129328,
"eval_loss": 0.23855826258659363,
"eval_runtime": 0.0915,
"eval_samples_per_second": 54.629,
"eval_steps_per_second": 10.926,
"step": 9300
},
{
"epoch": 1.8549440574619651,
"grad_norm": 0.27961990237236023,
"learning_rate": 3.3125130738579922e-06,
"loss": 0.3059,
"step": 9400
},
{
"epoch": 1.8549440574619651,
"eval_loss": 0.2385035753250122,
"eval_runtime": 0.0918,
"eval_samples_per_second": 54.481,
"eval_steps_per_second": 10.896,
"step": 9400
},
{
"epoch": 1.874676875110997,
"grad_norm": 0.28482890129089355,
"learning_rate": 2.4748951035047596e-06,
"loss": 0.3047,
"step": 9500
},
{
"epoch": 1.874676875110997,
"eval_loss": 0.23791757225990295,
"eval_runtime": 0.0933,
"eval_samples_per_second": 53.597,
"eval_steps_per_second": 10.719,
"step": 9500
},
{
"epoch": 1.8944096927600294,
"grad_norm": 0.299679160118103,
"learning_rate": 1.7578583113159962e-06,
"loss": 0.3036,
"step": 9600
},
{
"epoch": 1.8944096927600294,
"eval_loss": 0.2383710891008377,
"eval_runtime": 0.0895,
"eval_samples_per_second": 55.854,
"eval_steps_per_second": 11.171,
"step": 9600
},
{
"epoch": 1.9141425104090612,
"grad_norm": 0.27456724643707275,
"learning_rate": 1.1622892499519421e-06,
"loss": 0.3032,
"step": 9700
},
{
"epoch": 1.9141425104090612,
"eval_loss": 0.2380281388759613,
"eval_runtime": 0.0895,
"eval_samples_per_second": 55.876,
"eval_steps_per_second": 11.175,
"step": 9700
},
{
"epoch": 1.9338753280580934,
"grad_norm": 0.27245351672172546,
"learning_rate": 6.889242879525415e-07,
"loss": 0.2994,
"step": 9800
},
{
"epoch": 1.9338753280580934,
"eval_loss": 0.23812708258628845,
"eval_runtime": 0.09,
"eval_samples_per_second": 55.528,
"eval_steps_per_second": 11.106,
"step": 9800
}
],
"logging_steps": 100,
"max_steps": 10134,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.236547392844431e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}