|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.97891231964484, |
|
"eval_steps": 100, |
|
"global_step": 4500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011098779134295227, |
|
"grad_norm": 0.32656678886781504, |
|
"learning_rate": 2.2222222222222224e-07, |
|
"loss": 1.1062, |
|
"mean_token_accuracy": 0.7074863796661243, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.022197558268590455, |
|
"grad_norm": 0.29460412354793364, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 1.0905, |
|
"mean_token_accuracy": 0.7132027888622114, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.033296337402885685, |
|
"grad_norm": 0.2926845798768026, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.1025, |
|
"mean_token_accuracy": 0.7111899019997314, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04439511653718091, |
|
"grad_norm": 0.2947376842618273, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 1.0792, |
|
"mean_token_accuracy": 0.7154833115402078, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05549389567147614, |
|
"grad_norm": 0.26594989197086916, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 1.1065, |
|
"mean_token_accuracy": 0.7056332765917197, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06659267480577137, |
|
"grad_norm": 0.19302469306737396, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.0611, |
|
"mean_token_accuracy": 0.7162843674231184, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07769145394006659, |
|
"grad_norm": 0.18651122869654144, |
|
"learning_rate": 1.5555555555555558e-06, |
|
"loss": 1.0486, |
|
"mean_token_accuracy": 0.7172663487082396, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08879023307436182, |
|
"grad_norm": 0.14595121865836222, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 1.0363, |
|
"mean_token_accuracy": 0.7196147071311284, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09988901220865705, |
|
"grad_norm": 0.11721804821745417, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.9939, |
|
"mean_token_accuracy": 0.7276444950188313, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11098779134295228, |
|
"grad_norm": 0.11362901801908676, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.9603, |
|
"mean_token_accuracy": 0.7342363903629822, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1220865704772475, |
|
"grad_norm": 0.09796836866809913, |
|
"learning_rate": 2.4444444444444447e-06, |
|
"loss": 0.9519, |
|
"mean_token_accuracy": 0.7358896327215562, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13318534961154274, |
|
"grad_norm": 0.0957862317703024, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.9453, |
|
"mean_token_accuracy": 0.7352604908987554, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14428412874583796, |
|
"grad_norm": 0.08550990149349075, |
|
"learning_rate": 2.888888888888889e-06, |
|
"loss": 0.9156, |
|
"mean_token_accuracy": 0.7417218485729853, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.15538290788013318, |
|
"grad_norm": 0.08055037389952527, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 0.9174, |
|
"mean_token_accuracy": 0.74028441707153, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.16648168701442842, |
|
"grad_norm": 0.07404585575455314, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.8997, |
|
"mean_token_accuracy": 0.7443911962497833, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.17758046614872364, |
|
"grad_norm": 0.0755168094448169, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 0.8946, |
|
"mean_token_accuracy": 0.7450205354457197, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 0.06918137028638209, |
|
"learning_rate": 3.777777777777778e-06, |
|
"loss": 0.8819, |
|
"mean_token_accuracy": 0.7460465509561359, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.1997780244173141, |
|
"grad_norm": 0.06649356448882256, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.865, |
|
"mean_token_accuracy": 0.7505584314816246, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21087680355160932, |
|
"grad_norm": 0.07241837843867385, |
|
"learning_rate": 4.222222222222223e-06, |
|
"loss": 0.8683, |
|
"mean_token_accuracy": 0.7501267207853098, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"grad_norm": 0.0688273015748006, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.8554, |
|
"mean_token_accuracy": 0.7520447614565848, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"eval_loss": 0.8836105465888977, |
|
"eval_mean_token_accuracy": 0.7428216802026476, |
|
"eval_runtime": 2.9492, |
|
"eval_samples_per_second": 43.741, |
|
"eval_steps_per_second": 3.73, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23307436182019978, |
|
"grad_norm": 0.06672497320617116, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.8547, |
|
"mean_token_accuracy": 0.7530808688116643, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.244173140954495, |
|
"grad_norm": 0.06588821630374943, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 0.8493, |
|
"mean_token_accuracy": 0.7530783561789567, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.25527192008879024, |
|
"grad_norm": 0.06573805910703882, |
|
"learning_rate": 5.1111111111111115e-06, |
|
"loss": 0.8481, |
|
"mean_token_accuracy": 0.7527923082243844, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2663706992230855, |
|
"grad_norm": 0.06765053288672747, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.8345, |
|
"mean_token_accuracy": 0.7570082248902316, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.27746947835738067, |
|
"grad_norm": 0.06423926782724641, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.8334, |
|
"mean_token_accuracy": 0.7570059881760228, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.2885682574916759, |
|
"grad_norm": 0.06544256761084138, |
|
"learning_rate": 5.777777777777778e-06, |
|
"loss": 0.8306, |
|
"mean_token_accuracy": 0.7560762771467135, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.29966703662597116, |
|
"grad_norm": 0.06689028220574877, |
|
"learning_rate": 6e-06, |
|
"loss": 0.8431, |
|
"mean_token_accuracy": 0.7535974296049089, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.31076581576026635, |
|
"grad_norm": 0.06896235600250016, |
|
"learning_rate": 6.222222222222223e-06, |
|
"loss": 0.8164, |
|
"mean_token_accuracy": 0.7605102065970375, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3218645948945616, |
|
"grad_norm": 0.06652473163537684, |
|
"learning_rate": 6.444444444444445e-06, |
|
"loss": 0.8354, |
|
"mean_token_accuracy": 0.7546830001786541, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.33296337402885684, |
|
"grad_norm": 0.0656998409953361, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8204, |
|
"mean_token_accuracy": 0.7590995752154607, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.34406215316315203, |
|
"grad_norm": 0.06721736905388327, |
|
"learning_rate": 6.88888888888889e-06, |
|
"loss": 0.8188, |
|
"mean_token_accuracy": 0.7586616076789785, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3551609322974473, |
|
"grad_norm": 0.06906077440788903, |
|
"learning_rate": 7.111111111111112e-06, |
|
"loss": 0.8418, |
|
"mean_token_accuracy": 0.7528522141841469, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3662597114317425, |
|
"grad_norm": 0.07152076555649528, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.8266, |
|
"mean_token_accuracy": 0.7559661015692364, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 0.07237554138006554, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 0.8124, |
|
"mean_token_accuracy": 0.7609189512133896, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.38845726970033295, |
|
"grad_norm": 0.0824656116508997, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.8115, |
|
"mean_token_accuracy": 0.759490100022586, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3995560488346282, |
|
"grad_norm": 0.06954623757963556, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.8207, |
|
"mean_token_accuracy": 0.757588037268629, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.41065482796892344, |
|
"grad_norm": 0.06873958713395736, |
|
"learning_rate": 8.222222222222222e-06, |
|
"loss": 0.7891, |
|
"mean_token_accuracy": 0.766508490603462, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.42175360710321863, |
|
"grad_norm": 0.06515019023084734, |
|
"learning_rate": 8.444444444444446e-06, |
|
"loss": 0.7942, |
|
"mean_token_accuracy": 0.7636216832625793, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4328523862375139, |
|
"grad_norm": 0.07127344424022096, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.8117, |
|
"mean_token_accuracy": 0.7599510101102652, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"grad_norm": 0.06750508464116467, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.794, |
|
"mean_token_accuracy": 0.7647805117972197, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"eval_loss": 0.8218581080436707, |
|
"eval_mean_token_accuracy": 0.7549547265212898, |
|
"eval_runtime": 2.508, |
|
"eval_samples_per_second": 51.434, |
|
"eval_steps_per_second": 4.386, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4550499445061043, |
|
"grad_norm": 0.07169806108493063, |
|
"learning_rate": 9.111111111111112e-06, |
|
"loss": 0.7796, |
|
"mean_token_accuracy": 0.7678073917643145, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.46614872364039955, |
|
"grad_norm": 0.06834755097234334, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.8232, |
|
"mean_token_accuracy": 0.7559485442022151, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.4772475027746948, |
|
"grad_norm": 0.07175829230991215, |
|
"learning_rate": 9.555555555555556e-06, |
|
"loss": 0.7999, |
|
"mean_token_accuracy": 0.7625316682567083, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.48834628190899, |
|
"grad_norm": 0.07570550308844941, |
|
"learning_rate": 9.777777777777779e-06, |
|
"loss": 0.7903, |
|
"mean_token_accuracy": 0.7654010521520587, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.49944506104328523, |
|
"grad_norm": 0.07216277014829452, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8053, |
|
"mean_token_accuracy": 0.7603297556858186, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5105438401775805, |
|
"grad_norm": 0.06896541991818567, |
|
"learning_rate": 1.0222222222222223e-05, |
|
"loss": 0.8027, |
|
"mean_token_accuracy": 0.7611568746863343, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5216426193118757, |
|
"grad_norm": 0.07269082277182284, |
|
"learning_rate": 1.0444444444444445e-05, |
|
"loss": 0.7901, |
|
"mean_token_accuracy": 0.7643428675558843, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.532741398446171, |
|
"grad_norm": 0.07000824171875175, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.7816, |
|
"mean_token_accuracy": 0.7676277365390342, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5438401775804661, |
|
"grad_norm": 0.06721455285792531, |
|
"learning_rate": 1.088888888888889e-05, |
|
"loss": 0.7829, |
|
"mean_token_accuracy": 0.7671494048062515, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.5549389567147613, |
|
"grad_norm": 0.06540009584572047, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 0.7934, |
|
"mean_token_accuracy": 0.7629588729696943, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 0.07281032144781761, |
|
"learning_rate": 1.1333333333333334e-05, |
|
"loss": 0.7757, |
|
"mean_token_accuracy": 0.7677620348353081, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5771365149833518, |
|
"grad_norm": 0.06942748506092133, |
|
"learning_rate": 1.1555555555555556e-05, |
|
"loss": 0.7564, |
|
"mean_token_accuracy": 0.7731927834900716, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.0662825869216575, |
|
"learning_rate": 1.177777777777778e-05, |
|
"loss": 0.7783, |
|
"mean_token_accuracy": 0.7668050606016081, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.5993340732519423, |
|
"grad_norm": 0.06693471426836645, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7731, |
|
"mean_token_accuracy": 0.768561360245379, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6104328523862376, |
|
"grad_norm": 0.07214761991106813, |
|
"learning_rate": 1.2222222222222224e-05, |
|
"loss": 0.775, |
|
"mean_token_accuracy": 0.7678224396480112, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6215316315205327, |
|
"grad_norm": 0.07941094979893487, |
|
"learning_rate": 1.2444444444444446e-05, |
|
"loss": 0.7696, |
|
"mean_token_accuracy": 0.7695845557519624, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.632630410654828, |
|
"grad_norm": 0.07862635171047101, |
|
"learning_rate": 1.2666666666666667e-05, |
|
"loss": 0.8048, |
|
"mean_token_accuracy": 0.7594086424792449, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6437291897891232, |
|
"grad_norm": 0.06739330160330582, |
|
"learning_rate": 1.288888888888889e-05, |
|
"loss": 0.7799, |
|
"mean_token_accuracy": 0.7669548215148532, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6548279689234184, |
|
"grad_norm": 0.07180391852563321, |
|
"learning_rate": 1.3111111111111113e-05, |
|
"loss": 0.7864, |
|
"mean_token_accuracy": 0.7643820889959978, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.07358894359497874, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.778, |
|
"mean_token_accuracy": 0.7664805782566748, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"eval_loss": 0.7948585748672485, |
|
"eval_mean_token_accuracy": 0.7601977645003797, |
|
"eval_runtime": 2.5061, |
|
"eval_samples_per_second": 51.474, |
|
"eval_steps_per_second": 4.389, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6770255271920089, |
|
"grad_norm": 0.07176558672546726, |
|
"learning_rate": 1.3555555555555557e-05, |
|
"loss": 0.762, |
|
"mean_token_accuracy": 0.7715469734324651, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.6881243063263041, |
|
"grad_norm": 0.07661120954462695, |
|
"learning_rate": 1.377777777777778e-05, |
|
"loss": 0.774, |
|
"mean_token_accuracy": 0.7688521861239055, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6992230854605993, |
|
"grad_norm": 0.08099285112089856, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.7621, |
|
"mean_token_accuracy": 0.7709743741953946, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.07635780227021953, |
|
"learning_rate": 1.4222222222222224e-05, |
|
"loss": 0.741, |
|
"mean_token_accuracy": 0.7771059346697895, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7214206437291898, |
|
"grad_norm": 0.07916685859429015, |
|
"learning_rate": 1.4444444444444446e-05, |
|
"loss": 0.7655, |
|
"mean_token_accuracy": 0.7704483939834843, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.732519422863485, |
|
"grad_norm": 0.0679326809173189, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 0.7657, |
|
"mean_token_accuracy": 0.7706814426497193, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7436182019977803, |
|
"grad_norm": 0.0779819303278401, |
|
"learning_rate": 1.488888888888889e-05, |
|
"loss": 0.7744, |
|
"mean_token_accuracy": 0.766929925481714, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.0912911418984695, |
|
"learning_rate": 1.5111111111111112e-05, |
|
"loss": 0.7617, |
|
"mean_token_accuracy": 0.7714522958687495, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7658157602663707, |
|
"grad_norm": 0.07861319372730714, |
|
"learning_rate": 1.5333333333333334e-05, |
|
"loss": 0.7761, |
|
"mean_token_accuracy": 0.7662400340052155, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.7769145394006659, |
|
"grad_norm": 0.07302424137610575, |
|
"learning_rate": 1.555555555555556e-05, |
|
"loss": 0.7657, |
|
"mean_token_accuracy": 0.7694574502400165, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7880133185349611, |
|
"grad_norm": 0.07529267936879339, |
|
"learning_rate": 1.577777777777778e-05, |
|
"loss": 0.7726, |
|
"mean_token_accuracy": 0.7683270370750246, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7991120976692564, |
|
"grad_norm": 0.07476756089459828, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.7621, |
|
"mean_token_accuracy": 0.7702739762275538, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8102108768035516, |
|
"grad_norm": 0.06840012057538632, |
|
"learning_rate": 1.6222222222222223e-05, |
|
"loss": 0.7537, |
|
"mean_token_accuracy": 0.772770864897998, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8213096559378469, |
|
"grad_norm": 0.08097322392843904, |
|
"learning_rate": 1.6444444444444444e-05, |
|
"loss": 0.7747, |
|
"mean_token_accuracy": 0.7670298646020149, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.832408435072142, |
|
"grad_norm": 0.06862947612315898, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.7571, |
|
"mean_token_accuracy": 0.7722756690046501, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.8435072142064373, |
|
"grad_norm": 0.06985899518061597, |
|
"learning_rate": 1.688888888888889e-05, |
|
"loss": 0.7592, |
|
"mean_token_accuracy": 0.7716363522611088, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8546059933407325, |
|
"grad_norm": 0.07236936825950281, |
|
"learning_rate": 1.7111111111111112e-05, |
|
"loss": 0.7623, |
|
"mean_token_accuracy": 0.7704382284365983, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.8657047724750278, |
|
"grad_norm": 0.06927292766824548, |
|
"learning_rate": 1.7333333333333336e-05, |
|
"loss": 0.752, |
|
"mean_token_accuracy": 0.7727048211606123, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.876803551609323, |
|
"grad_norm": 0.07436093330373601, |
|
"learning_rate": 1.7555555555555556e-05, |
|
"loss": 0.7332, |
|
"mean_token_accuracy": 0.7787908082397291, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"grad_norm": 0.07000272665969048, |
|
"learning_rate": 1.7777777777777777e-05, |
|
"loss": 0.7595, |
|
"mean_token_accuracy": 0.7709150293562277, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"eval_loss": 0.7792394161224365, |
|
"eval_mean_token_accuracy": 0.7637743885515423, |
|
"eval_runtime": 2.5025, |
|
"eval_samples_per_second": 51.549, |
|
"eval_steps_per_second": 4.396, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8990011098779135, |
|
"grad_norm": 0.07828773197571512, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.75, |
|
"mean_token_accuracy": 0.7742035226119346, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9100998890122086, |
|
"grad_norm": 0.0786800574481835, |
|
"learning_rate": 1.8222222222222224e-05, |
|
"loss": 0.7458, |
|
"mean_token_accuracy": 0.7740958458322968, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9211986681465039, |
|
"grad_norm": 0.06723168272062767, |
|
"learning_rate": 1.8444444444444448e-05, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.7794692081483484, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.9322974472807991, |
|
"grad_norm": 0.07897130908429875, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.7431, |
|
"mean_token_accuracy": 0.7757018190238035, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 0.07871638828751878, |
|
"learning_rate": 1.888888888888889e-05, |
|
"loss": 0.7489, |
|
"mean_token_accuracy": 0.7737513410604485, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9544950055493896, |
|
"grad_norm": 0.0766181271179252, |
|
"learning_rate": 1.9111111111111113e-05, |
|
"loss": 0.7394, |
|
"mean_token_accuracy": 0.7765881795651786, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9655937846836848, |
|
"grad_norm": 0.07235155791983158, |
|
"learning_rate": 1.9333333333333333e-05, |
|
"loss": 0.7359, |
|
"mean_token_accuracy": 0.7773230673527788, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.97669256381798, |
|
"grad_norm": 0.06718122861403932, |
|
"learning_rate": 1.9555555555555557e-05, |
|
"loss": 0.7451, |
|
"mean_token_accuracy": 0.7741639861854092, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.9877913429522752, |
|
"grad_norm": 0.06979726690486535, |
|
"learning_rate": 1.977777777777778e-05, |
|
"loss": 0.7446, |
|
"mean_token_accuracy": 0.7742892173350467, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.9988901220865705, |
|
"grad_norm": 0.08187524092621244, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7618, |
|
"mean_token_accuracy": 0.7693789734522902, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0088790233074363, |
|
"grad_norm": 0.07032602395899457, |
|
"learning_rate": 1.9999924785916563e-05, |
|
"loss": 0.7212, |
|
"mean_token_accuracy": 0.7807686469529602, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.0199778024417314, |
|
"grad_norm": 0.07847380211681673, |
|
"learning_rate": 1.999969914479768e-05, |
|
"loss": 0.7193, |
|
"mean_token_accuracy": 0.7805523329120769, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0310765815760266, |
|
"grad_norm": 0.07307725873740153, |
|
"learning_rate": 1.9999323080037623e-05, |
|
"loss": 0.7265, |
|
"mean_token_accuracy": 0.7779903005243214, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.042175360710322, |
|
"grad_norm": 0.07669795019824106, |
|
"learning_rate": 1.9998796597293477e-05, |
|
"loss": 0.7198, |
|
"mean_token_accuracy": 0.779993979104892, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.053274139844617, |
|
"grad_norm": 0.0792411711933711, |
|
"learning_rate": 1.9998119704485016e-05, |
|
"loss": 0.72, |
|
"mean_token_accuracy": 0.7801120305583913, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.0643729189789124, |
|
"grad_norm": 0.07886279995421645, |
|
"learning_rate": 1.999729241179462e-05, |
|
"loss": 0.7138, |
|
"mean_token_accuracy": 0.781635200170369, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0754716981132075, |
|
"grad_norm": 0.07435886102846764, |
|
"learning_rate": 1.9996314731667096e-05, |
|
"loss": 0.7076, |
|
"mean_token_accuracy": 0.7830717794390655, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.0865704772475029, |
|
"grad_norm": 0.07230226563190868, |
|
"learning_rate": 1.9995186678809513e-05, |
|
"loss": 0.7033, |
|
"mean_token_accuracy": 0.7850300188021543, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.097669256381798, |
|
"grad_norm": 0.07003963622013061, |
|
"learning_rate": 1.999390827019096e-05, |
|
"loss": 0.7044, |
|
"mean_token_accuracy": 0.7842776950780501, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.1087680355160932, |
|
"grad_norm": 0.07508778652915178, |
|
"learning_rate": 1.9992479525042305e-05, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.774935390675619, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1087680355160932, |
|
"eval_loss": 0.7690628170967102, |
|
"eval_mean_token_accuracy": 0.7655783087569432, |
|
"eval_runtime": 2.4973, |
|
"eval_samples_per_second": 51.656, |
|
"eval_steps_per_second": 4.405, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1198668146503885, |
|
"grad_norm": 0.07918378464816227, |
|
"learning_rate": 1.9990900464855895e-05, |
|
"loss": 0.7006, |
|
"mean_token_accuracy": 0.7857579260565195, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.1309655937846836, |
|
"grad_norm": 0.07328427894357584, |
|
"learning_rate": 1.998917111338525e-05, |
|
"loss": 0.725, |
|
"mean_token_accuracy": 0.7775817733453466, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.142064372918979, |
|
"grad_norm": 0.07204080045449197, |
|
"learning_rate": 1.998729149664468e-05, |
|
"loss": 0.7016, |
|
"mean_token_accuracy": 0.7855126398407342, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.1531631520532741, |
|
"grad_norm": 0.08533628416371716, |
|
"learning_rate": 1.9985261642908917e-05, |
|
"loss": 0.7292, |
|
"mean_token_accuracy": 0.7774274235941383, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1642619311875695, |
|
"grad_norm": 0.07520433809088832, |
|
"learning_rate": 1.9983081582712684e-05, |
|
"loss": 0.7181, |
|
"mean_token_accuracy": 0.780462785723534, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.1753607103218646, |
|
"grad_norm": 0.07190307670380537, |
|
"learning_rate": 1.998075134885022e-05, |
|
"loss": 0.7116, |
|
"mean_token_accuracy": 0.7825697671237573, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1864594894561598, |
|
"grad_norm": 0.06491993297695452, |
|
"learning_rate": 1.9978270976374813e-05, |
|
"loss": 0.6703, |
|
"mean_token_accuracy": 0.793881569205478, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.197558268590455, |
|
"grad_norm": 0.06911870719514304, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 0.7082, |
|
"mean_token_accuracy": 0.7836861901228346, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2086570477247502, |
|
"grad_norm": 0.07437166520951717, |
|
"learning_rate": 1.9972859967090253e-05, |
|
"loss": 0.7264, |
|
"mean_token_accuracy": 0.7775489198837382, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.2197558268590456, |
|
"grad_norm": 0.06775894355173918, |
|
"learning_rate": 1.996992941167792e-05, |
|
"loss": 0.7133, |
|
"mean_token_accuracy": 0.7810329813258085, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2308546059933407, |
|
"grad_norm": 0.07314137750187506, |
|
"learning_rate": 1.996684888044506e-05, |
|
"loss": 0.7037, |
|
"mean_token_accuracy": 0.7839758348181621, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.2419533851276359, |
|
"grad_norm": 0.07831492126639587, |
|
"learning_rate": 1.996361841973154e-05, |
|
"loss": 0.7126, |
|
"mean_token_accuracy": 0.7816916858828595, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.2530521642619312, |
|
"grad_norm": 0.07517018503795618, |
|
"learning_rate": 1.996023807813258e-05, |
|
"loss": 0.6941, |
|
"mean_token_accuracy": 0.7867524535551568, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.2641509433962264, |
|
"grad_norm": 0.0704338781552015, |
|
"learning_rate": 1.9956707906498046e-05, |
|
"loss": 0.6903, |
|
"mean_token_accuracy": 0.7879095356059462, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.2752497225305217, |
|
"grad_norm": 0.06995474295091916, |
|
"learning_rate": 1.9953027957931658e-05, |
|
"loss": 0.7222, |
|
"mean_token_accuracy": 0.7794008881208037, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.2863485016648168, |
|
"grad_norm": 0.07694713494942496, |
|
"learning_rate": 1.9949198287790215e-05, |
|
"loss": 0.6943, |
|
"mean_token_accuracy": 0.7870831177107039, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.297447280799112, |
|
"grad_norm": 0.08903403872621385, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 0.7206, |
|
"mean_token_accuracy": 0.779562023576681, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.3085460599334073, |
|
"grad_norm": 0.07363661521999505, |
|
"learning_rate": 1.9941090015469614e-05, |
|
"loss": 0.724, |
|
"mean_token_accuracy": 0.7783052999023876, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.3196448390677027, |
|
"grad_norm": 0.07544765033521425, |
|
"learning_rate": 1.9936811535261714e-05, |
|
"loss": 0.712, |
|
"mean_token_accuracy": 0.7817552803483336, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.3307436182019978, |
|
"grad_norm": 0.07392484759536327, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 0.7227, |
|
"mean_token_accuracy": 0.7788691622542909, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3307436182019978, |
|
"eval_loss": 0.7600793838500977, |
|
"eval_mean_token_accuracy": 0.7673947304714178, |
|
"eval_runtime": 2.5058, |
|
"eval_samples_per_second": 51.48, |
|
"eval_steps_per_second": 4.39, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.341842397336293, |
|
"grad_norm": 0.07911913461484033, |
|
"learning_rate": 1.9927806208551718e-05, |
|
"loss": 0.7093, |
|
"mean_token_accuracy": 0.782579034246759, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.0642347895827473, |
|
"learning_rate": 1.99230794975151e-05, |
|
"loss": 0.7212, |
|
"mean_token_accuracy": 0.7787558227928784, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.3640399556048834, |
|
"grad_norm": 0.067833068280622, |
|
"learning_rate": 1.9918203515412616e-05, |
|
"loss": 0.7185, |
|
"mean_token_accuracy": 0.7798822363644213, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.3751387347391786, |
|
"grad_norm": 0.0688915478969926, |
|
"learning_rate": 1.9913178335592784e-05, |
|
"loss": 0.7249, |
|
"mean_token_accuracy": 0.7776480990593553, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.386237513873474, |
|
"grad_norm": 0.06744773171181109, |
|
"learning_rate": 1.9908004033648452e-05, |
|
"loss": 0.6984, |
|
"mean_token_accuracy": 0.785427300541276, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.397336293007769, |
|
"grad_norm": 0.07602324779465816, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 0.7138, |
|
"mean_token_accuracy": 0.7809259935769812, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.4084350721420644, |
|
"grad_norm": 0.07420737656548172, |
|
"learning_rate": 1.9897208376972655e-05, |
|
"loss": 0.7208, |
|
"mean_token_accuracy": 0.7793231378215638, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.4195338512763596, |
|
"grad_norm": 0.07343563458085059, |
|
"learning_rate": 1.9891587184638274e-05, |
|
"loss": 0.7221, |
|
"mean_token_accuracy": 0.7778519855839254, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.430632630410655, |
|
"grad_norm": 0.07698384093927461, |
|
"learning_rate": 1.9885817194971116e-05, |
|
"loss": 0.7137, |
|
"mean_token_accuracy": 0.7805180143823278, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.44173140954495, |
|
"grad_norm": 0.07449182501302304, |
|
"learning_rate": 1.9879898494768093e-05, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.7831931362540656, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4528301886792452, |
|
"grad_norm": 0.07581066932689472, |
|
"learning_rate": 1.9873831173063113e-05, |
|
"loss": 0.7049, |
|
"mean_token_accuracy": 0.7832020777520368, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.4639289678135405, |
|
"grad_norm": 0.06949135831136043, |
|
"learning_rate": 1.9867615321125796e-05, |
|
"loss": 0.7056, |
|
"mean_token_accuracy": 0.7838479455915908, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.4750277469478357, |
|
"grad_norm": 0.07439968483839138, |
|
"learning_rate": 1.9861251032460053e-05, |
|
"loss": 0.7081, |
|
"mean_token_accuracy": 0.7832292936410306, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.486126526082131, |
|
"grad_norm": 0.06607770608293857, |
|
"learning_rate": 1.9854738402802715e-05, |
|
"loss": 0.6932, |
|
"mean_token_accuracy": 0.786992797754289, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4972253052164262, |
|
"grad_norm": 0.07053844606053124, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.6965, |
|
"mean_token_accuracy": 0.7860283399251268, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.5083240843507215, |
|
"grad_norm": 0.0651242671351587, |
|
"learning_rate": 1.9841268514616434e-05, |
|
"loss": 0.7181, |
|
"mean_token_accuracy": 0.7788762916398921, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.5194228634850167, |
|
"grad_norm": 0.06916533407392902, |
|
"learning_rate": 1.9834311458712547e-05, |
|
"loss": 0.7076, |
|
"mean_token_accuracy": 0.7832109400921368, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.5305216426193118, |
|
"grad_norm": 0.07466081988250255, |
|
"learning_rate": 1.9827206467064133e-05, |
|
"loss": 0.7074, |
|
"mean_token_accuracy": 0.7828159386074006, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.5416204217536071, |
|
"grad_norm": 0.06967673384995965, |
|
"learning_rate": 1.9819953646550286e-05, |
|
"loss": 0.7003, |
|
"mean_token_accuracy": 0.7843127898869101, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.5527192008879025, |
|
"grad_norm": 0.07129452144368828, |
|
"learning_rate": 1.9812553106273848e-05, |
|
"loss": 0.6914, |
|
"mean_token_accuracy": 0.787352826911644, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5527192008879025, |
|
"eval_loss": 0.7521222829818726, |
|
"eval_mean_token_accuracy": 0.7691004239798882, |
|
"eval_runtime": 2.4983, |
|
"eval_samples_per_second": 51.635, |
|
"eval_steps_per_second": 4.403, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5638179800221974, |
|
"grad_norm": 0.07293730779046935, |
|
"learning_rate": 1.9805004957559795e-05, |
|
"loss": 0.6921, |
|
"mean_token_accuracy": 0.7870082715054341, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.5749167591564928, |
|
"grad_norm": 0.06857400596374212, |
|
"learning_rate": 1.979730931395354e-05, |
|
"loss": 0.7156, |
|
"mean_token_accuracy": 0.7806777176754099, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.5860155382907881, |
|
"grad_norm": 0.06787546605095558, |
|
"learning_rate": 1.9789466291219246e-05, |
|
"loss": 0.7006, |
|
"mean_token_accuracy": 0.7845560051830258, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.5971143174250833, |
|
"grad_norm": 0.06978512332784513, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 0.7063, |
|
"mean_token_accuracy": 0.7831825376761976, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.6082130965593784, |
|
"grad_norm": 0.06573978556995788, |
|
"learning_rate": 1.9773338582506357e-05, |
|
"loss": 0.6974, |
|
"mean_token_accuracy": 0.7848375942024458, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.6193118756936737, |
|
"grad_norm": 0.06846729263039855, |
|
"learning_rate": 1.976505413913393e-05, |
|
"loss": 0.7304, |
|
"mean_token_accuracy": 0.7762347996924202, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.6304106548279689, |
|
"grad_norm": 0.07292716460344072, |
|
"learning_rate": 1.9756622801842144e-05, |
|
"loss": 0.6945, |
|
"mean_token_accuracy": 0.785635234248747, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.641509433962264, |
|
"grad_norm": 0.06788764902071766, |
|
"learning_rate": 1.974804469746206e-05, |
|
"loss": 0.6871, |
|
"mean_token_accuracy": 0.7883116176033117, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.6526082130965594, |
|
"grad_norm": 0.06990606289840068, |
|
"learning_rate": 1.9739319955032522e-05, |
|
"loss": 0.7105, |
|
"mean_token_accuracy": 0.7815165621962155, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.6637069922308547, |
|
"grad_norm": 0.06745401830931097, |
|
"learning_rate": 1.973044870579824e-05, |
|
"loss": 0.6901, |
|
"mean_token_accuracy": 0.7868986592436943, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.6748057713651499, |
|
"grad_norm": 0.06884276535878785, |
|
"learning_rate": 1.9721431083207786e-05, |
|
"loss": 0.7096, |
|
"mean_token_accuracy": 0.7823803797267934, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.685904550499445, |
|
"grad_norm": 0.06879555042165823, |
|
"learning_rate": 1.9712267222911605e-05, |
|
"loss": 0.6971, |
|
"mean_token_accuracy": 0.7852152497634741, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.6970033296337403, |
|
"grad_norm": 0.07979933050534435, |
|
"learning_rate": 1.9702957262759964e-05, |
|
"loss": 0.714, |
|
"mean_token_accuracy": 0.7805189578063543, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.7081021087680355, |
|
"grad_norm": 0.07357866691629775, |
|
"learning_rate": 1.9693501342800895e-05, |
|
"loss": 0.6874, |
|
"mean_token_accuracy": 0.7879884917047848, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.7192008879023306, |
|
"grad_norm": 0.0701902920875424, |
|
"learning_rate": 1.9683899605278062e-05, |
|
"loss": 0.7159, |
|
"mean_token_accuracy": 0.7792839420183199, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.730299667036626, |
|
"grad_norm": 0.06842656933800775, |
|
"learning_rate": 1.967415219462864e-05, |
|
"loss": 0.6942, |
|
"mean_token_accuracy": 0.7865705838505347, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.7413984461709213, |
|
"grad_norm": 0.06540618097934706, |
|
"learning_rate": 1.966425925748115e-05, |
|
"loss": 0.6898, |
|
"mean_token_accuracy": 0.7874069573881058, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.7524972253052165, |
|
"grad_norm": 0.06700381366809494, |
|
"learning_rate": 1.9654220942653223e-05, |
|
"loss": 0.7148, |
|
"mean_token_accuracy": 0.7804010777085539, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.7635960044395116, |
|
"grad_norm": 0.06251703724168588, |
|
"learning_rate": 1.964403740114939e-05, |
|
"loss": 0.7042, |
|
"mean_token_accuracy": 0.7826519820274497, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.774694783573807, |
|
"grad_norm": 0.06965212134485024, |
|
"learning_rate": 1.9633708786158803e-05, |
|
"loss": 0.7041, |
|
"mean_token_accuracy": 0.7828340916547577, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.774694783573807, |
|
"eval_loss": 0.7456536293029785, |
|
"eval_mean_token_accuracy": 0.7707645481418179, |
|
"eval_runtime": 2.5019, |
|
"eval_samples_per_second": 51.561, |
|
"eval_steps_per_second": 4.397, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.785793562708102, |
|
"grad_norm": 0.06629236648953662, |
|
"learning_rate": 1.9623235253052924e-05, |
|
"loss": 0.7031, |
|
"mean_token_accuracy": 0.783023400482328, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.7968923418423972, |
|
"grad_norm": 0.0700851146503613, |
|
"learning_rate": 1.961261695938319e-05, |
|
"loss": 0.6879, |
|
"mean_token_accuracy": 0.7871533891419487, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.8079911209766926, |
|
"grad_norm": 0.06481275430876884, |
|
"learning_rate": 1.9601854064878645e-05, |
|
"loss": 0.6976, |
|
"mean_token_accuracy": 0.7848978893968511, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.819089900110988, |
|
"grad_norm": 0.06713656020683173, |
|
"learning_rate": 1.959094673144354e-05, |
|
"loss": 0.6972, |
|
"mean_token_accuracy": 0.7849619060835862, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.830188679245283, |
|
"grad_norm": 0.06943806825337467, |
|
"learning_rate": 1.957989512315489e-05, |
|
"loss": 0.7057, |
|
"mean_token_accuracy": 0.7820006305173784, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.8412874583795782, |
|
"grad_norm": 0.07046613499341978, |
|
"learning_rate": 1.9568699406260016e-05, |
|
"loss": 0.6993, |
|
"mean_token_accuracy": 0.7842056521277757, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.8523862375138735, |
|
"grad_norm": 0.06742551538991974, |
|
"learning_rate": 1.9557359749174033e-05, |
|
"loss": 0.6937, |
|
"mean_token_accuracy": 0.7860851352389128, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.8634850166481687, |
|
"grad_norm": 0.06568715165754793, |
|
"learning_rate": 1.954587632247732e-05, |
|
"loss": 0.7206, |
|
"mean_token_accuracy": 0.7770859098863055, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.8745837957824638, |
|
"grad_norm": 0.07469838406522963, |
|
"learning_rate": 1.9534249298912968e-05, |
|
"loss": 0.7003, |
|
"mean_token_accuracy": 0.7843834053290605, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.8856825749167592, |
|
"grad_norm": 0.07128217656144081, |
|
"learning_rate": 1.9522478853384154e-05, |
|
"loss": 0.7094, |
|
"mean_token_accuracy": 0.780810941894696, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.8967813540510545, |
|
"grad_norm": 0.06973978771104394, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.6922, |
|
"mean_token_accuracy": 0.7863895170050037, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.9078801331853497, |
|
"grad_norm": 0.0670621119728851, |
|
"learning_rate": 1.9498508406830577e-05, |
|
"loss": 0.7063, |
|
"mean_token_accuracy": 0.7823195111378769, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.9189789123196448, |
|
"grad_norm": 0.07134711572023682, |
|
"learning_rate": 1.9486308766388843e-05, |
|
"loss": 0.7145, |
|
"mean_token_accuracy": 0.7796630421809072, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.9300776914539401, |
|
"grad_norm": 0.06724287084560662, |
|
"learning_rate": 1.9473966425143292e-05, |
|
"loss": 0.6996, |
|
"mean_token_accuracy": 0.7833654062751905, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.9411764705882353, |
|
"grad_norm": 0.07372692352709145, |
|
"learning_rate": 1.946148156875751e-05, |
|
"loss": 0.7108, |
|
"mean_token_accuracy": 0.7810817039400682, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.9522752497225304, |
|
"grad_norm": 0.06690981599243406, |
|
"learning_rate": 1.944885438503888e-05, |
|
"loss": 0.7219, |
|
"mean_token_accuracy": 0.778134444662938, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.9633740288568258, |
|
"grad_norm": 0.0706861843071515, |
|
"learning_rate": 1.9436085063935837e-05, |
|
"loss": 0.6837, |
|
"mean_token_accuracy": 0.7875059272645717, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.9744728079911211, |
|
"grad_norm": 0.06607352029909119, |
|
"learning_rate": 1.9423173797534924e-05, |
|
"loss": 0.7144, |
|
"mean_token_accuracy": 0.7800783446443771, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.9855715871254163, |
|
"grad_norm": 0.06497044410128139, |
|
"learning_rate": 1.9410120780057958e-05, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.7838718452599522, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.9966703662597114, |
|
"grad_norm": 0.0669833105277842, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.7017, |
|
"mean_token_accuracy": 0.783079952098619, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9966703662597114, |
|
"eval_loss": 0.7399606108665466, |
|
"eval_mean_token_accuracy": 0.7714698001679475, |
|
"eval_runtime": 2.502, |
|
"eval_samples_per_second": 51.559, |
|
"eval_steps_per_second": 4.397, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.006659267480577, |
|
"grad_norm": 0.06752952481322469, |
|
"learning_rate": 1.938359027942184e-05, |
|
"loss": 0.6904, |
|
"mean_token_accuracy": 0.7866080115747632, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 2.0177580466148726, |
|
"grad_norm": 0.07337066846619034, |
|
"learning_rate": 1.937011319535615e-05, |
|
"loss": 0.6648, |
|
"mean_token_accuracy": 0.7914878464945215, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.0288568257491675, |
|
"grad_norm": 0.07482816766070267, |
|
"learning_rate": 1.9356495158395317e-05, |
|
"loss": 0.6491, |
|
"mean_token_accuracy": 0.7960834519099566, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.039955604883463, |
|
"grad_norm": 0.07602684827791824, |
|
"learning_rate": 1.9342736373392976e-05, |
|
"loss": 0.6328, |
|
"mean_token_accuracy": 0.8012870703759848, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.051054384017758, |
|
"grad_norm": 0.06687084853107801, |
|
"learning_rate": 1.932883704732001e-05, |
|
"loss": 0.6382, |
|
"mean_token_accuracy": 0.7995388906965585, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.062153163152053, |
|
"grad_norm": 0.0674027612147169, |
|
"learning_rate": 1.9314797389261426e-05, |
|
"loss": 0.6398, |
|
"mean_token_accuracy": 0.7987728519453021, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.0732519422863485, |
|
"grad_norm": 0.06627889722925441, |
|
"learning_rate": 1.9300617610413232e-05, |
|
"loss": 0.6379, |
|
"mean_token_accuracy": 0.7998474325494318, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 2.084350721420644, |
|
"grad_norm": 0.06945568860354903, |
|
"learning_rate": 1.9286297924079244e-05, |
|
"loss": 0.6191, |
|
"mean_token_accuracy": 0.8055909240777206, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.095449500554939, |
|
"grad_norm": 0.07041836195613017, |
|
"learning_rate": 1.9271838545667876e-05, |
|
"loss": 0.6556, |
|
"mean_token_accuracy": 0.7946697171561378, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 2.106548279689234, |
|
"grad_norm": 0.06688242487342114, |
|
"learning_rate": 1.9257239692688907e-05, |
|
"loss": 0.6574, |
|
"mean_token_accuracy": 0.7937854444231515, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.1176470588235294, |
|
"grad_norm": 0.06439046861743944, |
|
"learning_rate": 1.92425015847502e-05, |
|
"loss": 0.6403, |
|
"mean_token_accuracy": 0.7991636690728392, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 2.128745837957825, |
|
"grad_norm": 0.06720995663335921, |
|
"learning_rate": 1.9227624443554425e-05, |
|
"loss": 0.6453, |
|
"mean_token_accuracy": 0.7970826233622977, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.1398446170921197, |
|
"grad_norm": 0.06581057685199704, |
|
"learning_rate": 1.921260849289568e-05, |
|
"loss": 0.6232, |
|
"mean_token_accuracy": 0.8048729584925922, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.150943396226415, |
|
"grad_norm": 0.06501306496885344, |
|
"learning_rate": 1.9197453958656157e-05, |
|
"loss": 0.6399, |
|
"mean_token_accuracy": 0.7984237345184382, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.1620421753607104, |
|
"grad_norm": 0.06827894352599473, |
|
"learning_rate": 1.9182161068802742e-05, |
|
"loss": 0.6495, |
|
"mean_token_accuracy": 0.795851682529419, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.1731409544950058, |
|
"grad_norm": 0.06461558807425297, |
|
"learning_rate": 1.916673005338357e-05, |
|
"loss": 0.656, |
|
"mean_token_accuracy": 0.794067004713519, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.1842397336293007, |
|
"grad_norm": 0.07270995750344725, |
|
"learning_rate": 1.915116114452458e-05, |
|
"loss": 0.6385, |
|
"mean_token_accuracy": 0.7996120848528411, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.195338512763596, |
|
"grad_norm": 0.06469442541114857, |
|
"learning_rate": 1.913545457642601e-05, |
|
"loss": 0.6479, |
|
"mean_token_accuracy": 0.7971225711022437, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.2064372918978914, |
|
"grad_norm": 0.06869297358951969, |
|
"learning_rate": 1.911961058535889e-05, |
|
"loss": 0.6184, |
|
"mean_token_accuracy": 0.8051116219349698, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.2175360710321863, |
|
"grad_norm": 0.06879744937055413, |
|
"learning_rate": 1.9103629409661468e-05, |
|
"loss": 0.6619, |
|
"mean_token_accuracy": 0.7926388777759388, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.2175360710321863, |
|
"eval_loss": 0.7441337704658508, |
|
"eval_mean_token_accuracy": 0.771140236457519, |
|
"eval_runtime": 2.5113, |
|
"eval_samples_per_second": 51.369, |
|
"eval_steps_per_second": 4.38, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.2286348501664817, |
|
"grad_norm": 0.06942438285553687, |
|
"learning_rate": 1.9087511289735646e-05, |
|
"loss": 0.6319, |
|
"mean_token_accuracy": 0.8006943272195789, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.239733629300777, |
|
"grad_norm": 0.07156492009528506, |
|
"learning_rate": 1.907125646804334e-05, |
|
"loss": 0.66, |
|
"mean_token_accuracy": 0.793213293023644, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.2508324084350724, |
|
"grad_norm": 0.0725060557022843, |
|
"learning_rate": 1.905486518910286e-05, |
|
"loss": 0.6675, |
|
"mean_token_accuracy": 0.7907495163202596, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 2.2619311875693673, |
|
"grad_norm": 0.06665264760259867, |
|
"learning_rate": 1.9038337699485207e-05, |
|
"loss": 0.6574, |
|
"mean_token_accuracy": 0.7942814820609995, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.2730299667036626, |
|
"grad_norm": 0.06471591371274955, |
|
"learning_rate": 1.902167424781038e-05, |
|
"loss": 0.6518, |
|
"mean_token_accuracy": 0.7957625783260307, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.284128745837958, |
|
"grad_norm": 0.06342644925790648, |
|
"learning_rate": 1.9004875084743624e-05, |
|
"loss": 0.6441, |
|
"mean_token_accuracy": 0.7975496945585403, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.295227524972253, |
|
"grad_norm": 0.06887554911088101, |
|
"learning_rate": 1.8987940462991673e-05, |
|
"loss": 0.6354, |
|
"mean_token_accuracy": 0.8001512281519416, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 2.3063263041065483, |
|
"grad_norm": 0.06901162427572825, |
|
"learning_rate": 1.8970870637298936e-05, |
|
"loss": 0.6553, |
|
"mean_token_accuracy": 0.7942396405885372, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.3174250832408436, |
|
"grad_norm": 0.06479294329676767, |
|
"learning_rate": 1.895366586444367e-05, |
|
"loss": 0.6583, |
|
"mean_token_accuracy": 0.7935195245732686, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 2.328523862375139, |
|
"grad_norm": 0.06989659802389134, |
|
"learning_rate": 1.8936326403234125e-05, |
|
"loss": 0.652, |
|
"mean_token_accuracy": 0.7949510683065234, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.339622641509434, |
|
"grad_norm": 0.06954996933553559, |
|
"learning_rate": 1.8918852514504632e-05, |
|
"loss": 0.6385, |
|
"mean_token_accuracy": 0.799856158538445, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 2.3507214206437292, |
|
"grad_norm": 0.07092170466098426, |
|
"learning_rate": 1.8901244461111697e-05, |
|
"loss": 0.6269, |
|
"mean_token_accuracy": 0.8034097121418204, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.3618201997780246, |
|
"grad_norm": 0.06217401891589138, |
|
"learning_rate": 1.8883502507930044e-05, |
|
"loss": 0.6395, |
|
"mean_token_accuracy": 0.7991042823069772, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 2.3729189789123195, |
|
"grad_norm": 0.0662172520804478, |
|
"learning_rate": 1.8865626921848615e-05, |
|
"loss": 0.6451, |
|
"mean_token_accuracy": 0.7974116719314959, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.384017758046615, |
|
"grad_norm": 0.07112233857552575, |
|
"learning_rate": 1.8847617971766577e-05, |
|
"loss": 0.653, |
|
"mean_token_accuracy": 0.7955020926307707, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.39511653718091, |
|
"grad_norm": 0.07214209721888336, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.649, |
|
"mean_token_accuracy": 0.7957132739895204, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.4062153163152056, |
|
"grad_norm": 0.07301580591387644, |
|
"learning_rate": 1.8811201065224122e-05, |
|
"loss": 0.6622, |
|
"mean_token_accuracy": 0.7918022645898184, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.4173140954495005, |
|
"grad_norm": 0.06822261694746347, |
|
"learning_rate": 1.8792793656576544e-05, |
|
"loss": 0.6502, |
|
"mean_token_accuracy": 0.795963781486279, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.428412874583796, |
|
"grad_norm": 0.06893747302221427, |
|
"learning_rate": 1.877425397954582e-05, |
|
"loss": 0.6587, |
|
"mean_token_accuracy": 0.7934014423473963, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.439511653718091, |
|
"grad_norm": 0.0674145014890421, |
|
"learning_rate": 1.8755582313020912e-05, |
|
"loss": 0.6257, |
|
"mean_token_accuracy": 0.8032613355575334, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.439511653718091, |
|
"eval_loss": 0.7397039532661438, |
|
"eval_mean_token_accuracy": 0.7719497194970454, |
|
"eval_runtime": 2.5114, |
|
"eval_samples_per_second": 51.367, |
|
"eval_steps_per_second": 4.38, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.450610432852386, |
|
"grad_norm": 0.06748775143884418, |
|
"learning_rate": 1.873677893787627e-05, |
|
"loss": 0.6545, |
|
"mean_token_accuracy": 0.7947147115927367, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 2.4617092119866815, |
|
"grad_norm": 0.06642880789873921, |
|
"learning_rate": 1.8717844136967626e-05, |
|
"loss": 0.657, |
|
"mean_token_accuracy": 0.7933624891798241, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.472807991120977, |
|
"grad_norm": 0.072992943391947, |
|
"learning_rate": 1.8698778195127715e-05, |
|
"loss": 0.6569, |
|
"mean_token_accuracy": 0.7930798499718653, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 2.4839067702552717, |
|
"grad_norm": 0.0676737777720738, |
|
"learning_rate": 1.8679581399162008e-05, |
|
"loss": 0.6461, |
|
"mean_token_accuracy": 0.7969763117340545, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.495005549389567, |
|
"grad_norm": 0.0711336163435986, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.6202, |
|
"mean_token_accuracy": 0.8044824253292406, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 2.5061043285238624, |
|
"grad_norm": 0.0697843091991907, |
|
"learning_rate": 1.8640796401912805e-05, |
|
"loss": 0.6353, |
|
"mean_token_accuracy": 0.8006583544576105, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.5172031076581574, |
|
"grad_norm": 0.066545120562253, |
|
"learning_rate": 1.8621208784064913e-05, |
|
"loss": 0.6652, |
|
"mean_token_accuracy": 0.7910918076425002, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 2.5283018867924527, |
|
"grad_norm": 0.06454289193586986, |
|
"learning_rate": 1.860149147895366e-05, |
|
"loss": 0.6276, |
|
"mean_token_accuracy": 0.8020308318948542, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.539400665926748, |
|
"grad_norm": 0.06648253339239633, |
|
"learning_rate": 1.8581644783182837e-05, |
|
"loss": 0.634, |
|
"mean_token_accuracy": 0.8003396054716794, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 2.5504994450610434, |
|
"grad_norm": 0.060198174794412355, |
|
"learning_rate": 1.8561668995302668e-05, |
|
"loss": 0.6469, |
|
"mean_token_accuracy": 0.7973357638528167, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.561598224195339, |
|
"grad_norm": 0.06659840562861344, |
|
"learning_rate": 1.854156441580526e-05, |
|
"loss": 0.6467, |
|
"mean_token_accuracy": 0.7969091757435878, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 2.5726970033296337, |
|
"grad_norm": 0.06563698320336359, |
|
"learning_rate": 1.8521331347120116e-05, |
|
"loss": 0.6381, |
|
"mean_token_accuracy": 0.7992970667738716, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.583795782463929, |
|
"grad_norm": 0.06478226257486022, |
|
"learning_rate": 1.850097009360958e-05, |
|
"loss": 0.6426, |
|
"mean_token_accuracy": 0.7983039033131962, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 2.594894561598224, |
|
"grad_norm": 0.0664676569567307, |
|
"learning_rate": 1.848048096156426e-05, |
|
"loss": 0.6347, |
|
"mean_token_accuracy": 0.8004106791529967, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.6059933407325193, |
|
"grad_norm": 0.07085311611322727, |
|
"learning_rate": 1.845986425919841e-05, |
|
"loss": 0.6606, |
|
"mean_token_accuracy": 0.7914169985202615, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 2.6170921198668147, |
|
"grad_norm": 0.07197033028656963, |
|
"learning_rate": 1.843912029664531e-05, |
|
"loss": 0.6492, |
|
"mean_token_accuracy": 0.7963753925899613, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.62819089900111, |
|
"grad_norm": 0.06511918055148407, |
|
"learning_rate": 1.8418249385952575e-05, |
|
"loss": 0.622, |
|
"mean_token_accuracy": 0.8041274702036085, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 2.6392896781354054, |
|
"grad_norm": 0.06339181942451623, |
|
"learning_rate": 1.83972518410775e-05, |
|
"loss": 0.6403, |
|
"mean_token_accuracy": 0.7988095012565483, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.6503884572697003, |
|
"grad_norm": 0.06237858298480496, |
|
"learning_rate": 1.8376127977882294e-05, |
|
"loss": 0.6225, |
|
"mean_token_accuracy": 0.8035680254950377, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 2.6614872364039956, |
|
"grad_norm": 0.06430635783125142, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 0.6359, |
|
"mean_token_accuracy": 0.7994956540644247, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.6614872364039956, |
|
"eval_loss": 0.7367499470710754, |
|
"eval_mean_token_accuracy": 0.7725689509903593, |
|
"eval_runtime": 2.5066, |
|
"eval_samples_per_second": 51.465, |
|
"eval_steps_per_second": 4.388, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.6725860155382906, |
|
"grad_norm": 0.07033787900057323, |
|
"learning_rate": 1.8333502569476516e-05, |
|
"loss": 0.6474, |
|
"mean_token_accuracy": 0.7964749010659795, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 2.683684794672586, |
|
"grad_norm": 0.06484833686602164, |
|
"learning_rate": 1.8312001665472146e-05, |
|
"loss": 0.6496, |
|
"mean_token_accuracy": 0.7958858397960623, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.6947835738068813, |
|
"grad_norm": 0.0626772586359051, |
|
"learning_rate": 1.8290375725550417e-05, |
|
"loss": 0.6255, |
|
"mean_token_accuracy": 0.8037525321584809, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 2.7058823529411766, |
|
"grad_norm": 0.06159318683983137, |
|
"learning_rate": 1.8268625075026375e-05, |
|
"loss": 0.6501, |
|
"mean_token_accuracy": 0.7965229989957979, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.7169811320754715, |
|
"grad_norm": 0.0618598047924171, |
|
"learning_rate": 1.824675004109107e-05, |
|
"loss": 0.6513, |
|
"mean_token_accuracy": 0.7947231244403284, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 2.728079911209767, |
|
"grad_norm": 0.06568273055121288, |
|
"learning_rate": 1.8224750952806626e-05, |
|
"loss": 0.646, |
|
"mean_token_accuracy": 0.7970477905076125, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.7391786903440623, |
|
"grad_norm": 0.06819286679772224, |
|
"learning_rate": 1.8202628141101294e-05, |
|
"loss": 0.6422, |
|
"mean_token_accuracy": 0.7986810100724087, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 2.750277469478357, |
|
"grad_norm": 0.0711841520608327, |
|
"learning_rate": 1.818038193876448e-05, |
|
"loss": 0.6174, |
|
"mean_token_accuracy": 0.8060896278306409, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.7613762486126525, |
|
"grad_norm": 0.06971065510993014, |
|
"learning_rate": 1.8158012680441723e-05, |
|
"loss": 0.6432, |
|
"mean_token_accuracy": 0.7975791405172538, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 2.772475027746948, |
|
"grad_norm": 0.06624207898108019, |
|
"learning_rate": 1.8135520702629677e-05, |
|
"loss": 0.6695, |
|
"mean_token_accuracy": 0.7898385773474513, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.7835738068812432, |
|
"grad_norm": 0.06340268367764915, |
|
"learning_rate": 1.8112906343671045e-05, |
|
"loss": 0.6448, |
|
"mean_token_accuracy": 0.7969842824020124, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 2.794672586015538, |
|
"grad_norm": 0.06802225275898736, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.6352, |
|
"mean_token_accuracy": 0.7999616056672751, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.8057713651498335, |
|
"grad_norm": 0.0621959926252905, |
|
"learning_rate": 1.806731184488447e-05, |
|
"loss": 0.6511, |
|
"mean_token_accuracy": 0.7956011385580143, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 2.816870144284129, |
|
"grad_norm": 0.06958322307757032, |
|
"learning_rate": 1.8044332390926224e-05, |
|
"loss": 0.6546, |
|
"mean_token_accuracy": 0.7944194847521491, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.8279689234184238, |
|
"grad_norm": 0.06499832024961039, |
|
"learning_rate": 1.802123192755044e-05, |
|
"loss": 0.6615, |
|
"mean_token_accuracy": 0.7928298407723101, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 2.839067702552719, |
|
"grad_norm": 0.07307630365245506, |
|
"learning_rate": 1.799801080225316e-05, |
|
"loss": 0.6332, |
|
"mean_token_accuracy": 0.8002019958315876, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.8501664816870145, |
|
"grad_norm": 0.07081063389589305, |
|
"learning_rate": 1.7974669364345518e-05, |
|
"loss": 0.6436, |
|
"mean_token_accuracy": 0.7975661678829284, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 2.86126526082131, |
|
"grad_norm": 0.06364104487158771, |
|
"learning_rate": 1.795120796494848e-05, |
|
"loss": 0.6417, |
|
"mean_token_accuracy": 0.798216180882557, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.8723640399556047, |
|
"grad_norm": 0.07022725039220128, |
|
"learning_rate": 1.7927626956987577e-05, |
|
"loss": 0.6193, |
|
"mean_token_accuracy": 0.8051732373945985, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 2.8834628190899, |
|
"grad_norm": 0.07407116375551813, |
|
"learning_rate": 1.7903926695187595e-05, |
|
"loss": 0.6288, |
|
"mean_token_accuracy": 0.8017575332765929, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.8834628190899, |
|
"eval_loss": 0.7337623834609985, |
|
"eval_mean_token_accuracy": 0.7733067253190368, |
|
"eval_runtime": 2.5114, |
|
"eval_samples_per_second": 51.366, |
|
"eval_steps_per_second": 4.38, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.8945615982241955, |
|
"grad_norm": 0.0683657078818875, |
|
"learning_rate": 1.788010753606722e-05, |
|
"loss": 0.6437, |
|
"mean_token_accuracy": 0.7979937366466249, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 2.9056603773584904, |
|
"grad_norm": 0.06401144030251774, |
|
"learning_rate": 1.78561698379337e-05, |
|
"loss": 0.6525, |
|
"mean_token_accuracy": 0.7952514850245388, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.9167591564927857, |
|
"grad_norm": 0.06939812406872137, |
|
"learning_rate": 1.7832113960877445e-05, |
|
"loss": 0.6326, |
|
"mean_token_accuracy": 0.8011824622832678, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 2.927857935627081, |
|
"grad_norm": 0.06390511262490556, |
|
"learning_rate": 1.7807940266766595e-05, |
|
"loss": 0.6491, |
|
"mean_token_accuracy": 0.7960628539263179, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.9389567147613764, |
|
"grad_norm": 0.0679922767459088, |
|
"learning_rate": 1.7783649119241603e-05, |
|
"loss": 0.6343, |
|
"mean_token_accuracy": 0.8002581088236139, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 2.9500554938956713, |
|
"grad_norm": 0.06579108848028142, |
|
"learning_rate": 1.7759240883709745e-05, |
|
"loss": 0.6244, |
|
"mean_token_accuracy": 0.8036198689328673, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.9611542730299667, |
|
"grad_norm": 0.07202792613693204, |
|
"learning_rate": 1.7734715927339642e-05, |
|
"loss": 0.6553, |
|
"mean_token_accuracy": 0.7938320456683176, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 2.972253052164262, |
|
"grad_norm": 0.0669733486087782, |
|
"learning_rate": 1.7710074619055707e-05, |
|
"loss": 0.6518, |
|
"mean_token_accuracy": 0.7945248169339653, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.983351831298557, |
|
"grad_norm": 0.06844420691058574, |
|
"learning_rate": 1.7685317329532633e-05, |
|
"loss": 0.6561, |
|
"mean_token_accuracy": 0.7932978083640501, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 2.9944506104328523, |
|
"grad_norm": 0.06735395164638033, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.6451, |
|
"mean_token_accuracy": 0.7970097680754145, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.004439511653718, |
|
"grad_norm": 0.07064058723031082, |
|
"learning_rate": 1.7635456298185607e-05, |
|
"loss": 0.6223, |
|
"mean_token_accuracy": 0.8053257851051954, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 3.0155382907880135, |
|
"grad_norm": 0.07521256769387163, |
|
"learning_rate": 1.761035330641201e-05, |
|
"loss": 0.568, |
|
"mean_token_accuracy": 0.8179713362423978, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.0266370699223084, |
|
"grad_norm": 0.06470087882605292, |
|
"learning_rate": 1.7585135833488692e-05, |
|
"loss": 0.5777, |
|
"mean_token_accuracy": 0.8148876311929595, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 3.0377358490566038, |
|
"grad_norm": 0.06747297389090014, |
|
"learning_rate": 1.755980425875748e-05, |
|
"loss": 0.595, |
|
"mean_token_accuracy": 0.8092302889910499, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.048834628190899, |
|
"grad_norm": 0.06590547459656546, |
|
"learning_rate": 1.7534358963276606e-05, |
|
"loss": 0.5944, |
|
"mean_token_accuracy": 0.810452184015394, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 3.059933407325194, |
|
"grad_norm": 0.06771978622523486, |
|
"learning_rate": 1.7508800329814993e-05, |
|
"loss": 0.5857, |
|
"mean_token_accuracy": 0.8124715930556723, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.0710321864594894, |
|
"grad_norm": 0.06878565172783498, |
|
"learning_rate": 1.748312874284647e-05, |
|
"loss": 0.5818, |
|
"mean_token_accuracy": 0.8135985202900582, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 3.0821309655937847, |
|
"grad_norm": 0.06402876021976343, |
|
"learning_rate": 1.7457344588544018e-05, |
|
"loss": 0.5769, |
|
"mean_token_accuracy": 0.8156846142367662, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.09322974472808, |
|
"grad_norm": 0.0646784982224439, |
|
"learning_rate": 1.7431448254773943e-05, |
|
"loss": 0.5798, |
|
"mean_token_accuracy": 0.8142970258740082, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 3.104328523862375, |
|
"grad_norm": 0.06597855288274974, |
|
"learning_rate": 1.740544013109005e-05, |
|
"loss": 0.5884, |
|
"mean_token_accuracy": 0.8115392162041799, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.104328523862375, |
|
"eval_loss": 0.7474381923675537, |
|
"eval_mean_token_accuracy": 0.7717004443797748, |
|
"eval_runtime": 2.5041, |
|
"eval_samples_per_second": 51.515, |
|
"eval_steps_per_second": 4.393, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.1154273029966704, |
|
"grad_norm": 0.06425734459230836, |
|
"learning_rate": 1.7379320608727766e-05, |
|
"loss": 0.6018, |
|
"mean_token_accuracy": 0.8074577019491473, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 3.1265260821309657, |
|
"grad_norm": 0.07122678407183575, |
|
"learning_rate": 1.735309008059829e-05, |
|
"loss": 0.5787, |
|
"mean_token_accuracy": 0.8148202615560042, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.1376248612652606, |
|
"grad_norm": 0.07472861362175476, |
|
"learning_rate": 1.7326748941282638e-05, |
|
"loss": 0.5773, |
|
"mean_token_accuracy": 0.8159486725956425, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 3.148723640399556, |
|
"grad_norm": 0.06940334372384507, |
|
"learning_rate": 1.7300297587025748e-05, |
|
"loss": 0.5751, |
|
"mean_token_accuracy": 0.8157392354437368, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.1598224195338513, |
|
"grad_norm": 0.06891927963787495, |
|
"learning_rate": 1.7273736415730488e-05, |
|
"loss": 0.604, |
|
"mean_token_accuracy": 0.8073721273984527, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 3.1709211986681467, |
|
"grad_norm": 0.06448868374468762, |
|
"learning_rate": 1.7247065826951694e-05, |
|
"loss": 0.5968, |
|
"mean_token_accuracy": 0.8095589722183332, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.1820199778024416, |
|
"grad_norm": 0.06475524602325504, |
|
"learning_rate": 1.7220286221890137e-05, |
|
"loss": 0.6004, |
|
"mean_token_accuracy": 0.8077030508730166, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 3.193118756936737, |
|
"grad_norm": 0.06875614933754029, |
|
"learning_rate": 1.7193398003386514e-05, |
|
"loss": 0.5803, |
|
"mean_token_accuracy": 0.8144615967330615, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.2042175360710323, |
|
"grad_norm": 0.07191758410618443, |
|
"learning_rate": 1.716640157591536e-05, |
|
"loss": 0.5729, |
|
"mean_token_accuracy": 0.8167040368980463, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 3.2153163152053272, |
|
"grad_norm": 0.07506881347861384, |
|
"learning_rate": 1.7139297345578992e-05, |
|
"loss": 0.5904, |
|
"mean_token_accuracy": 0.8116413687428394, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.2264150943396226, |
|
"grad_norm": 0.06683185958843758, |
|
"learning_rate": 1.711208572010137e-05, |
|
"loss": 0.5879, |
|
"mean_token_accuracy": 0.8117346893203707, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 3.237513873473918, |
|
"grad_norm": 0.06384815369613044, |
|
"learning_rate": 1.7084767108822e-05, |
|
"loss": 0.5786, |
|
"mean_token_accuracy": 0.8152051484844588, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.2486126526082133, |
|
"grad_norm": 0.07113482761300648, |
|
"learning_rate": 1.7057341922689733e-05, |
|
"loss": 0.5833, |
|
"mean_token_accuracy": 0.8136391983287101, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 3.259711431742508, |
|
"grad_norm": 0.06875329966774961, |
|
"learning_rate": 1.702981057425662e-05, |
|
"loss": 0.6055, |
|
"mean_token_accuracy": 0.8064521514912204, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.2708102108768036, |
|
"grad_norm": 0.07129076660764946, |
|
"learning_rate": 1.7002173477671685e-05, |
|
"loss": 0.5795, |
|
"mean_token_accuracy": 0.8147054150827504, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 3.281908990011099, |
|
"grad_norm": 0.073218043150243, |
|
"learning_rate": 1.6974431048674714e-05, |
|
"loss": 0.5838, |
|
"mean_token_accuracy": 0.8133932009098818, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.293007769145394, |
|
"grad_norm": 0.06689255732673634, |
|
"learning_rate": 1.6946583704589973e-05, |
|
"loss": 0.597, |
|
"mean_token_accuracy": 0.8088926481283882, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 3.304106548279689, |
|
"grad_norm": 0.06374243658625131, |
|
"learning_rate": 1.691863186431996e-05, |
|
"loss": 0.5905, |
|
"mean_token_accuracy": 0.8113178827657815, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.3152053274139845, |
|
"grad_norm": 0.06295813244992231, |
|
"learning_rate": 1.689057594833908e-05, |
|
"loss": 0.6032, |
|
"mean_token_accuracy": 0.8069019652730617, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 3.32630410654828, |
|
"grad_norm": 0.06579736493219757, |
|
"learning_rate": 1.686241637868734e-05, |
|
"loss": 0.5826, |
|
"mean_token_accuracy": 0.8140389465289355, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.32630410654828, |
|
"eval_loss": 0.7457160353660583, |
|
"eval_mean_token_accuracy": 0.7719505953380342, |
|
"eval_runtime": 2.5024, |
|
"eval_samples_per_second": 51.551, |
|
"eval_steps_per_second": 4.396, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.337402885682575, |
|
"grad_norm": 0.07091243083602151, |
|
"learning_rate": 1.683415357896397e-05, |
|
"loss": 0.5857, |
|
"mean_token_accuracy": 0.8123794127619786, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 3.34850166481687, |
|
"grad_norm": 0.061307383634218, |
|
"learning_rate": 1.6805787974321107e-05, |
|
"loss": 0.5746, |
|
"mean_token_accuracy": 0.8166298304759364, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.3596004439511655, |
|
"grad_norm": 0.07258491640438769, |
|
"learning_rate": 1.6777319991457325e-05, |
|
"loss": 0.597, |
|
"mean_token_accuracy": 0.8087377760474664, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 3.3706992230854604, |
|
"grad_norm": 0.07268211516228792, |
|
"learning_rate": 1.674875005861128e-05, |
|
"loss": 0.5722, |
|
"mean_token_accuracy": 0.8174179468241606, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.381798002219756, |
|
"grad_norm": 0.07425718968092598, |
|
"learning_rate": 1.6720078605555227e-05, |
|
"loss": 0.5878, |
|
"mean_token_accuracy": 0.8117901692302955, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 3.392896781354051, |
|
"grad_norm": 0.06477589781728992, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 0.5807, |
|
"mean_token_accuracy": 0.8140918874113428, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.4039955604883465, |
|
"grad_norm": 0.06662810742289851, |
|
"learning_rate": 1.6662432865531428e-05, |
|
"loss": 0.585, |
|
"mean_token_accuracy": 0.8134061399415382, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 3.4150943396226414, |
|
"grad_norm": 0.06459536165537999, |
|
"learning_rate": 1.6633459445717973e-05, |
|
"loss": 0.5978, |
|
"mean_token_accuracy": 0.8095315062509003, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.4261931187569368, |
|
"grad_norm": 0.06639712291679585, |
|
"learning_rate": 1.6604386239990077e-05, |
|
"loss": 0.589, |
|
"mean_token_accuracy": 0.8114936681679807, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 3.437291897891232, |
|
"grad_norm": 0.06853852815958374, |
|
"learning_rate": 1.657521368569064e-05, |
|
"loss": 0.6109, |
|
"mean_token_accuracy": 0.8049504842638907, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.448390677025527, |
|
"grad_norm": 0.06982303363818694, |
|
"learning_rate": 1.6545942221657042e-05, |
|
"loss": 0.5911, |
|
"mean_token_accuracy": 0.8112879302127055, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 3.4594894561598224, |
|
"grad_norm": 0.06760713569034003, |
|
"learning_rate": 1.6516572288214555e-05, |
|
"loss": 0.5852, |
|
"mean_token_accuracy": 0.8129200675672401, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.4705882352941178, |
|
"grad_norm": 0.07217456505648073, |
|
"learning_rate": 1.6487104327169702e-05, |
|
"loss": 0.5976, |
|
"mean_token_accuracy": 0.8093511038402598, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 3.481687014428413, |
|
"grad_norm": 0.070935181965617, |
|
"learning_rate": 1.6457538781803625e-05, |
|
"loss": 0.5905, |
|
"mean_token_accuracy": 0.8113604696928229, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.492785793562708, |
|
"grad_norm": 0.06078018778473591, |
|
"learning_rate": 1.6427876096865394e-05, |
|
"loss": 0.5784, |
|
"mean_token_accuracy": 0.8158070281321645, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 3.5038845726970034, |
|
"grad_norm": 0.06704102646198273, |
|
"learning_rate": 1.639811671856535e-05, |
|
"loss": 0.6082, |
|
"mean_token_accuracy": 0.8057792572972777, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.5149833518312983, |
|
"grad_norm": 0.06759521898261396, |
|
"learning_rate": 1.636826109456836e-05, |
|
"loss": 0.5926, |
|
"mean_token_accuracy": 0.8109007114764084, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 3.5260821309655936, |
|
"grad_norm": 0.06645230810548192, |
|
"learning_rate": 1.63383096739871e-05, |
|
"loss": 0.582, |
|
"mean_token_accuracy": 0.8141154963928526, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.537180910099889, |
|
"grad_norm": 0.07053272601617808, |
|
"learning_rate": 1.6308262907375314e-05, |
|
"loss": 0.5831, |
|
"mean_token_accuracy": 0.8134722581791063, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 3.5482796892341844, |
|
"grad_norm": 0.06509128182629838, |
|
"learning_rate": 1.627812124672099e-05, |
|
"loss": 0.5788, |
|
"mean_token_accuracy": 0.8153766770371117, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.5482796892341844, |
|
"eval_loss": 0.7433986067771912, |
|
"eval_mean_token_accuracy": 0.7729161029544169, |
|
"eval_runtime": 2.5059, |
|
"eval_samples_per_second": 51.478, |
|
"eval_steps_per_second": 4.39, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.5593784683684797, |
|
"grad_norm": 0.06880331086378055, |
|
"learning_rate": 1.6247885145439602e-05, |
|
"loss": 0.5922, |
|
"mean_token_accuracy": 0.8106637998812584, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 3.5704772475027746, |
|
"grad_norm": 0.06368796917129804, |
|
"learning_rate": 1.6217555058367288e-05, |
|
"loss": 0.5927, |
|
"mean_token_accuracy": 0.8103974761600462, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.58157602663707, |
|
"grad_norm": 0.07182991192738422, |
|
"learning_rate": 1.618713144175399e-05, |
|
"loss": 0.6022, |
|
"mean_token_accuracy": 0.8067329885490656, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 3.592674805771365, |
|
"grad_norm": 0.06863230039491855, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 0.6132, |
|
"mean_token_accuracy": 0.8041375083354169, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.6037735849056602, |
|
"grad_norm": 0.06734776872058247, |
|
"learning_rate": 1.6126005451932028e-05, |
|
"loss": 0.5986, |
|
"mean_token_accuracy": 0.8086936598937127, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 3.6148723640399556, |
|
"grad_norm": 0.06691389231458741, |
|
"learning_rate": 1.6095303998230432e-05, |
|
"loss": 0.5903, |
|
"mean_token_accuracy": 0.8117445609970693, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.625971143174251, |
|
"grad_norm": 0.0626209547447361, |
|
"learning_rate": 1.6064510853988137e-05, |
|
"loss": 0.6066, |
|
"mean_token_accuracy": 0.8063172615190451, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 3.6370699223085463, |
|
"grad_norm": 0.06374216425499483, |
|
"learning_rate": 1.603362648242076e-05, |
|
"loss": 0.5901, |
|
"mean_token_accuracy": 0.8116106105320405, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.648168701442841, |
|
"grad_norm": 0.0648674771391545, |
|
"learning_rate": 1.6002651348116248e-05, |
|
"loss": 0.5944, |
|
"mean_token_accuracy": 0.8101063167096149, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 3.6592674805771366, |
|
"grad_norm": 0.06499753351633389, |
|
"learning_rate": 1.5971585917027864e-05, |
|
"loss": 0.5915, |
|
"mean_token_accuracy": 0.8101395566228055, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.6703662597114315, |
|
"grad_norm": 0.06831724829782981, |
|
"learning_rate": 1.5940430656467193e-05, |
|
"loss": 0.5992, |
|
"mean_token_accuracy": 0.8083434092585243, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 3.681465038845727, |
|
"grad_norm": 0.06790573799595492, |
|
"learning_rate": 1.5909186035097114e-05, |
|
"loss": 0.5785, |
|
"mean_token_accuracy": 0.8147185760230325, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.692563817980022, |
|
"grad_norm": 0.06427088903327956, |
|
"learning_rate": 1.5877852522924733e-05, |
|
"loss": 0.5794, |
|
"mean_token_accuracy": 0.8144371602569551, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 3.7036625971143176, |
|
"grad_norm": 0.06239985639848319, |
|
"learning_rate": 1.5846430591294334e-05, |
|
"loss": 0.5718, |
|
"mean_token_accuracy": 0.8172256989894547, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.714761376248613, |
|
"grad_norm": 0.06568843799685702, |
|
"learning_rate": 1.5814920712880267e-05, |
|
"loss": 0.5925, |
|
"mean_token_accuracy": 0.8102354089861006, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 3.725860155382908, |
|
"grad_norm": 0.0651721851556958, |
|
"learning_rate": 1.5783323361679865e-05, |
|
"loss": 0.5727, |
|
"mean_token_accuracy": 0.8159369895094459, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.736958934517203, |
|
"grad_norm": 0.0690220199124441, |
|
"learning_rate": 1.575163901300629e-05, |
|
"loss": 0.5906, |
|
"mean_token_accuracy": 0.8114687402753267, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 3.748057713651498, |
|
"grad_norm": 0.07248322781827103, |
|
"learning_rate": 1.5719868143481385e-05, |
|
"loss": 0.5884, |
|
"mean_token_accuracy": 0.8114747809234526, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.7591564927857934, |
|
"grad_norm": 0.06344203324482209, |
|
"learning_rate": 1.568801123102852e-05, |
|
"loss": 0.6063, |
|
"mean_token_accuracy": 0.8063411988778503, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 3.770255271920089, |
|
"grad_norm": 0.06819373858730451, |
|
"learning_rate": 1.5656068754865388e-05, |
|
"loss": 0.6054, |
|
"mean_token_accuracy": 0.8064807145119008, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.770255271920089, |
|
"eval_loss": 0.7394784688949585, |
|
"eval_mean_token_accuracy": 0.7736173949838345, |
|
"eval_runtime": 2.5098, |
|
"eval_samples_per_second": 51.398, |
|
"eval_steps_per_second": 4.383, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.781354051054384, |
|
"grad_norm": 0.06974470044867787, |
|
"learning_rate": 1.56240411954968e-05, |
|
"loss": 0.6052, |
|
"mean_token_accuracy": 0.8067864576933559, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 3.7924528301886795, |
|
"grad_norm": 0.06783881626076696, |
|
"learning_rate": 1.5591929034707468e-05, |
|
"loss": 0.6029, |
|
"mean_token_accuracy": 0.8078461112558084, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.8035516093229744, |
|
"grad_norm": 0.07199172367413192, |
|
"learning_rate": 1.5559732755554734e-05, |
|
"loss": 0.6165, |
|
"mean_token_accuracy": 0.8034011148204714, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 3.81465038845727, |
|
"grad_norm": 0.0627264927528077, |
|
"learning_rate": 1.552745284236133e-05, |
|
"loss": 0.581, |
|
"mean_token_accuracy": 0.8138149750532312, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.8257491675915647, |
|
"grad_norm": 0.0666022446878339, |
|
"learning_rate": 1.5495089780708062e-05, |
|
"loss": 0.5853, |
|
"mean_token_accuracy": 0.8127597504780851, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 3.83684794672586, |
|
"grad_norm": 0.06651837216095642, |
|
"learning_rate": 1.546264405742654e-05, |
|
"loss": 0.5901, |
|
"mean_token_accuracy": 0.8110268386574733, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.8479467258601554, |
|
"grad_norm": 0.06551396537364293, |
|
"learning_rate": 1.5430116160591836e-05, |
|
"loss": 0.5879, |
|
"mean_token_accuracy": 0.8116350609023744, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 3.8590455049944508, |
|
"grad_norm": 0.06347646901521764, |
|
"learning_rate": 1.539750657951513e-05, |
|
"loss": 0.582, |
|
"mean_token_accuracy": 0.8141941271798734, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.870144284128746, |
|
"grad_norm": 0.06581227966023888, |
|
"learning_rate": 1.536481580473638e-05, |
|
"loss": 0.6041, |
|
"mean_token_accuracy": 0.8068086974339712, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 3.881243063263041, |
|
"grad_norm": 0.06485201916018662, |
|
"learning_rate": 1.5332044328016916e-05, |
|
"loss": 0.5827, |
|
"mean_token_accuracy": 0.8135174002384069, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.8923418423973364, |
|
"grad_norm": 0.07023014750305509, |
|
"learning_rate": 1.529919264233205e-05, |
|
"loss": 0.5781, |
|
"mean_token_accuracy": 0.8148667595200646, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 3.9034406215316313, |
|
"grad_norm": 0.06788151535669468, |
|
"learning_rate": 1.5266261241863675e-05, |
|
"loss": 0.6046, |
|
"mean_token_accuracy": 0.8066485319988587, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.9145394006659266, |
|
"grad_norm": 0.06540773197792465, |
|
"learning_rate": 1.523325062199281e-05, |
|
"loss": 0.5912, |
|
"mean_token_accuracy": 0.810199155848963, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 3.925638179800222, |
|
"grad_norm": 0.06385079909541433, |
|
"learning_rate": 1.5200161279292154e-05, |
|
"loss": 0.606, |
|
"mean_token_accuracy": 0.806549376246009, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.9367369589345174, |
|
"grad_norm": 0.06405640899804474, |
|
"learning_rate": 1.5166993711518631e-05, |
|
"loss": 0.6074, |
|
"mean_token_accuracy": 0.805698963377606, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 3.9478357380688123, |
|
"grad_norm": 0.0656239778913707, |
|
"learning_rate": 1.5133748417605878e-05, |
|
"loss": 0.6042, |
|
"mean_token_accuracy": 0.8072386929802121, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.9589345172031076, |
|
"grad_norm": 0.06537373035601372, |
|
"learning_rate": 1.5100425897656754e-05, |
|
"loss": 0.5914, |
|
"mean_token_accuracy": 0.8114940759127753, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 3.970033296337403, |
|
"grad_norm": 0.07228070817170734, |
|
"learning_rate": 1.5067026652935823e-05, |
|
"loss": 0.591, |
|
"mean_token_accuracy": 0.8108245554957352, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.981132075471698, |
|
"grad_norm": 0.06997011004537755, |
|
"learning_rate": 1.50335511858618e-05, |
|
"loss": 0.6117, |
|
"mean_token_accuracy": 0.8037908682685642, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 3.9922308546059933, |
|
"grad_norm": 0.06257709842877483, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.5861, |
|
"mean_token_accuracy": 0.8126290702777871, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.9922308546059933, |
|
"eval_loss": 0.7382122874259949, |
|
"eval_mean_token_accuracy": 0.7738011020672627, |
|
"eval_runtime": 2.5042, |
|
"eval_samples_per_second": 51.514, |
|
"eval_steps_per_second": 4.393, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.002219755826859, |
|
"grad_norm": 0.0730797652438335, |
|
"learning_rate": 1.4966373600054763e-05, |
|
"loss": 0.5779, |
|
"mean_token_accuracy": 0.8164720629136107, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 4.013318534961154, |
|
"grad_norm": 0.08672989514268875, |
|
"learning_rate": 1.4932672491861855e-05, |
|
"loss": 0.5221, |
|
"mean_token_accuracy": 0.8303200814447649, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 4.02441731409545, |
|
"grad_norm": 0.08072906911113291, |
|
"learning_rate": 1.4898897182380872e-05, |
|
"loss": 0.5357, |
|
"mean_token_accuracy": 0.825473979487524, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 4.035516093229745, |
|
"grad_norm": 0.07018425634528604, |
|
"learning_rate": 1.48650481796876e-05, |
|
"loss": 0.5307, |
|
"mean_token_accuracy": 0.8279411218844428, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 4.04661487236404, |
|
"grad_norm": 0.068007941882165, |
|
"learning_rate": 1.4831125992966386e-05, |
|
"loss": 0.5196, |
|
"mean_token_accuracy": 0.831187564208158, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 4.057713651498335, |
|
"grad_norm": 0.0689958997732716, |
|
"learning_rate": 1.4797131132502464e-05, |
|
"loss": 0.5442, |
|
"mean_token_accuracy": 0.8231890900584835, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 4.06881243063263, |
|
"grad_norm": 0.07171455936324264, |
|
"learning_rate": 1.476306410967429e-05, |
|
"loss": 0.5345, |
|
"mean_token_accuracy": 0.8263741901210816, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 4.079911209766926, |
|
"grad_norm": 0.06804956275279826, |
|
"learning_rate": 1.4728925436945838e-05, |
|
"loss": 0.5207, |
|
"mean_token_accuracy": 0.8303962847835349, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.091009988901221, |
|
"grad_norm": 0.07083430472325235, |
|
"learning_rate": 1.469471562785891e-05, |
|
"loss": 0.5158, |
|
"mean_token_accuracy": 0.8320593088222301, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 4.102108768035516, |
|
"grad_norm": 0.06503541249215597, |
|
"learning_rate": 1.4660435197025391e-05, |
|
"loss": 0.525, |
|
"mean_token_accuracy": 0.8300507388211443, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.113207547169812, |
|
"grad_norm": 0.06971798835469743, |
|
"learning_rate": 1.4626084660119515e-05, |
|
"loss": 0.5295, |
|
"mean_token_accuracy": 0.8280530516037452, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 4.124306326304106, |
|
"grad_norm": 0.06603173419848209, |
|
"learning_rate": 1.4591664533870118e-05, |
|
"loss": 0.5266, |
|
"mean_token_accuracy": 0.8291048758745919, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.135405105438402, |
|
"grad_norm": 0.06938457936618332, |
|
"learning_rate": 1.4557175336052844e-05, |
|
"loss": 0.536, |
|
"mean_token_accuracy": 0.8256845911513351, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 4.146503884572697, |
|
"grad_norm": 0.06883650062666277, |
|
"learning_rate": 1.4522617585482377e-05, |
|
"loss": 0.5204, |
|
"mean_token_accuracy": 0.8310854137446115, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.157602663706992, |
|
"grad_norm": 0.07063831352536717, |
|
"learning_rate": 1.4487991802004625e-05, |
|
"loss": 0.5433, |
|
"mean_token_accuracy": 0.8236911002393945, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 4.168701442841288, |
|
"grad_norm": 0.06752461645830253, |
|
"learning_rate": 1.4453298506488896e-05, |
|
"loss": 0.538, |
|
"mean_token_accuracy": 0.8248770954289781, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.179800221975583, |
|
"grad_norm": 0.06690086776158738, |
|
"learning_rate": 1.441853822082008e-05, |
|
"loss": 0.5423, |
|
"mean_token_accuracy": 0.8239579652997131, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 4.190899001109878, |
|
"grad_norm": 0.0669777882641211, |
|
"learning_rate": 1.4383711467890776e-05, |
|
"loss": 0.5364, |
|
"mean_token_accuracy": 0.8257569366739599, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.201997780244173, |
|
"grad_norm": 0.0669780263721949, |
|
"learning_rate": 1.4348818771593452e-05, |
|
"loss": 0.5262, |
|
"mean_token_accuracy": 0.8289039644335492, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 4.213096559378468, |
|
"grad_norm": 0.06926763124785959, |
|
"learning_rate": 1.4313860656812537e-05, |
|
"loss": 0.5534, |
|
"mean_token_accuracy": 0.820804290469477, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.213096559378468, |
|
"eval_loss": 0.765135645866394, |
|
"eval_mean_token_accuracy": 0.7707166053233169, |
|
"eval_runtime": 2.5096, |
|
"eval_samples_per_second": 51.402, |
|
"eval_steps_per_second": 4.383, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.2241953385127635, |
|
"grad_norm": 0.07023846548990398, |
|
"learning_rate": 1.4278837649416543e-05, |
|
"loss": 0.5302, |
|
"mean_token_accuracy": 0.8275532628167696, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 4.235294117647059, |
|
"grad_norm": 0.06738468385547651, |
|
"learning_rate": 1.4243750276250154e-05, |
|
"loss": 0.5429, |
|
"mean_token_accuracy": 0.8233446568456619, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.246392896781354, |
|
"grad_norm": 0.06482738400996359, |
|
"learning_rate": 1.4208599065126292e-05, |
|
"loss": 0.5189, |
|
"mean_token_accuracy": 0.8316966637591312, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 4.25749167591565, |
|
"grad_norm": 0.06396002535376463, |
|
"learning_rate": 1.417338454481818e-05, |
|
"loss": 0.5278, |
|
"mean_token_accuracy": 0.8282958367271627, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.268590455049944, |
|
"grad_norm": 0.06689652898165803, |
|
"learning_rate": 1.4138107245051394e-05, |
|
"loss": 0.5427, |
|
"mean_token_accuracy": 0.8237128012084165, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 4.279689234184239, |
|
"grad_norm": 0.06899145550235058, |
|
"learning_rate": 1.4102767696495885e-05, |
|
"loss": 0.5416, |
|
"mean_token_accuracy": 0.8238680752683756, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.290788013318535, |
|
"grad_norm": 0.06653016304820043, |
|
"learning_rate": 1.4067366430758004e-05, |
|
"loss": 0.5412, |
|
"mean_token_accuracy": 0.824632569309992, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 4.30188679245283, |
|
"grad_norm": 0.07135924894684956, |
|
"learning_rate": 1.4031903980372503e-05, |
|
"loss": 0.5588, |
|
"mean_token_accuracy": 0.8186822026511017, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.3129855715871255, |
|
"grad_norm": 0.06901506479682018, |
|
"learning_rate": 1.3996380878794524e-05, |
|
"loss": 0.5307, |
|
"mean_token_accuracy": 0.8278875389394228, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 4.324084350721421, |
|
"grad_norm": 0.06828083288970177, |
|
"learning_rate": 1.396079766039157e-05, |
|
"loss": 0.5365, |
|
"mean_token_accuracy": 0.8254911164406096, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.335183129855716, |
|
"grad_norm": 0.06770085673952302, |
|
"learning_rate": 1.3925154860435473e-05, |
|
"loss": 0.5457, |
|
"mean_token_accuracy": 0.8232314349438562, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 4.3462819089900115, |
|
"grad_norm": 0.0659269142449662, |
|
"learning_rate": 1.3889453015094338e-05, |
|
"loss": 0.546, |
|
"mean_token_accuracy": 0.823248835683452, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.357380688124306, |
|
"grad_norm": 0.06685910108488621, |
|
"learning_rate": 1.3853692661424485e-05, |
|
"loss": 0.5313, |
|
"mean_token_accuracy": 0.8275431625307241, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 4.368479467258601, |
|
"grad_norm": 0.0699277758009232, |
|
"learning_rate": 1.3817874337362351e-05, |
|
"loss": 0.5435, |
|
"mean_token_accuracy": 0.8237799596940005, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.379578246392897, |
|
"grad_norm": 0.06463358677008339, |
|
"learning_rate": 1.3781998581716427e-05, |
|
"loss": 0.5307, |
|
"mean_token_accuracy": 0.8274417707074175, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 4.390677025527192, |
|
"grad_norm": 0.06736039165720337, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 0.5296, |
|
"mean_token_accuracy": 0.8283872712315985, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.401775804661487, |
|
"grad_norm": 0.06597768478568991, |
|
"learning_rate": 1.3710076935218671e-05, |
|
"loss": 0.5337, |
|
"mean_token_accuracy": 0.8272544528484194, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 4.412874583795783, |
|
"grad_norm": 0.06441872232432956, |
|
"learning_rate": 1.3674032126270982e-05, |
|
"loss": 0.5349, |
|
"mean_token_accuracy": 0.8267573283926994, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.423973362930077, |
|
"grad_norm": 0.06605167933787068, |
|
"learning_rate": 1.3637932049531517e-05, |
|
"loss": 0.5307, |
|
"mean_token_accuracy": 0.8282462134771424, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 4.435072142064373, |
|
"grad_norm": 0.0665118285232646, |
|
"learning_rate": 1.3601777248047105e-05, |
|
"loss": 0.5449, |
|
"mean_token_accuracy": 0.8234090069569587, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.435072142064373, |
|
"eval_loss": 0.7611222267150879, |
|
"eval_mean_token_accuracy": 0.7715752214807218, |
|
"eval_runtime": 2.5096, |
|
"eval_samples_per_second": 51.403, |
|
"eval_steps_per_second": 4.383, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.446170921198668, |
|
"grad_norm": 0.0633952053347362, |
|
"learning_rate": 1.3565568265687802e-05, |
|
"loss": 0.5441, |
|
"mean_token_accuracy": 0.8236108702838326, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 4.457269700332963, |
|
"grad_norm": 0.06396077346644374, |
|
"learning_rate": 1.3529305647138689e-05, |
|
"loss": 0.5254, |
|
"mean_token_accuracy": 0.8295401585160572, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.468368479467259, |
|
"grad_norm": 0.0656532841801477, |
|
"learning_rate": 1.3492989937891694e-05, |
|
"loss": 0.5336, |
|
"mean_token_accuracy": 0.8266195612283573, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 4.479467258601554, |
|
"grad_norm": 0.06837988228446373, |
|
"learning_rate": 1.3456621684237367e-05, |
|
"loss": 0.5341, |
|
"mean_token_accuracy": 0.8274741807935527, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.490566037735849, |
|
"grad_norm": 0.0667039963365344, |
|
"learning_rate": 1.342020143325669e-05, |
|
"loss": 0.5466, |
|
"mean_token_accuracy": 0.8229245927993392, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 4.501664816870145, |
|
"grad_norm": 0.0673181621735528, |
|
"learning_rate": 1.3383729732812814e-05, |
|
"loss": 0.5539, |
|
"mean_token_accuracy": 0.8201497596550249, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.512763596004439, |
|
"grad_norm": 0.06895598701514556, |
|
"learning_rate": 1.3347207131542847e-05, |
|
"loss": 0.5553, |
|
"mean_token_accuracy": 0.8203718930059478, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 4.523862375138735, |
|
"grad_norm": 0.06570681535103308, |
|
"learning_rate": 1.3310634178849583e-05, |
|
"loss": 0.526, |
|
"mean_token_accuracy": 0.8288809425485892, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.53496115427303, |
|
"grad_norm": 0.0701180297067736, |
|
"learning_rate": 1.3274011424893245e-05, |
|
"loss": 0.5408, |
|
"mean_token_accuracy": 0.8247304133164256, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 4.546059933407325, |
|
"grad_norm": 0.06791266083149289, |
|
"learning_rate": 1.3237339420583213e-05, |
|
"loss": 0.543, |
|
"mean_token_accuracy": 0.8240936918076546, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.557158712541621, |
|
"grad_norm": 0.06711140536306563, |
|
"learning_rate": 1.3200618717569716e-05, |
|
"loss": 0.5511, |
|
"mean_token_accuracy": 0.8213156804488854, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 4.568257491675916, |
|
"grad_norm": 0.06793298802768283, |
|
"learning_rate": 1.3163849868235566e-05, |
|
"loss": 0.5363, |
|
"mean_token_accuracy": 0.825724493844356, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.5793562708102105, |
|
"grad_norm": 0.06892866532615381, |
|
"learning_rate": 1.312703342568782e-05, |
|
"loss": 0.534, |
|
"mean_token_accuracy": 0.8264291321992333, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 4.590455049944506, |
|
"grad_norm": 0.06664285296724572, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": 0.5348, |
|
"mean_token_accuracy": 0.8258681376303734, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.601553829078801, |
|
"grad_norm": 0.0654483232429745, |
|
"learning_rate": 1.3053259976951134e-05, |
|
"loss": 0.5596, |
|
"mean_token_accuracy": 0.8185533510882992, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 4.6126526082130965, |
|
"grad_norm": 0.07041916180841737, |
|
"learning_rate": 1.3016304080522657e-05, |
|
"loss": 0.5443, |
|
"mean_token_accuracy": 0.8240659035373834, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.623751387347392, |
|
"grad_norm": 0.06702813062704534, |
|
"learning_rate": 1.297930281038482e-05, |
|
"loss": 0.5491, |
|
"mean_token_accuracy": 0.8222027550986537, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 4.634850166481687, |
|
"grad_norm": 0.06248394102983872, |
|
"learning_rate": 1.2942256723140951e-05, |
|
"loss": 0.5336, |
|
"mean_token_accuracy": 0.8269264702799436, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.645948945615983, |
|
"grad_norm": 0.0643747533736713, |
|
"learning_rate": 1.290516637606855e-05, |
|
"loss": 0.5354, |
|
"mean_token_accuracy": 0.826579599216724, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 4.657047724750278, |
|
"grad_norm": 0.06547506463619156, |
|
"learning_rate": 1.2868032327110904e-05, |
|
"loss": 0.5261, |
|
"mean_token_accuracy": 0.8294653993974614, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.657047724750278, |
|
"eval_loss": 0.7608128190040588, |
|
"eval_mean_token_accuracy": 0.7713916980015862, |
|
"eval_runtime": 2.5115, |
|
"eval_samples_per_second": 51.364, |
|
"eval_steps_per_second": 4.38, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.668146503884572, |
|
"grad_norm": 0.06586818079405167, |
|
"learning_rate": 1.2830855134868705e-05, |
|
"loss": 0.5334, |
|
"mean_token_accuracy": 0.8269698798269095, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 4.679245283018868, |
|
"grad_norm": 0.06919497157074231, |
|
"learning_rate": 1.2793635358591645e-05, |
|
"loss": 0.5262, |
|
"mean_token_accuracy": 0.8291748631685987, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.690344062153163, |
|
"grad_norm": 0.0638134236313798, |
|
"learning_rate": 1.2756373558169992e-05, |
|
"loss": 0.5451, |
|
"mean_token_accuracy": 0.823209000051959, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 4.7014428412874585, |
|
"grad_norm": 0.0673456304701257, |
|
"learning_rate": 1.2719070294126183e-05, |
|
"loss": 0.5425, |
|
"mean_token_accuracy": 0.824060788934791, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.712541620421754, |
|
"grad_norm": 0.0628128288015867, |
|
"learning_rate": 1.2681726127606374e-05, |
|
"loss": 0.5371, |
|
"mean_token_accuracy": 0.8254979634319394, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 4.723640399556049, |
|
"grad_norm": 0.06717000875377617, |
|
"learning_rate": 1.2644341620372025e-05, |
|
"loss": 0.5437, |
|
"mean_token_accuracy": 0.8236075508618403, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.734739178690344, |
|
"grad_norm": 0.06808573080841533, |
|
"learning_rate": 1.2606917334791415e-05, |
|
"loss": 0.5493, |
|
"mean_token_accuracy": 0.8218429272375017, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 4.745837957824639, |
|
"grad_norm": 0.06914709442447933, |
|
"learning_rate": 1.2569453833831222e-05, |
|
"loss": 0.5431, |
|
"mean_token_accuracy": 0.8241093534073686, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.756936736958934, |
|
"grad_norm": 0.06334309954829766, |
|
"learning_rate": 1.253195168104802e-05, |
|
"loss": 0.5567, |
|
"mean_token_accuracy": 0.8192453652654745, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 4.76803551609323, |
|
"grad_norm": 0.06619495996123374, |
|
"learning_rate": 1.2494411440579814e-05, |
|
"loss": 0.5442, |
|
"mean_token_accuracy": 0.8232290532177181, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.779134295227525, |
|
"grad_norm": 0.07297705038597685, |
|
"learning_rate": 1.2456833677137563e-05, |
|
"loss": 0.5451, |
|
"mean_token_accuracy": 0.8230713940589893, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 4.79023307436182, |
|
"grad_norm": 0.0701375699397559, |
|
"learning_rate": 1.2419218955996677e-05, |
|
"loss": 0.5447, |
|
"mean_token_accuracy": 0.8232547438813633, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.801331853496116, |
|
"grad_norm": 0.06979220683803566, |
|
"learning_rate": 1.238156784298851e-05, |
|
"loss": 0.5414, |
|
"mean_token_accuracy": 0.8242423296299666, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 4.812430632630411, |
|
"grad_norm": 0.06538541811983827, |
|
"learning_rate": 1.2343880904491846e-05, |
|
"loss": 0.544, |
|
"mean_token_accuracy": 0.8230058901259314, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.823529411764706, |
|
"grad_norm": 0.06323017444968718, |
|
"learning_rate": 1.2306158707424402e-05, |
|
"loss": 0.5289, |
|
"mean_token_accuracy": 0.8285157998211631, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 4.834628190899001, |
|
"grad_norm": 0.0630498338664204, |
|
"learning_rate": 1.226840181923427e-05, |
|
"loss": 0.5384, |
|
"mean_token_accuracy": 0.8254811299839719, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.845726970033296, |
|
"grad_norm": 0.0677977464932852, |
|
"learning_rate": 1.2230610807891394e-05, |
|
"loss": 0.5428, |
|
"mean_token_accuracy": 0.8239020278215602, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 4.856825749167592, |
|
"grad_norm": 0.06837957133766576, |
|
"learning_rate": 1.2192786241879033e-05, |
|
"loss": 0.5369, |
|
"mean_token_accuracy": 0.8253965695150642, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.867924528301887, |
|
"grad_norm": 0.06858490759021686, |
|
"learning_rate": 1.2154928690185201e-05, |
|
"loss": 0.5499, |
|
"mean_token_accuracy": 0.8211856043294985, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 4.879023307436182, |
|
"grad_norm": 0.07073949700021516, |
|
"learning_rate": 1.211703872229411e-05, |
|
"loss": 0.5564, |
|
"mean_token_accuracy": 0.8193733641312193, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.879023307436182, |
|
"eval_loss": 0.7586882710456848, |
|
"eval_mean_token_accuracy": 0.7719546157599555, |
|
"eval_runtime": 2.5069, |
|
"eval_samples_per_second": 51.458, |
|
"eval_steps_per_second": 4.388, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.890122086570477, |
|
"grad_norm": 0.06682518985110458, |
|
"learning_rate": 1.2079116908177592e-05, |
|
"loss": 0.5503, |
|
"mean_token_accuracy": 0.8219519923319263, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 4.901220865704772, |
|
"grad_norm": 0.06878010685643873, |
|
"learning_rate": 1.2041163818286558e-05, |
|
"loss": 0.5412, |
|
"mean_token_accuracy": 0.8244903210988521, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.912319644839068, |
|
"grad_norm": 0.06859265775491848, |
|
"learning_rate": 1.2003180023542375e-05, |
|
"loss": 0.5482, |
|
"mean_token_accuracy": 0.8223593399026905, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 4.923418423973363, |
|
"grad_norm": 0.06582995971058746, |
|
"learning_rate": 1.1965166095328302e-05, |
|
"loss": 0.5444, |
|
"mean_token_accuracy": 0.8238218612141541, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.934517203107658, |
|
"grad_norm": 0.06844088791978743, |
|
"learning_rate": 1.1927122605480899e-05, |
|
"loss": 0.5404, |
|
"mean_token_accuracy": 0.8245272129167904, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 4.945615982241954, |
|
"grad_norm": 0.07288470554374063, |
|
"learning_rate": 1.1889050126281405e-05, |
|
"loss": 0.5352, |
|
"mean_token_accuracy": 0.8261003602637317, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.956714761376249, |
|
"grad_norm": 0.06671251103356769, |
|
"learning_rate": 1.1850949230447146e-05, |
|
"loss": 0.5306, |
|
"mean_token_accuracy": 0.8276477690305077, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 4.9678135405105435, |
|
"grad_norm": 0.06439515034329467, |
|
"learning_rate": 1.1812820491122918e-05, |
|
"loss": 0.5253, |
|
"mean_token_accuracy": 0.8289079489294556, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.978912319644839, |
|
"grad_norm": 0.06384436174886099, |
|
"learning_rate": 1.1774664481872354e-05, |
|
"loss": 0.533, |
|
"mean_token_accuracy": 0.8269564316724558, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 4.990011098779134, |
|
"grad_norm": 0.064914486533765, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.5337, |
|
"mean_token_accuracy": 0.8273936231132242, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.06525786189415514, |
|
"learning_rate": 1.1698272949889206e-05, |
|
"loss": 0.5516, |
|
"mean_token_accuracy": 0.8209712929381258, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 5.011098779134295, |
|
"grad_norm": 0.07325252213881911, |
|
"learning_rate": 1.1660038576300444e-05, |
|
"loss": 0.4878, |
|
"mean_token_accuracy": 0.8401779370635157, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 5.022197558268591, |
|
"grad_norm": 0.07048539425609023, |
|
"learning_rate": 1.1621779231055677e-05, |
|
"loss": 0.4684, |
|
"mean_token_accuracy": 0.8463123294941672, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 5.033296337402886, |
|
"grad_norm": 0.07119257946253367, |
|
"learning_rate": 1.158349548968323e-05, |
|
"loss": 0.4852, |
|
"mean_token_accuracy": 0.8406624644516423, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 5.0443951165371805, |
|
"grad_norm": 0.06758001059113193, |
|
"learning_rate": 1.1545187928078407e-05, |
|
"loss": 0.4851, |
|
"mean_token_accuracy": 0.8410094757922542, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 5.055493895671476, |
|
"grad_norm": 0.06748477290601286, |
|
"learning_rate": 1.1506857122494832e-05, |
|
"loss": 0.4666, |
|
"mean_token_accuracy": 0.8467701105668647, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 5.066592674805771, |
|
"grad_norm": 0.06670660224212706, |
|
"learning_rate": 1.146850364953579e-05, |
|
"loss": 0.4898, |
|
"mean_token_accuracy": 0.839417112282745, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 5.077691453940067, |
|
"grad_norm": 0.07087893469659705, |
|
"learning_rate": 1.1430128086145542e-05, |
|
"loss": 0.4791, |
|
"mean_token_accuracy": 0.8429755949198621, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 5.088790233074362, |
|
"grad_norm": 0.07478834139797155, |
|
"learning_rate": 1.1391731009600655e-05, |
|
"loss": 0.4892, |
|
"mean_token_accuracy": 0.8396017922119625, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 5.099889012208657, |
|
"grad_norm": 0.06766266447181689, |
|
"learning_rate": 1.1353312997501313e-05, |
|
"loss": 0.4691, |
|
"mean_token_accuracy": 0.8461667825407023, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.099889012208657, |
|
"eval_loss": 0.7955626845359802, |
|
"eval_mean_token_accuracy": 0.768756609510345, |
|
"eval_runtime": 2.5081, |
|
"eval_samples_per_second": 51.432, |
|
"eval_steps_per_second": 4.386, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.110987791342953, |
|
"grad_norm": 0.06999143705646484, |
|
"learning_rate": 1.1314874627762627e-05, |
|
"loss": 0.4738, |
|
"mean_token_accuracy": 0.8446440579759826, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 5.122086570477247, |
|
"grad_norm": 0.0718794241969376, |
|
"learning_rate": 1.127641647860595e-05, |
|
"loss": 0.4841, |
|
"mean_token_accuracy": 0.8411325365807377, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 5.1331853496115425, |
|
"grad_norm": 0.06704940991855901, |
|
"learning_rate": 1.1237939128550167e-05, |
|
"loss": 0.4743, |
|
"mean_token_accuracy": 0.8441928002045515, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 5.144284128745838, |
|
"grad_norm": 0.06501577030440127, |
|
"learning_rate": 1.1199443156402998e-05, |
|
"loss": 0.4572, |
|
"mean_token_accuracy": 0.8498967529805903, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 5.155382907880133, |
|
"grad_norm": 0.06928626193910288, |
|
"learning_rate": 1.1160929141252303e-05, |
|
"loss": 0.4645, |
|
"mean_token_accuracy": 0.8472524675665444, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 5.166481687014429, |
|
"grad_norm": 0.06840759490883057, |
|
"learning_rate": 1.1122397662457352e-05, |
|
"loss": 0.4776, |
|
"mean_token_accuracy": 0.8436041188428363, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 5.177580466148724, |
|
"grad_norm": 0.07013395237638928, |
|
"learning_rate": 1.1083849299640109e-05, |
|
"loss": 0.4846, |
|
"mean_token_accuracy": 0.8408465709590693, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 5.188679245283019, |
|
"grad_norm": 0.07333313859905721, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 0.4823, |
|
"mean_token_accuracy": 0.8415907001724943, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 5.199778024417314, |
|
"grad_norm": 0.07150128563971167, |
|
"learning_rate": 1.1006704241687846e-05, |
|
"loss": 0.4891, |
|
"mean_token_accuracy": 0.8399067214225251, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 5.210876803551609, |
|
"grad_norm": 0.06766056951417396, |
|
"learning_rate": 1.0968108707031792e-05, |
|
"loss": 0.4943, |
|
"mean_token_accuracy": 0.8384693039607134, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 5.2219755826859044, |
|
"grad_norm": 0.07097733536527409, |
|
"learning_rate": 1.0929498609293925e-05, |
|
"loss": 0.485, |
|
"mean_token_accuracy": 0.8407270559505065, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 5.2330743618202, |
|
"grad_norm": 0.06905091060781053, |
|
"learning_rate": 1.0890874529278866e-05, |
|
"loss": 0.4945, |
|
"mean_token_accuracy": 0.8379440541935631, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 5.244173140954495, |
|
"grad_norm": 0.06825070345988755, |
|
"learning_rate": 1.0852237048001568e-05, |
|
"loss": 0.4748, |
|
"mean_token_accuracy": 0.8437304701186099, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 5.2552719200887905, |
|
"grad_norm": 0.06920597616240777, |
|
"learning_rate": 1.0813586746678584e-05, |
|
"loss": 0.4821, |
|
"mean_token_accuracy": 0.8413251705971904, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 5.266370699223086, |
|
"grad_norm": 0.07014168297576887, |
|
"learning_rate": 1.077492420671931e-05, |
|
"loss": 0.4682, |
|
"mean_token_accuracy": 0.84606039955115, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 5.27746947835738, |
|
"grad_norm": 0.06768993844433478, |
|
"learning_rate": 1.0736250009717249e-05, |
|
"loss": 0.4732, |
|
"mean_token_accuracy": 0.8445257386106653, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 5.288568257491676, |
|
"grad_norm": 0.06679687620254271, |
|
"learning_rate": 1.0697564737441254e-05, |
|
"loss": 0.4916, |
|
"mean_token_accuracy": 0.8387902210796765, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 5.299667036625971, |
|
"grad_norm": 0.07076922691915866, |
|
"learning_rate": 1.0658868971826785e-05, |
|
"loss": 0.4897, |
|
"mean_token_accuracy": 0.8391406358484407, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 5.310765815760266, |
|
"grad_norm": 0.06748496250773908, |
|
"learning_rate": 1.0620163294967155e-05, |
|
"loss": 0.4701, |
|
"mean_token_accuracy": 0.8459595898789651, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 5.321864594894562, |
|
"grad_norm": 0.07124845733985237, |
|
"learning_rate": 1.0581448289104759e-05, |
|
"loss": 0.4846, |
|
"mean_token_accuracy": 0.8411545447134726, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.321864594894562, |
|
"eval_loss": 0.7953329682350159, |
|
"eval_mean_token_accuracy": 0.7690350065166047, |
|
"eval_runtime": 2.4994, |
|
"eval_samples_per_second": 51.612, |
|
"eval_steps_per_second": 4.401, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 5.332963374028857, |
|
"grad_norm": 0.06547207880107556, |
|
"learning_rate": 1.054272453662234e-05, |
|
"loss": 0.4907, |
|
"mean_token_accuracy": 0.839171047344597, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 5.3440621531631525, |
|
"grad_norm": 0.07181124979794333, |
|
"learning_rate": 1.0503992620034202e-05, |
|
"loss": 0.4948, |
|
"mean_token_accuracy": 0.8375239063233091, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 5.355160932297447, |
|
"grad_norm": 0.06656783092915997, |
|
"learning_rate": 1.046525312197747e-05, |
|
"loss": 0.4857, |
|
"mean_token_accuracy": 0.8407924111332499, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 5.366259711431742, |
|
"grad_norm": 0.06794012917390951, |
|
"learning_rate": 1.0426506625203308e-05, |
|
"loss": 0.4784, |
|
"mean_token_accuracy": 0.8426411179712732, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 5.377358490566038, |
|
"grad_norm": 0.07188722795683923, |
|
"learning_rate": 1.038775371256817e-05, |
|
"loss": 0.4877, |
|
"mean_token_accuracy": 0.8399363439649579, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 5.388457269700333, |
|
"grad_norm": 0.06594729610705517, |
|
"learning_rate": 1.0348994967025012e-05, |
|
"loss": 0.4779, |
|
"mean_token_accuracy": 0.84393559716034, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 5.399556048834628, |
|
"grad_norm": 0.06794091373175933, |
|
"learning_rate": 1.0310230971614538e-05, |
|
"loss": 0.4855, |
|
"mean_token_accuracy": 0.8409262512902004, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 5.410654827968924, |
|
"grad_norm": 0.06808160516295343, |
|
"learning_rate": 1.027146230945643e-05, |
|
"loss": 0.4901, |
|
"mean_token_accuracy": 0.8390278774503441, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 5.421753607103218, |
|
"grad_norm": 0.06811284626620154, |
|
"learning_rate": 1.0232689563740563e-05, |
|
"loss": 0.4852, |
|
"mean_token_accuracy": 0.8407756696746276, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 5.4328523862375135, |
|
"grad_norm": 0.06846143690666025, |
|
"learning_rate": 1.0193913317718245e-05, |
|
"loss": 0.4984, |
|
"mean_token_accuracy": 0.8360630667700921, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 5.443951165371809, |
|
"grad_norm": 0.06588535272540232, |
|
"learning_rate": 1.0155134154693434e-05, |
|
"loss": 0.4714, |
|
"mean_token_accuracy": 0.8450789528948887, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 5.455049944506104, |
|
"grad_norm": 0.07326355147752138, |
|
"learning_rate": 1.0116352658013973e-05, |
|
"loss": 0.4844, |
|
"mean_token_accuracy": 0.8414956570792598, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 5.4661487236404, |
|
"grad_norm": 0.06613241730536085, |
|
"learning_rate": 1.0077569411062804e-05, |
|
"loss": 0.477, |
|
"mean_token_accuracy": 0.8429474129481453, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 5.477247502774695, |
|
"grad_norm": 0.07241995391150818, |
|
"learning_rate": 1.0038784997249205e-05, |
|
"loss": 0.4915, |
|
"mean_token_accuracy": 0.8389679819932333, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 5.48834628190899, |
|
"grad_norm": 0.07063682530565008, |
|
"learning_rate": 1e-05, |
|
"loss": 0.498, |
|
"mean_token_accuracy": 0.8369453954053674, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 5.499445061043286, |
|
"grad_norm": 0.07271110291337514, |
|
"learning_rate": 9.961215002750799e-06, |
|
"loss": 0.5087, |
|
"mean_token_accuracy": 0.8337575984868868, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 5.51054384017758, |
|
"grad_norm": 0.0683258143306416, |
|
"learning_rate": 9.9224305889372e-06, |
|
"loss": 0.4858, |
|
"mean_token_accuracy": 0.8407680099048033, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 5.5216426193118755, |
|
"grad_norm": 0.06532011380188109, |
|
"learning_rate": 9.883647341986032e-06, |
|
"loss": 0.4658, |
|
"mean_token_accuracy": 0.8468363678085075, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 5.532741398446171, |
|
"grad_norm": 0.06900876165623743, |
|
"learning_rate": 9.844865845306568e-06, |
|
"loss": 0.4767, |
|
"mean_token_accuracy": 0.8428589991860804, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 5.543840177580466, |
|
"grad_norm": 0.0740986106472282, |
|
"learning_rate": 9.806086682281759e-06, |
|
"loss": 0.4895, |
|
"mean_token_accuracy": 0.8394784540835362, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.543840177580466, |
|
"eval_loss": 0.7955297231674194, |
|
"eval_mean_token_accuracy": 0.7687454143115147, |
|
"eval_runtime": 2.5046, |
|
"eval_samples_per_second": 51.505, |
|
"eval_steps_per_second": 4.392, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.554938956714762, |
|
"grad_norm": 0.06642030466005984, |
|
"learning_rate": 9.767310436259438e-06, |
|
"loss": 0.5008, |
|
"mean_token_accuracy": 0.8361122903383699, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 5.566037735849057, |
|
"grad_norm": 0.06895013221498675, |
|
"learning_rate": 9.728537690543573e-06, |
|
"loss": 0.505, |
|
"mean_token_accuracy": 0.8343795016151843, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 5.577136514983351, |
|
"grad_norm": 0.07055240437970804, |
|
"learning_rate": 9.689769028385463e-06, |
|
"loss": 0.4951, |
|
"mean_token_accuracy": 0.837626593905515, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 5.588235294117647, |
|
"grad_norm": 0.06871556526072226, |
|
"learning_rate": 9.651005032974994e-06, |
|
"loss": 0.4886, |
|
"mean_token_accuracy": 0.8398810214660981, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 5.599334073251942, |
|
"grad_norm": 0.069682340612611, |
|
"learning_rate": 9.612246287431832e-06, |
|
"loss": 0.4945, |
|
"mean_token_accuracy": 0.8379422985314058, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 5.6104328523862375, |
|
"grad_norm": 0.06833821495209477, |
|
"learning_rate": 9.573493374796694e-06, |
|
"loss": 0.4808, |
|
"mean_token_accuracy": 0.8421461028149512, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 5.621531631520533, |
|
"grad_norm": 0.06636646706954583, |
|
"learning_rate": 9.534746878022533e-06, |
|
"loss": 0.4929, |
|
"mean_token_accuracy": 0.8380201477005709, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 5.632630410654828, |
|
"grad_norm": 0.06698898046345116, |
|
"learning_rate": 9.496007379965801e-06, |
|
"loss": 0.4959, |
|
"mean_token_accuracy": 0.837790504949783, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 5.6437291897891235, |
|
"grad_norm": 0.06805236937676203, |
|
"learning_rate": 9.457275463377665e-06, |
|
"loss": 0.498, |
|
"mean_token_accuracy": 0.8367196494144162, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 5.654827968923419, |
|
"grad_norm": 0.06705492549315094, |
|
"learning_rate": 9.418551710895243e-06, |
|
"loss": 0.4864, |
|
"mean_token_accuracy": 0.8409273808238578, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.665926748057713, |
|
"grad_norm": 0.07076074969261581, |
|
"learning_rate": 9.379836705032849e-06, |
|
"loss": 0.4864, |
|
"mean_token_accuracy": 0.8401968178066268, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 5.677025527192009, |
|
"grad_norm": 0.06659007567798325, |
|
"learning_rate": 9.341131028173215e-06, |
|
"loss": 0.4882, |
|
"mean_token_accuracy": 0.8395267262735404, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 5.688124306326304, |
|
"grad_norm": 0.06977970775580196, |
|
"learning_rate": 9.302435262558748e-06, |
|
"loss": 0.5151, |
|
"mean_token_accuracy": 0.8313035947119021, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 5.699223085460599, |
|
"grad_norm": 0.07036821942399733, |
|
"learning_rate": 9.263749990282753e-06, |
|
"loss": 0.4877, |
|
"mean_token_accuracy": 0.8402796498452879, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 5.710321864594895, |
|
"grad_norm": 0.06710082302155221, |
|
"learning_rate": 9.225075793280693e-06, |
|
"loss": 0.5081, |
|
"mean_token_accuracy": 0.8333654764490961, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 5.72142064372919, |
|
"grad_norm": 0.07006409892288358, |
|
"learning_rate": 9.18641325332142e-06, |
|
"loss": 0.5041, |
|
"mean_token_accuracy": 0.8346068076076595, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 5.732519422863485, |
|
"grad_norm": 0.06625593193600583, |
|
"learning_rate": 9.147762951998436e-06, |
|
"loss": 0.4975, |
|
"mean_token_accuracy": 0.8366931486178567, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 5.74361820199778, |
|
"grad_norm": 0.06676241932683132, |
|
"learning_rate": 9.109125470721141e-06, |
|
"loss": 0.4928, |
|
"mean_token_accuracy": 0.8382421368141244, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 5.754716981132075, |
|
"grad_norm": 0.0657279064724614, |
|
"learning_rate": 9.07050139070608e-06, |
|
"loss": 0.4939, |
|
"mean_token_accuracy": 0.8383761354079148, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 5.765815760266371, |
|
"grad_norm": 0.06661508771228203, |
|
"learning_rate": 9.03189129296821e-06, |
|
"loss": 0.4834, |
|
"mean_token_accuracy": 0.8413924101678466, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.765815760266371, |
|
"eval_loss": 0.7937665581703186, |
|
"eval_mean_token_accuracy": 0.7687533450104067, |
|
"eval_runtime": 2.5047, |
|
"eval_samples_per_second": 51.502, |
|
"eval_steps_per_second": 4.392, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.776914539400666, |
|
"grad_norm": 0.06972339544716691, |
|
"learning_rate": 8.993295758312155e-06, |
|
"loss": 0.4871, |
|
"mean_token_accuracy": 0.8403621459374608, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 5.788013318534961, |
|
"grad_norm": 0.06960404283948679, |
|
"learning_rate": 8.954715367323468e-06, |
|
"loss": 0.4793, |
|
"mean_token_accuracy": 0.8426401722260557, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 5.799112097669257, |
|
"grad_norm": 0.07020730896508283, |
|
"learning_rate": 8.916150700359896e-06, |
|
"loss": 0.4924, |
|
"mean_token_accuracy": 0.8382286290026955, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 5.810210876803552, |
|
"grad_norm": 0.06524987595853364, |
|
"learning_rate": 8.877602337542655e-06, |
|
"loss": 0.4835, |
|
"mean_token_accuracy": 0.8416452873609049, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 5.8213096559378465, |
|
"grad_norm": 0.06907599091717904, |
|
"learning_rate": 8.839070858747697e-06, |
|
"loss": 0.5007, |
|
"mean_token_accuracy": 0.8361772989154721, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 5.832408435072142, |
|
"grad_norm": 0.06957920316225238, |
|
"learning_rate": 8.800556843597002e-06, |
|
"loss": 0.5004, |
|
"mean_token_accuracy": 0.8360992024044208, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 5.843507214206437, |
|
"grad_norm": 0.07274736715088309, |
|
"learning_rate": 8.762060871449838e-06, |
|
"loss": 0.5003, |
|
"mean_token_accuracy": 0.8356531106891625, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 5.854605993340733, |
|
"grad_norm": 0.0749025930532487, |
|
"learning_rate": 8.723583521394054e-06, |
|
"loss": 0.4962, |
|
"mean_token_accuracy": 0.8370848915881123, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 5.865704772475028, |
|
"grad_norm": 0.07120547137676199, |
|
"learning_rate": 8.685125372237374e-06, |
|
"loss": 0.5145, |
|
"mean_token_accuracy": 0.8315248587134981, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 5.876803551609323, |
|
"grad_norm": 0.06848724166463453, |
|
"learning_rate": 8.646687002498692e-06, |
|
"loss": 0.4863, |
|
"mean_token_accuracy": 0.8405704017043709, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.887902330743618, |
|
"grad_norm": 0.0704312988672066, |
|
"learning_rate": 8.60826899039935e-06, |
|
"loss": 0.4954, |
|
"mean_token_accuracy": 0.8377861152224325, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 5.899001109877913, |
|
"grad_norm": 0.06953670462472285, |
|
"learning_rate": 8.569871913854458e-06, |
|
"loss": 0.4823, |
|
"mean_token_accuracy": 0.84187656853583, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.9100998890122085, |
|
"grad_norm": 0.0662886535942646, |
|
"learning_rate": 8.53149635046421e-06, |
|
"loss": 0.4954, |
|
"mean_token_accuracy": 0.8372553782052586, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 5.921198668146504, |
|
"grad_norm": 0.06315234294777407, |
|
"learning_rate": 8.49314287750517e-06, |
|
"loss": 0.4757, |
|
"mean_token_accuracy": 0.8433855066382703, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.932297447280799, |
|
"grad_norm": 0.06942216407055792, |
|
"learning_rate": 8.454812071921597e-06, |
|
"loss": 0.4877, |
|
"mean_token_accuracy": 0.8396264773743953, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 5.943396226415095, |
|
"grad_norm": 0.0691582315900679, |
|
"learning_rate": 8.416504510316774e-06, |
|
"loss": 0.484, |
|
"mean_token_accuracy": 0.8411113970529893, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.95449500554939, |
|
"grad_norm": 0.06769801918255437, |
|
"learning_rate": 8.378220768944328e-06, |
|
"loss": 0.4908, |
|
"mean_token_accuracy": 0.8392534567740544, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 5.965593784683685, |
|
"grad_norm": 0.0659122470311469, |
|
"learning_rate": 8.339961423699563e-06, |
|
"loss": 0.4759, |
|
"mean_token_accuracy": 0.843660765151712, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.97669256381798, |
|
"grad_norm": 0.06888231081448111, |
|
"learning_rate": 8.301727050110794e-06, |
|
"loss": 0.4993, |
|
"mean_token_accuracy": 0.8362201832443829, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 5.987791342952275, |
|
"grad_norm": 0.06698188670444734, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.4955, |
|
"mean_token_accuracy": 0.8371192889929862, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.987791342952275, |
|
"eval_loss": 0.7910561561584473, |
|
"eval_mean_token_accuracy": 0.7692369949186039, |
|
"eval_runtime": 2.5036, |
|
"eval_samples_per_second": 51.527, |
|
"eval_steps_per_second": 4.394, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.9988901220865705, |
|
"grad_norm": 0.06672049278308162, |
|
"learning_rate": 8.22533551812765e-06, |
|
"loss": 0.4758, |
|
"mean_token_accuracy": 0.8434463878747221, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 6.008879023307436, |
|
"grad_norm": 0.07710909975015609, |
|
"learning_rate": 8.187179508877086e-06, |
|
"loss": 0.446, |
|
"mean_token_accuracy": 0.8533357209356791, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 6.019977802441732, |
|
"grad_norm": 0.0771418028863615, |
|
"learning_rate": 8.149050769552856e-06, |
|
"loss": 0.4547, |
|
"mean_token_accuracy": 0.8494889642325584, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 6.031076581576027, |
|
"grad_norm": 0.07240683362402148, |
|
"learning_rate": 8.1109498737186e-06, |
|
"loss": 0.4373, |
|
"mean_token_accuracy": 0.8554890372187032, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 6.0421753607103215, |
|
"grad_norm": 0.07709077815156115, |
|
"learning_rate": 8.072877394519103e-06, |
|
"loss": 0.4496, |
|
"mean_token_accuracy": 0.8512469701533624, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 6.053274139844617, |
|
"grad_norm": 0.06941387115968416, |
|
"learning_rate": 8.034833904671698e-06, |
|
"loss": 0.4295, |
|
"mean_token_accuracy": 0.8584763547579994, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 6.064372918978912, |
|
"grad_norm": 0.07578900972120883, |
|
"learning_rate": 7.996819976457626e-06, |
|
"loss": 0.4365, |
|
"mean_token_accuracy": 0.855757515743538, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 6.0754716981132075, |
|
"grad_norm": 0.06934693627289155, |
|
"learning_rate": 7.958836181713445e-06, |
|
"loss": 0.4367, |
|
"mean_token_accuracy": 0.8556017916118088, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 6.086570477247503, |
|
"grad_norm": 0.07210016497005568, |
|
"learning_rate": 7.92088309182241e-06, |
|
"loss": 0.429, |
|
"mean_token_accuracy": 0.8580510382712909, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 6.097669256381798, |
|
"grad_norm": 0.07577767471629779, |
|
"learning_rate": 7.882961277705897e-06, |
|
"loss": 0.4362, |
|
"mean_token_accuracy": 0.8562088212016213, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 6.108768035516094, |
|
"grad_norm": 0.07179031519158982, |
|
"learning_rate": 7.845071309814802e-06, |
|
"loss": 0.4415, |
|
"mean_token_accuracy": 0.853879791220472, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 6.119866814650388, |
|
"grad_norm": 0.07041898999257495, |
|
"learning_rate": 7.807213758120965e-06, |
|
"loss": 0.4212, |
|
"mean_token_accuracy": 0.8610253997443506, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 6.130965593784683, |
|
"grad_norm": 0.06879152875025886, |
|
"learning_rate": 7.769389192108608e-06, |
|
"loss": 0.4319, |
|
"mean_token_accuracy": 0.8573816275209529, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 6.142064372918979, |
|
"grad_norm": 0.07185324061137859, |
|
"learning_rate": 7.731598180765732e-06, |
|
"loss": 0.4455, |
|
"mean_token_accuracy": 0.8524752572560315, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 6.153163152053274, |
|
"grad_norm": 0.07033926941192889, |
|
"learning_rate": 7.6938412925756e-06, |
|
"loss": 0.4433, |
|
"mean_token_accuracy": 0.853844047762491, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 6.1642619311875695, |
|
"grad_norm": 0.06915140786615713, |
|
"learning_rate": 7.656119095508155e-06, |
|
"loss": 0.4388, |
|
"mean_token_accuracy": 0.8552899831553875, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 6.175360710321865, |
|
"grad_norm": 0.06646950976574176, |
|
"learning_rate": 7.618432157011494e-06, |
|
"loss": 0.4287, |
|
"mean_token_accuracy": 0.8587684335286895, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 6.18645948945616, |
|
"grad_norm": 0.06954390622394957, |
|
"learning_rate": 7.580781044003324e-06, |
|
"loss": 0.4295, |
|
"mean_token_accuracy": 0.8582828818285677, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 6.197558268590455, |
|
"grad_norm": 0.07254257043936811, |
|
"learning_rate": 7.543166322862437e-06, |
|
"loss": 0.4333, |
|
"mean_token_accuracy": 0.8573370650863762, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 6.20865704772475, |
|
"grad_norm": 0.07422334950870361, |
|
"learning_rate": 7.505588559420188e-06, |
|
"loss": 0.4399, |
|
"mean_token_accuracy": 0.8540619797081618, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.20865704772475, |
|
"eval_loss": 0.8441078662872314, |
|
"eval_mean_token_accuracy": 0.7648808112159277, |
|
"eval_runtime": 2.5051, |
|
"eval_samples_per_second": 51.494, |
|
"eval_steps_per_second": 4.391, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 6.219755826859045, |
|
"grad_norm": 0.07396919663784399, |
|
"learning_rate": 7.468048318951983e-06, |
|
"loss": 0.4401, |
|
"mean_token_accuracy": 0.8546519363654996, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 6.230854605993341, |
|
"grad_norm": 0.0713243210685145, |
|
"learning_rate": 7.430546166168781e-06, |
|
"loss": 0.448, |
|
"mean_token_accuracy": 0.8519195241739214, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 6.241953385127636, |
|
"grad_norm": 0.07231708357633806, |
|
"learning_rate": 7.393082665208587e-06, |
|
"loss": 0.4448, |
|
"mean_token_accuracy": 0.8527369511433074, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 6.2530521642619314, |
|
"grad_norm": 0.06794319280745313, |
|
"learning_rate": 7.355658379627981e-06, |
|
"loss": 0.406, |
|
"mean_token_accuracy": 0.8656115445513048, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 6.264150943396227, |
|
"grad_norm": 0.07345780378767434, |
|
"learning_rate": 7.3182738723936255e-06, |
|
"loss": 0.4447, |
|
"mean_token_accuracy": 0.8527881707515051, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 6.275249722530521, |
|
"grad_norm": 0.06997362095232236, |
|
"learning_rate": 7.280929705873818e-06, |
|
"loss": 0.4238, |
|
"mean_token_accuracy": 0.8601466268048498, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 6.286348501664817, |
|
"grad_norm": 0.07131948856861685, |
|
"learning_rate": 7.243626441830009e-06, |
|
"loss": 0.4241, |
|
"mean_token_accuracy": 0.859541994237266, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 6.297447280799112, |
|
"grad_norm": 0.06944083932451703, |
|
"learning_rate": 7.206364641408358e-06, |
|
"loss": 0.4391, |
|
"mean_token_accuracy": 0.8550273221116059, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 6.308546059933407, |
|
"grad_norm": 0.07325056316103613, |
|
"learning_rate": 7.169144865131297e-06, |
|
"loss": 0.4455, |
|
"mean_token_accuracy": 0.8528142965379706, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 6.319644839067703, |
|
"grad_norm": 0.06840803745235291, |
|
"learning_rate": 7.131967672889101e-06, |
|
"loss": 0.4356, |
|
"mean_token_accuracy": 0.8564159584911453, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 6.330743618201998, |
|
"grad_norm": 0.07172434553638189, |
|
"learning_rate": 7.094833623931455e-06, |
|
"loss": 0.4474, |
|
"mean_token_accuracy": 0.8522211836045799, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 6.341842397336293, |
|
"grad_norm": 0.07657883060054264, |
|
"learning_rate": 7.057743276859048e-06, |
|
"loss": 0.4406, |
|
"mean_token_accuracy": 0.8546443812030414, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 6.352941176470588, |
|
"grad_norm": 0.07406808152539951, |
|
"learning_rate": 7.02069718961518e-06, |
|
"loss": 0.4506, |
|
"mean_token_accuracy": 0.8510959785080605, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 6.364039955604883, |
|
"grad_norm": 0.06875336053765568, |
|
"learning_rate": 6.983695919477346e-06, |
|
"loss": 0.4437, |
|
"mean_token_accuracy": 0.8531396837999757, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 6.375138734739179, |
|
"grad_norm": 0.07057356421532587, |
|
"learning_rate": 6.94674002304887e-06, |
|
"loss": 0.4501, |
|
"mean_token_accuracy": 0.8510574126425563, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 6.386237513873474, |
|
"grad_norm": 0.07071385124297858, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 0.4374, |
|
"mean_token_accuracy": 0.8555903780776044, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 6.397336293007769, |
|
"grad_norm": 0.07085634003180746, |
|
"learning_rate": 6.872966574312182e-06, |
|
"loss": 0.436, |
|
"mean_token_accuracy": 0.8559481393760334, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 6.408435072142065, |
|
"grad_norm": 0.06979867330158783, |
|
"learning_rate": 6.836150131764434e-06, |
|
"loss": 0.4486, |
|
"mean_token_accuracy": 0.8521716226340693, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 6.41953385127636, |
|
"grad_norm": 0.06904944422094117, |
|
"learning_rate": 6.799381282430284e-06, |
|
"loss": 0.4489, |
|
"mean_token_accuracy": 0.8513674182168313, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 6.4306326304106545, |
|
"grad_norm": 0.06827799167873423, |
|
"learning_rate": 6.762660579416791e-06, |
|
"loss": 0.4318, |
|
"mean_token_accuracy": 0.8570870173989833, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 6.4306326304106545, |
|
"eval_loss": 0.8432488441467285, |
|
"eval_mean_token_accuracy": 0.7648142474990643, |
|
"eval_runtime": 2.5092, |
|
"eval_samples_per_second": 51.411, |
|
"eval_steps_per_second": 4.384, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 6.44173140954495, |
|
"grad_norm": 0.07038913878573082, |
|
"learning_rate": 6.725988575106757e-06, |
|
"loss": 0.435, |
|
"mean_token_accuracy": 0.8566420371191622, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 6.452830188679245, |
|
"grad_norm": 0.07021550296983327, |
|
"learning_rate": 6.689365821150421e-06, |
|
"loss": 0.4424, |
|
"mean_token_accuracy": 0.8539050033549904, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 6.4639289678135405, |
|
"grad_norm": 0.07022203073927909, |
|
"learning_rate": 6.652792868457159e-06, |
|
"loss": 0.4375, |
|
"mean_token_accuracy": 0.8550374690978672, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 6.475027746947836, |
|
"grad_norm": 0.07214969010434828, |
|
"learning_rate": 6.61627026718719e-06, |
|
"loss": 0.4453, |
|
"mean_token_accuracy": 0.8527398643984709, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 6.486126526082131, |
|
"grad_norm": 0.0718849792187647, |
|
"learning_rate": 6.579798566743314e-06, |
|
"loss": 0.4316, |
|
"mean_token_accuracy": 0.8571454376520589, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 6.497225305216427, |
|
"grad_norm": 0.0687959823292499, |
|
"learning_rate": 6.543378315762634e-06, |
|
"loss": 0.4457, |
|
"mean_token_accuracy": 0.8522867903149901, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 6.508324084350721, |
|
"grad_norm": 0.06939011310417309, |
|
"learning_rate": 6.50701006210831e-06, |
|
"loss": 0.418, |
|
"mean_token_accuracy": 0.8624145286054207, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 6.519422863485016, |
|
"grad_norm": 0.06919702181469813, |
|
"learning_rate": 6.4706943528613135e-06, |
|
"loss": 0.4405, |
|
"mean_token_accuracy": 0.8544047091826522, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 6.530521642619312, |
|
"grad_norm": 0.07090082072019381, |
|
"learning_rate": 6.434431734312201e-06, |
|
"loss": 0.451, |
|
"mean_token_accuracy": 0.8512350567008408, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 6.541620421753607, |
|
"grad_norm": 0.07311782278151045, |
|
"learning_rate": 6.3982227519528986e-06, |
|
"loss": 0.4372, |
|
"mean_token_accuracy": 0.8551011624954604, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 6.5527192008879025, |
|
"grad_norm": 0.06853793673243276, |
|
"learning_rate": 6.362067950468489e-06, |
|
"loss": 0.4289, |
|
"mean_token_accuracy": 0.8585236146309112, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 6.563817980022198, |
|
"grad_norm": 0.07293011821392871, |
|
"learning_rate": 6.3259678737290174e-06, |
|
"loss": 0.4346, |
|
"mean_token_accuracy": 0.8561082904244961, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 6.574916759156492, |
|
"grad_norm": 0.06978086285931945, |
|
"learning_rate": 6.2899230647813315e-06, |
|
"loss": 0.4361, |
|
"mean_token_accuracy": 0.8558985457340178, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 6.586015538290788, |
|
"grad_norm": 0.07112662316350266, |
|
"learning_rate": 6.25393406584088e-06, |
|
"loss": 0.4413, |
|
"mean_token_accuracy": 0.8543134405793346, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 6.597114317425083, |
|
"grad_norm": 0.07262725739836273, |
|
"learning_rate": 6.218001418283577e-06, |
|
"loss": 0.4589, |
|
"mean_token_accuracy": 0.8485135772896658, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 6.608213096559378, |
|
"grad_norm": 0.07293978841730346, |
|
"learning_rate": 6.18212566263765e-06, |
|
"loss": 0.4462, |
|
"mean_token_accuracy": 0.8527901022563297, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 6.619311875693674, |
|
"grad_norm": 0.06869051011062992, |
|
"learning_rate": 6.146307338575519e-06, |
|
"loss": 0.4365, |
|
"mean_token_accuracy": 0.8560732349752435, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 6.630410654827969, |
|
"grad_norm": 0.07011481935789973, |
|
"learning_rate": 6.110546984905661e-06, |
|
"loss": 0.4325, |
|
"mean_token_accuracy": 0.8569647960292948, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 6.6415094339622645, |
|
"grad_norm": 0.06867347376570304, |
|
"learning_rate": 6.074845139564529e-06, |
|
"loss": 0.4482, |
|
"mean_token_accuracy": 0.8520339802204481, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 6.65260821309656, |
|
"grad_norm": 0.07167935814663243, |
|
"learning_rate": 6.039202339608432e-06, |
|
"loss": 0.434, |
|
"mean_token_accuracy": 0.8564132679138142, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.65260821309656, |
|
"eval_loss": 0.8441133499145508, |
|
"eval_mean_token_accuracy": 0.765247487379611, |
|
"eval_runtime": 2.5087, |
|
"eval_samples_per_second": 51.421, |
|
"eval_steps_per_second": 4.385, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.663706992230854, |
|
"grad_norm": 0.07186071828643208, |
|
"learning_rate": 6.00361912120548e-06, |
|
"loss": 0.4452, |
|
"mean_token_accuracy": 0.8528585061113889, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 6.67480577136515, |
|
"grad_norm": 0.06855121014435227, |
|
"learning_rate": 5.9680960196274995e-06, |
|
"loss": 0.4325, |
|
"mean_token_accuracy": 0.8572836863361127, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 6.685904550499445, |
|
"grad_norm": 0.07110585699367894, |
|
"learning_rate": 5.932633569242e-06, |
|
"loss": 0.4384, |
|
"mean_token_accuracy": 0.8553335978565515, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 6.69700332963374, |
|
"grad_norm": 0.07085757772624833, |
|
"learning_rate": 5.89723230350412e-06, |
|
"loss": 0.4379, |
|
"mean_token_accuracy": 0.8555847653374402, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 6.708102108768036, |
|
"grad_norm": 0.07063511706597377, |
|
"learning_rate": 5.8618927549486095e-06, |
|
"loss": 0.4468, |
|
"mean_token_accuracy": 0.8523510627335507, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 6.719200887902331, |
|
"grad_norm": 0.07187844503905214, |
|
"learning_rate": 5.8266154551818225e-06, |
|
"loss": 0.4527, |
|
"mean_token_accuracy": 0.8498687190225727, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 6.7302996670366255, |
|
"grad_norm": 0.07058997469604424, |
|
"learning_rate": 5.79140093487371e-06, |
|
"loss": 0.4435, |
|
"mean_token_accuracy": 0.8533467542770972, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 6.741398446170921, |
|
"grad_norm": 0.0717038610864614, |
|
"learning_rate": 5.756249723749847e-06, |
|
"loss": 0.4437, |
|
"mean_token_accuracy": 0.8533376800900824, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 6.752497225305216, |
|
"grad_norm": 0.06801675759895583, |
|
"learning_rate": 5.72116235058346e-06, |
|
"loss": 0.4322, |
|
"mean_token_accuracy": 0.8569636338319049, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 6.763596004439512, |
|
"grad_norm": 0.06801099401082193, |
|
"learning_rate": 5.686139343187468e-06, |
|
"loss": 0.4346, |
|
"mean_token_accuracy": 0.8563081469335604, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.774694783573807, |
|
"grad_norm": 0.06796075420482334, |
|
"learning_rate": 5.651181228406554e-06, |
|
"loss": 0.4511, |
|
"mean_token_accuracy": 0.8507139912972462, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 6.785793562708102, |
|
"grad_norm": 0.06858342437191409, |
|
"learning_rate": 5.616288532109225e-06, |
|
"loss": 0.4476, |
|
"mean_token_accuracy": 0.85227963226098, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 6.796892341842398, |
|
"grad_norm": 0.07033819528015756, |
|
"learning_rate": 5.581461779179924e-06, |
|
"loss": 0.4338, |
|
"mean_token_accuracy": 0.8568131935312552, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 6.807991120976693, |
|
"grad_norm": 0.0713035107971796, |
|
"learning_rate": 5.5467014935111065e-06, |
|
"loss": 0.4355, |
|
"mean_token_accuracy": 0.8558951804357434, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 6.8190899001109875, |
|
"grad_norm": 0.07131150385903263, |
|
"learning_rate": 5.512008197995379e-06, |
|
"loss": 0.4564, |
|
"mean_token_accuracy": 0.8497975681079393, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 6.830188679245283, |
|
"grad_norm": 0.06817277614907659, |
|
"learning_rate": 5.477382414517625e-06, |
|
"loss": 0.4316, |
|
"mean_token_accuracy": 0.8580683047832622, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 6.841287458379578, |
|
"grad_norm": 0.07057273044950617, |
|
"learning_rate": 5.442824663947157e-06, |
|
"loss": 0.4616, |
|
"mean_token_accuracy": 0.8475735324796743, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 6.8523862375138735, |
|
"grad_norm": 0.07036554647139312, |
|
"learning_rate": 5.4083354661298816e-06, |
|
"loss": 0.4498, |
|
"mean_token_accuracy": 0.8521374369729602, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 6.863485016648169, |
|
"grad_norm": 0.07222437693843485, |
|
"learning_rate": 5.373915339880484e-06, |
|
"loss": 0.4292, |
|
"mean_token_accuracy": 0.8582999085532814, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 6.874583795782464, |
|
"grad_norm": 0.072182918394675, |
|
"learning_rate": 5.339564802974615e-06, |
|
"loss": 0.4484, |
|
"mean_token_accuracy": 0.8518725421130162, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.874583795782464, |
|
"eval_loss": 0.8406283259391785, |
|
"eval_mean_token_accuracy": 0.7652879746380005, |
|
"eval_runtime": 2.5059, |
|
"eval_samples_per_second": 51.479, |
|
"eval_steps_per_second": 4.39, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.885682574916759, |
|
"grad_norm": 0.07096548464795761, |
|
"learning_rate": 5.305284372141095e-06, |
|
"loss": 0.4347, |
|
"mean_token_accuracy": 0.8562329599216175, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 6.896781354051054, |
|
"grad_norm": 0.06975587960479132, |
|
"learning_rate": 5.271074563054167e-06, |
|
"loss": 0.4571, |
|
"mean_token_accuracy": 0.8489463468135968, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 6.907880133185349, |
|
"grad_norm": 0.06741935931261574, |
|
"learning_rate": 5.236935890325717e-06, |
|
"loss": 0.4323, |
|
"mean_token_accuracy": 0.8573877517767727, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 6.918978912319645, |
|
"grad_norm": 0.07380361466603937, |
|
"learning_rate": 5.202868867497542e-06, |
|
"loss": 0.4619, |
|
"mean_token_accuracy": 0.8474778792524758, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 6.93007769145394, |
|
"grad_norm": 0.07025381857611887, |
|
"learning_rate": 5.168874007033615e-06, |
|
"loss": 0.4528, |
|
"mean_token_accuracy": 0.851322918006027, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 6.9411764705882355, |
|
"grad_norm": 0.06752810608010477, |
|
"learning_rate": 5.134951820312402e-06, |
|
"loss": 0.4261, |
|
"mean_token_accuracy": 0.8592220693486257, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 6.952275249722531, |
|
"grad_norm": 0.07028173714769523, |
|
"learning_rate": 5.101102817619132e-06, |
|
"loss": 0.4289, |
|
"mean_token_accuracy": 0.8581184918015656, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 6.963374028856826, |
|
"grad_norm": 0.07055892240185084, |
|
"learning_rate": 5.067327508138148e-06, |
|
"loss": 0.4405, |
|
"mean_token_accuracy": 0.8548007954650545, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 6.974472807991121, |
|
"grad_norm": 0.06971017906950491, |
|
"learning_rate": 5.033626399945241e-06, |
|
"loss": 0.4558, |
|
"mean_token_accuracy": 0.8490455064016939, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 6.985571587125416, |
|
"grad_norm": 0.07286514586275572, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.4277, |
|
"mean_token_accuracy": 0.8587458553947747, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.996670366259711, |
|
"grad_norm": 0.07098175478184185, |
|
"learning_rate": 4.9664488141382026e-06, |
|
"loss": 0.4247, |
|
"mean_token_accuracy": 0.8598055136686054, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 7.006659267480577, |
|
"grad_norm": 0.07629907033137776, |
|
"learning_rate": 4.932973347064177e-06, |
|
"loss": 0.4125, |
|
"mean_token_accuracy": 0.8638759362274172, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 7.017758046614873, |
|
"grad_norm": 0.08557497683687372, |
|
"learning_rate": 4.899574102343247e-06, |
|
"loss": 0.4068, |
|
"mean_token_accuracy": 0.8653847908917432, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 7.028856825749168, |
|
"grad_norm": 0.07500777155917483, |
|
"learning_rate": 4.8662515823941255e-06, |
|
"loss": 0.3952, |
|
"mean_token_accuracy": 0.8694597047997366, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 7.039955604883462, |
|
"grad_norm": 0.07125380371706241, |
|
"learning_rate": 4.8330062884813714e-06, |
|
"loss": 0.3797, |
|
"mean_token_accuracy": 0.8746208020590464, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 7.051054384017758, |
|
"grad_norm": 0.0725012432074861, |
|
"learning_rate": 4.799838720707847e-06, |
|
"loss": 0.3849, |
|
"mean_token_accuracy": 0.8724296658830342, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 7.062153163152053, |
|
"grad_norm": 0.07410286674955183, |
|
"learning_rate": 4.766749378007193e-06, |
|
"loss": 0.3871, |
|
"mean_token_accuracy": 0.8719934430139469, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 7.0732519422863485, |
|
"grad_norm": 0.0711137297349784, |
|
"learning_rate": 4.733738758136327e-06, |
|
"loss": 0.3976, |
|
"mean_token_accuracy": 0.8683940340664813, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 7.084350721420644, |
|
"grad_norm": 0.07262361207927566, |
|
"learning_rate": 4.700807357667953e-06, |
|
"loss": 0.3978, |
|
"mean_token_accuracy": 0.8685349502275285, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 7.095449500554939, |
|
"grad_norm": 0.07309131618569278, |
|
"learning_rate": 4.66795567198309e-06, |
|
"loss": 0.4045, |
|
"mean_token_accuracy": 0.8659610376788004, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 7.095449500554939, |
|
"eval_loss": 0.8952147960662842, |
|
"eval_mean_token_accuracy": 0.7615468810098737, |
|
"eval_runtime": 2.5036, |
|
"eval_samples_per_second": 51.525, |
|
"eval_steps_per_second": 4.394, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 7.1065482796892345, |
|
"grad_norm": 0.0721920803563012, |
|
"learning_rate": 4.635184195263624e-06, |
|
"loss": 0.3966, |
|
"mean_token_accuracy": 0.8681120501745102, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 7.117647058823529, |
|
"grad_norm": 0.07285210208532443, |
|
"learning_rate": 4.6024934204848745e-06, |
|
"loss": 0.3961, |
|
"mean_token_accuracy": 0.8689753794301351, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 7.128745837957824, |
|
"grad_norm": 0.07326422808489647, |
|
"learning_rate": 4.56988383940817e-06, |
|
"loss": 0.4076, |
|
"mean_token_accuracy": 0.8650006041676468, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 7.13984461709212, |
|
"grad_norm": 0.0725191006821199, |
|
"learning_rate": 4.537355942573464e-06, |
|
"loss": 0.4018, |
|
"mean_token_accuracy": 0.8670280432137408, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 7.150943396226415, |
|
"grad_norm": 0.07203104950103884, |
|
"learning_rate": 4.504910219291941e-06, |
|
"loss": 0.3963, |
|
"mean_token_accuracy": 0.8690148347435971, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 7.16204217536071, |
|
"grad_norm": 0.07639160145123591, |
|
"learning_rate": 4.472547157638674e-06, |
|
"loss": 0.389, |
|
"mean_token_accuracy": 0.8710246712104658, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 7.173140954495006, |
|
"grad_norm": 0.07324338020343289, |
|
"learning_rate": 4.4402672444452664e-06, |
|
"loss": 0.3827, |
|
"mean_token_accuracy": 0.8731911923413589, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 7.184239733629301, |
|
"grad_norm": 0.07087742268240406, |
|
"learning_rate": 4.408070965292534e-06, |
|
"loss": 0.4053, |
|
"mean_token_accuracy": 0.8657608620764454, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 7.195338512763596, |
|
"grad_norm": 0.07201590538776476, |
|
"learning_rate": 4.375958804503201e-06, |
|
"loss": 0.4157, |
|
"mean_token_accuracy": 0.8624986168900755, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 7.206437291897891, |
|
"grad_norm": 0.06999596265135467, |
|
"learning_rate": 4.343931245134616e-06, |
|
"loss": 0.402, |
|
"mean_token_accuracy": 0.8665917993204305, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 7.217536071032186, |
|
"grad_norm": 0.07182851184142412, |
|
"learning_rate": 4.311988768971484e-06, |
|
"loss": 0.4015, |
|
"mean_token_accuracy": 0.8674501943469238, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 7.228634850166482, |
|
"grad_norm": 0.07362845834328512, |
|
"learning_rate": 4.2801318565186165e-06, |
|
"loss": 0.394, |
|
"mean_token_accuracy": 0.8695416479952799, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 7.239733629300777, |
|
"grad_norm": 0.0717682552415512, |
|
"learning_rate": 4.2483609869937115e-06, |
|
"loss": 0.4103, |
|
"mean_token_accuracy": 0.8642538533621753, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 7.250832408435072, |
|
"grad_norm": 0.07272893232968056, |
|
"learning_rate": 4.216676638320135e-06, |
|
"loss": 0.4131, |
|
"mean_token_accuracy": 0.8626869196808453, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 7.261931187569368, |
|
"grad_norm": 0.07135546730180682, |
|
"learning_rate": 4.185079287119733e-06, |
|
"loss": 0.4005, |
|
"mean_token_accuracy": 0.8674417568639401, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 7.273029966703662, |
|
"grad_norm": 0.0762468607841131, |
|
"learning_rate": 4.15356940870567e-06, |
|
"loss": 0.4035, |
|
"mean_token_accuracy": 0.8656978705707024, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 7.2841287458379576, |
|
"grad_norm": 0.07186567403941488, |
|
"learning_rate": 4.12214747707527e-06, |
|
"loss": 0.41, |
|
"mean_token_accuracy": 0.8647636654184666, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 7.295227524972253, |
|
"grad_norm": 0.07355942774719212, |
|
"learning_rate": 4.090813964902889e-06, |
|
"loss": 0.3943, |
|
"mean_token_accuracy": 0.8696648856429228, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 7.306326304106548, |
|
"grad_norm": 0.07215730561355123, |
|
"learning_rate": 4.059569343532809e-06, |
|
"loss": 0.3963, |
|
"mean_token_accuracy": 0.8685829093160791, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 7.317425083240844, |
|
"grad_norm": 0.07430099594156658, |
|
"learning_rate": 4.028414082972141e-06, |
|
"loss": 0.391, |
|
"mean_token_accuracy": 0.8703857625330743, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 7.317425083240844, |
|
"eval_loss": 0.8958173394203186, |
|
"eval_mean_token_accuracy": 0.7614542924776546, |
|
"eval_runtime": 2.504, |
|
"eval_samples_per_second": 51.518, |
|
"eval_steps_per_second": 4.393, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 7.328523862375139, |
|
"grad_norm": 0.0756589764512869, |
|
"learning_rate": 3.997348651883757e-06, |
|
"loss": 0.4024, |
|
"mean_token_accuracy": 0.8669605659453884, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 7.339622641509434, |
|
"grad_norm": 0.0732762921056893, |
|
"learning_rate": 3.966373517579244e-06, |
|
"loss": 0.4082, |
|
"mean_token_accuracy": 0.8643103527207086, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 7.350721420643729, |
|
"grad_norm": 0.07257949282418803, |
|
"learning_rate": 3.9354891460118695e-06, |
|
"loss": 0.4041, |
|
"mean_token_accuracy": 0.8662132721963183, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 7.361820199778024, |
|
"grad_norm": 0.07505384362015047, |
|
"learning_rate": 3.904696001769571e-06, |
|
"loss": 0.3963, |
|
"mean_token_accuracy": 0.8687588550722725, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 7.3729189789123195, |
|
"grad_norm": 0.07238130150060536, |
|
"learning_rate": 3.873994548067972e-06, |
|
"loss": 0.3903, |
|
"mean_token_accuracy": 0.8709913260405348, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 7.384017758046615, |
|
"grad_norm": 0.07413721682804225, |
|
"learning_rate": 3.8433852467434175e-06, |
|
"loss": 0.3998, |
|
"mean_token_accuracy": 0.8674460460383848, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 7.39511653718091, |
|
"grad_norm": 0.07312004025850016, |
|
"learning_rate": 3.8128685582460144e-06, |
|
"loss": 0.4236, |
|
"mean_token_accuracy": 0.8600225688180017, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 7.406215316315206, |
|
"grad_norm": 0.0695191654744446, |
|
"learning_rate": 3.7824449416327123e-06, |
|
"loss": 0.3883, |
|
"mean_token_accuracy": 0.8713507670108156, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 7.417314095449501, |
|
"grad_norm": 0.07317832388015687, |
|
"learning_rate": 3.7521148545604003e-06, |
|
"loss": 0.3961, |
|
"mean_token_accuracy": 0.8689663506413468, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 7.428412874583795, |
|
"grad_norm": 0.0714931532503113, |
|
"learning_rate": 3.7218787532790167e-06, |
|
"loss": 0.4077, |
|
"mean_token_accuracy": 0.865370666345699, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 7.439511653718091, |
|
"grad_norm": 0.07325374247465251, |
|
"learning_rate": 3.6917370926246877e-06, |
|
"loss": 0.3972, |
|
"mean_token_accuracy": 0.8682190031253774, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 7.450610432852386, |
|
"grad_norm": 0.07512751267328739, |
|
"learning_rate": 3.661690326012897e-06, |
|
"loss": 0.409, |
|
"mean_token_accuracy": 0.8650642432358119, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 7.4617092119866815, |
|
"grad_norm": 0.07195371705801658, |
|
"learning_rate": 3.631738905431641e-06, |
|
"loss": 0.3962, |
|
"mean_token_accuracy": 0.8694184936368143, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 7.472807991120977, |
|
"grad_norm": 0.0729446774567736, |
|
"learning_rate": 3.6018832814346516e-06, |
|
"loss": 0.4125, |
|
"mean_token_accuracy": 0.8633514638223495, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 7.483906770255272, |
|
"grad_norm": 0.07561542233758643, |
|
"learning_rate": 3.5721239031346067e-06, |
|
"loss": 0.4159, |
|
"mean_token_accuracy": 0.8619894212817206, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 7.4950055493895675, |
|
"grad_norm": 0.07252151252018872, |
|
"learning_rate": 3.542461218196379e-06, |
|
"loss": 0.3939, |
|
"mean_token_accuracy": 0.8697520947244627, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 7.506104328523862, |
|
"grad_norm": 0.07293598599568907, |
|
"learning_rate": 3.5128956728303e-06, |
|
"loss": 0.4028, |
|
"mean_token_accuracy": 0.8664042150786312, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 7.517203107658157, |
|
"grad_norm": 0.07266488282445482, |
|
"learning_rate": 3.483427711785449e-06, |
|
"loss": 0.3944, |
|
"mean_token_accuracy": 0.8695094069973557, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 7.528301886792453, |
|
"grad_norm": 0.07308961587910096, |
|
"learning_rate": 3.454057778342963e-06, |
|
"loss": 0.3937, |
|
"mean_token_accuracy": 0.8693873915221115, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 7.539400665926748, |
|
"grad_norm": 0.07427508856731285, |
|
"learning_rate": 3.424786314309365e-06, |
|
"loss": 0.3976, |
|
"mean_token_accuracy": 0.8685723175278763, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 7.539400665926748, |
|
"eval_loss": 0.8988686203956604, |
|
"eval_mean_token_accuracy": 0.7612407115662695, |
|
"eval_runtime": 2.5058, |
|
"eval_samples_per_second": 51.481, |
|
"eval_steps_per_second": 4.39, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 7.550499445061043, |
|
"grad_norm": 0.07215047426411422, |
|
"learning_rate": 3.3956137600099248e-06, |
|
"loss": 0.3961, |
|
"mean_token_accuracy": 0.8689609143492317, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 7.561598224195339, |
|
"grad_norm": 0.07388260988647496, |
|
"learning_rate": 3.3665405542820283e-06, |
|
"loss": 0.3992, |
|
"mean_token_accuracy": 0.8683887719751194, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 7.572697003329633, |
|
"grad_norm": 0.07356747008095715, |
|
"learning_rate": 3.337567134468579e-06, |
|
"loss": 0.4172, |
|
"mean_token_accuracy": 0.8613869632526061, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 7.583795782463929, |
|
"grad_norm": 0.0720764629548677, |
|
"learning_rate": 3.308693936411421e-06, |
|
"loss": 0.397, |
|
"mean_token_accuracy": 0.8684071063453782, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 7.594894561598224, |
|
"grad_norm": 0.0708557636265316, |
|
"learning_rate": 3.279921394444776e-06, |
|
"loss": 0.3961, |
|
"mean_token_accuracy": 0.8690219002022339, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 7.605993340732519, |
|
"grad_norm": 0.07196094239780987, |
|
"learning_rate": 3.2512499413887255e-06, |
|
"loss": 0.4099, |
|
"mean_token_accuracy": 0.8641867909633174, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 7.617092119866815, |
|
"grad_norm": 0.07170030139792326, |
|
"learning_rate": 3.222680008542678e-06, |
|
"loss": 0.3897, |
|
"mean_token_accuracy": 0.8706013772529104, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 7.62819089900111, |
|
"grad_norm": 0.0725499574373253, |
|
"learning_rate": 3.1942120256788966e-06, |
|
"loss": 0.4096, |
|
"mean_token_accuracy": 0.8641443370530075, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 7.639289678135405, |
|
"grad_norm": 0.07550535184165293, |
|
"learning_rate": 3.1658464210360285e-06, |
|
"loss": 0.3934, |
|
"mean_token_accuracy": 0.8700940005126734, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 7.650388457269701, |
|
"grad_norm": 0.07331647230148707, |
|
"learning_rate": 3.1375836213126653e-06, |
|
"loss": 0.4182, |
|
"mean_token_accuracy": 0.8615029840680271, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 7.661487236403995, |
|
"grad_norm": 0.07266758714245847, |
|
"learning_rate": 3.10942405166092e-06, |
|
"loss": 0.3995, |
|
"mean_token_accuracy": 0.8675580745382989, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 7.672586015538291, |
|
"grad_norm": 0.07202168132939574, |
|
"learning_rate": 3.081368135680041e-06, |
|
"loss": 0.41, |
|
"mean_token_accuracy": 0.8646123888948718, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 7.683684794672586, |
|
"grad_norm": 0.07108514628989722, |
|
"learning_rate": 3.0534162954100264e-06, |
|
"loss": 0.4053, |
|
"mean_token_accuracy": 0.8659444203679675, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 7.694783573806881, |
|
"grad_norm": 0.0724262143342174, |
|
"learning_rate": 3.0255689513252873e-06, |
|
"loss": 0.4119, |
|
"mean_token_accuracy": 0.8638014320646695, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 7.705882352941177, |
|
"grad_norm": 0.07001654568154396, |
|
"learning_rate": 2.9978265223283152e-06, |
|
"loss": 0.4008, |
|
"mean_token_accuracy": 0.8676831414391628, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 7.716981132075472, |
|
"grad_norm": 0.07167793631617325, |
|
"learning_rate": 2.970189425743383e-06, |
|
"loss": 0.3911, |
|
"mean_token_accuracy": 0.8705106921301781, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 7.7280799112097665, |
|
"grad_norm": 0.07293712480603332, |
|
"learning_rate": 2.94265807731027e-06, |
|
"loss": 0.3986, |
|
"mean_token_accuracy": 0.8681102444656519, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 7.739178690344062, |
|
"grad_norm": 0.0733576527410462, |
|
"learning_rate": 2.9152328911780027e-06, |
|
"loss": 0.4144, |
|
"mean_token_accuracy": 0.862807001709438, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 7.750277469478357, |
|
"grad_norm": 0.07182334518627545, |
|
"learning_rate": 2.8879142798986293e-06, |
|
"loss": 0.4062, |
|
"mean_token_accuracy": 0.8654581017683796, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 7.7613762486126525, |
|
"grad_norm": 0.07149491856929646, |
|
"learning_rate": 2.8607026544210115e-06, |
|
"loss": 0.3852, |
|
"mean_token_accuracy": 0.8726495300506489, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.7613762486126525, |
|
"eval_loss": 0.8958276510238647, |
|
"eval_mean_token_accuracy": 0.7615317905610097, |
|
"eval_runtime": 2.5054, |
|
"eval_samples_per_second": 51.489, |
|
"eval_steps_per_second": 4.391, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.772475027746948, |
|
"grad_norm": 0.07205023198542661, |
|
"learning_rate": 2.8335984240846424e-06, |
|
"loss": 0.4032, |
|
"mean_token_accuracy": 0.8664543028489167, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 7.783573806881243, |
|
"grad_norm": 0.07001518219013764, |
|
"learning_rate": 2.8066019966134907e-06, |
|
"loss": 0.3978, |
|
"mean_token_accuracy": 0.8678734443119132, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 7.794672586015539, |
|
"grad_norm": 0.07060004533338061, |
|
"learning_rate": 2.779713778109867e-06, |
|
"loss": 0.4032, |
|
"mean_token_accuracy": 0.8667982857834049, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 7.805771365149834, |
|
"grad_norm": 0.0692844958885861, |
|
"learning_rate": 2.7529341730483115e-06, |
|
"loss": 0.4127, |
|
"mean_token_accuracy": 0.8629632311173948, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 7.816870144284128, |
|
"grad_norm": 0.0729157537799244, |
|
"learning_rate": 2.726263584269513e-06, |
|
"loss": 0.4044, |
|
"mean_token_accuracy": 0.86610341023193, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 7.827968923418424, |
|
"grad_norm": 0.0723306129297517, |
|
"learning_rate": 2.6997024129742544e-06, |
|
"loss": 0.4025, |
|
"mean_token_accuracy": 0.8666444767653877, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 7.839067702552719, |
|
"grad_norm": 0.07329107603229254, |
|
"learning_rate": 2.6732510587173645e-06, |
|
"loss": 0.4033, |
|
"mean_token_accuracy": 0.8662954720595669, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 7.8501664816870145, |
|
"grad_norm": 0.07332189134275487, |
|
"learning_rate": 2.6469099194017144e-06, |
|
"loss": 0.3835, |
|
"mean_token_accuracy": 0.8732789802263794, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 7.86126526082131, |
|
"grad_norm": 0.07209594577795633, |
|
"learning_rate": 2.620679391272236e-06, |
|
"loss": 0.4079, |
|
"mean_token_accuracy": 0.8645226582422317, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 7.872364039955605, |
|
"grad_norm": 0.07114236658659995, |
|
"learning_rate": 2.594559868909956e-06, |
|
"loss": 0.3842, |
|
"mean_token_accuracy": 0.873175329490459, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.8834628190899, |
|
"grad_norm": 0.07221931580949754, |
|
"learning_rate": 2.5685517452260566e-06, |
|
"loss": 0.3987, |
|
"mean_token_accuracy": 0.8678625176266003, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 7.894561598224195, |
|
"grad_norm": 0.0714379162963992, |
|
"learning_rate": 2.542655411455982e-06, |
|
"loss": 0.393, |
|
"mean_token_accuracy": 0.8699922866936953, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 7.90566037735849, |
|
"grad_norm": 0.07121360401963349, |
|
"learning_rate": 2.5168712571535305e-06, |
|
"loss": 0.4037, |
|
"mean_token_accuracy": 0.8662862219762447, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 7.916759156492786, |
|
"grad_norm": 0.07302661140425189, |
|
"learning_rate": 2.4911996701850083e-06, |
|
"loss": 0.3951, |
|
"mean_token_accuracy": 0.8693212476026023, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 7.927857935627081, |
|
"grad_norm": 0.07114846005078426, |
|
"learning_rate": 2.4656410367233928e-06, |
|
"loss": 0.4032, |
|
"mean_token_accuracy": 0.8667585155126627, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 7.938956714761376, |
|
"grad_norm": 0.07063055606195018, |
|
"learning_rate": 2.4401957412425213e-06, |
|
"loss": 0.3954, |
|
"mean_token_accuracy": 0.8692018437360971, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 7.950055493895672, |
|
"grad_norm": 0.07709872630246001, |
|
"learning_rate": 2.4148641665113116e-06, |
|
"loss": 0.405, |
|
"mean_token_accuracy": 0.866025568545432, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 7.961154273029967, |
|
"grad_norm": 0.07479654482901768, |
|
"learning_rate": 2.3896466935879957e-06, |
|
"loss": 0.3971, |
|
"mean_token_accuracy": 0.868972963732916, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 7.972253052164262, |
|
"grad_norm": 0.07459975485093782, |
|
"learning_rate": 2.364543701814398e-06, |
|
"loss": 0.3963, |
|
"mean_token_accuracy": 0.8689697104778606, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 7.983351831298557, |
|
"grad_norm": 0.07491883674276509, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.4176, |
|
"mean_token_accuracy": 0.8613320490404834, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.983351831298557, |
|
"eval_loss": 0.8959746956825256, |
|
"eval_mean_token_accuracy": 0.7616840841270472, |
|
"eval_runtime": 2.5049, |
|
"eval_samples_per_second": 51.499, |
|
"eval_steps_per_second": 4.391, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.994450610432852, |
|
"grad_norm": 0.0705701747454165, |
|
"learning_rate": 2.3146826704673696e-06, |
|
"loss": 0.3948, |
|
"mean_token_accuracy": 0.8686890434955494, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 8.004439511653718, |
|
"grad_norm": 0.07856101400524802, |
|
"learning_rate": 2.2899253809442944e-06, |
|
"loss": 0.4023, |
|
"mean_token_accuracy": 0.8675475603916234, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 8.015538290788013, |
|
"grad_norm": 0.07096704231396721, |
|
"learning_rate": 2.265284072660362e-06, |
|
"loss": 0.3536, |
|
"mean_token_accuracy": 0.8834662318113198, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 8.026637069922309, |
|
"grad_norm": 0.07811200288429487, |
|
"learning_rate": 2.2407591162902576e-06, |
|
"loss": 0.3783, |
|
"mean_token_accuracy": 0.8743016220721802, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 8.037735849056604, |
|
"grad_norm": 0.06847870304456238, |
|
"learning_rate": 2.2163508807584e-06, |
|
"loss": 0.3567, |
|
"mean_token_accuracy": 0.8822303443810826, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 8.0488346281909, |
|
"grad_norm": 0.07237316029649163, |
|
"learning_rate": 2.192059733233408e-06, |
|
"loss": 0.3642, |
|
"mean_token_accuracy": 0.8798146287904443, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 8.059933407325195, |
|
"grad_norm": 0.07326537582767348, |
|
"learning_rate": 2.1678860391225588e-06, |
|
"loss": 0.3748, |
|
"mean_token_accuracy": 0.8763103580641246, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 8.07103218645949, |
|
"grad_norm": 0.0716311973227314, |
|
"learning_rate": 2.1438301620662994e-06, |
|
"loss": 0.381, |
|
"mean_token_accuracy": 0.8740629045020978, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 8.082130965593784, |
|
"grad_norm": 0.07156015129797201, |
|
"learning_rate": 2.119892463932781e-06, |
|
"loss": 0.3602, |
|
"mean_token_accuracy": 0.8808393768354014, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 8.09322974472808, |
|
"grad_norm": 0.0729212945492825, |
|
"learning_rate": 2.0960733048124082e-06, |
|
"loss": 0.392, |
|
"mean_token_accuracy": 0.8698077920762731, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 8.104328523862375, |
|
"grad_norm": 0.07233081181998524, |
|
"learning_rate": 2.072373043012422e-06, |
|
"loss": 0.3788, |
|
"mean_token_accuracy": 0.874932114508718, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 8.11542730299667, |
|
"grad_norm": 0.07255689066428407, |
|
"learning_rate": 2.048792035051521e-06, |
|
"loss": 0.3864, |
|
"mean_token_accuracy": 0.8725265939389528, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 8.126526082130965, |
|
"grad_norm": 0.06796721021247674, |
|
"learning_rate": 2.0253306356544843e-06, |
|
"loss": 0.3534, |
|
"mean_token_accuracy": 0.8830378557607703, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 8.13762486126526, |
|
"grad_norm": 0.07079763475731628, |
|
"learning_rate": 2.001989197746841e-06, |
|
"loss": 0.3621, |
|
"mean_token_accuracy": 0.8800683843848924, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 8.148723640399556, |
|
"grad_norm": 0.06934827260274774, |
|
"learning_rate": 1.9787680724495617e-06, |
|
"loss": 0.3633, |
|
"mean_token_accuracy": 0.8795690658099342, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 8.159822419533851, |
|
"grad_norm": 0.07149671546021895, |
|
"learning_rate": 1.9556676090737803e-06, |
|
"loss": 0.376, |
|
"mean_token_accuracy": 0.8752081791023549, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 8.170921198668147, |
|
"grad_norm": 0.07306552731180865, |
|
"learning_rate": 1.9326881551155307e-06, |
|
"loss": 0.375, |
|
"mean_token_accuracy": 0.8762224073973204, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 8.182019977802442, |
|
"grad_norm": 0.07245624271929346, |
|
"learning_rate": 1.9098300562505266e-06, |
|
"loss": 0.3628, |
|
"mean_token_accuracy": 0.8800849389151534, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 8.193118756936737, |
|
"grad_norm": 0.0717221526393695, |
|
"learning_rate": 1.8870936563289598e-06, |
|
"loss": 0.3736, |
|
"mean_token_accuracy": 0.8766231182349811, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 8.204217536071033, |
|
"grad_norm": 0.07427725350287043, |
|
"learning_rate": 1.8644792973703252e-06, |
|
"loss": 0.3865, |
|
"mean_token_accuracy": 0.8728949638922477, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 8.204217536071033, |
|
"eval_loss": 0.9425944089889526, |
|
"eval_mean_token_accuracy": 0.7587572912638287, |
|
"eval_runtime": 2.513, |
|
"eval_samples_per_second": 51.333, |
|
"eval_steps_per_second": 4.377, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 8.215316315205328, |
|
"grad_norm": 0.07078445884735189, |
|
"learning_rate": 1.8419873195582815e-06, |
|
"loss": 0.3667, |
|
"mean_token_accuracy": 0.8785569874719916, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 8.226415094339623, |
|
"grad_norm": 0.0714911223340155, |
|
"learning_rate": 1.8196180612355252e-06, |
|
"loss": 0.3633, |
|
"mean_token_accuracy": 0.8797422666007302, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 8.237513873473917, |
|
"grad_norm": 0.07175228593929144, |
|
"learning_rate": 1.79737185889871e-06, |
|
"loss": 0.3659, |
|
"mean_token_accuracy": 0.8794436477784764, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 8.248612652608212, |
|
"grad_norm": 0.07124623704997705, |
|
"learning_rate": 1.7752490471933769e-06, |
|
"loss": 0.3801, |
|
"mean_token_accuracy": 0.874297933476225, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 8.259711431742508, |
|
"grad_norm": 0.07340107106058204, |
|
"learning_rate": 1.7532499589089324e-06, |
|
"loss": 0.364, |
|
"mean_token_accuracy": 0.8794662910609281, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 8.270810210876803, |
|
"grad_norm": 0.06990896856577071, |
|
"learning_rate": 1.7313749249736266e-06, |
|
"loss": 0.3792, |
|
"mean_token_accuracy": 0.8745357270848668, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 8.281908990011098, |
|
"grad_norm": 0.0713247064610054, |
|
"learning_rate": 1.709624274449584e-06, |
|
"loss": 0.3711, |
|
"mean_token_accuracy": 0.8773109810868555, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 8.293007769145394, |
|
"grad_norm": 0.07760277266498634, |
|
"learning_rate": 1.6879983345278528e-06, |
|
"loss": 0.3785, |
|
"mean_token_accuracy": 0.8745675120863103, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 8.30410654827969, |
|
"grad_norm": 0.07306991998908845, |
|
"learning_rate": 1.6664974305234848e-06, |
|
"loss": 0.3768, |
|
"mean_token_accuracy": 0.8757597027758562, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 8.315205327413985, |
|
"grad_norm": 0.07485329674104144, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.3694, |
|
"mean_token_accuracy": 0.8776273219353218, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 8.32630410654828, |
|
"grad_norm": 0.07294537461380335, |
|
"learning_rate": 1.6238720221177062e-06, |
|
"loss": 0.3841, |
|
"mean_token_accuracy": 0.873170659995959, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 8.337402885682575, |
|
"grad_norm": 0.07530630160471269, |
|
"learning_rate": 1.6027481589225024e-06, |
|
"loss": 0.3685, |
|
"mean_token_accuracy": 0.8781178863003885, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 8.34850166481687, |
|
"grad_norm": 0.07717414395687935, |
|
"learning_rate": 1.5817506140474248e-06, |
|
"loss": 0.3869, |
|
"mean_token_accuracy": 0.871778824694412, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 8.359600443951166, |
|
"grad_norm": 0.072807969887607, |
|
"learning_rate": 1.560879703354693e-06, |
|
"loss": 0.3652, |
|
"mean_token_accuracy": 0.8794815169500076, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 8.370699223085461, |
|
"grad_norm": 0.07519402001309598, |
|
"learning_rate": 1.5401357408015893e-06, |
|
"loss": 0.3932, |
|
"mean_token_accuracy": 0.8699543946559007, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 8.381798002219757, |
|
"grad_norm": 0.07230015905299639, |
|
"learning_rate": 1.5195190384357405e-06, |
|
"loss": 0.3799, |
|
"mean_token_accuracy": 0.874275331240719, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 8.39289678135405, |
|
"grad_norm": 0.07234384636097141, |
|
"learning_rate": 1.4990299063904202e-06, |
|
"loss": 0.3608, |
|
"mean_token_accuracy": 0.8807774420928469, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 8.403995560488346, |
|
"grad_norm": 0.07523663390266445, |
|
"learning_rate": 1.4786686528798878e-06, |
|
"loss": 0.38, |
|
"mean_token_accuracy": 0.8738187855023073, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 8.415094339622641, |
|
"grad_norm": 0.07140911792883375, |
|
"learning_rate": 1.4584355841947452e-06, |
|
"loss": 0.3698, |
|
"mean_token_accuracy": 0.8777678442461072, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 8.426193118756936, |
|
"grad_norm": 0.07260987883106731, |
|
"learning_rate": 1.4383310046973365e-06, |
|
"loss": 0.3862, |
|
"mean_token_accuracy": 0.8725054701888313, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 8.426193118756936, |
|
"eval_loss": 0.9420565962791443, |
|
"eval_mean_token_accuracy": 0.7586397433634224, |
|
"eval_runtime": 2.5095, |
|
"eval_samples_per_second": 51.405, |
|
"eval_steps_per_second": 4.383, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 8.437291897891232, |
|
"grad_norm": 0.07262092270909268, |
|
"learning_rate": 1.4183552168171655e-06, |
|
"loss": 0.3825, |
|
"mean_token_accuracy": 0.8734111086359343, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 8.448390677025527, |
|
"grad_norm": 0.07299203132420759, |
|
"learning_rate": 1.3985085210463479e-06, |
|
"loss": 0.3721, |
|
"mean_token_accuracy": 0.8770900982908305, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 8.459489456159822, |
|
"grad_norm": 0.06995744118261267, |
|
"learning_rate": 1.3787912159350903e-06, |
|
"loss": 0.3711, |
|
"mean_token_accuracy": 0.8772603519720799, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 8.470588235294118, |
|
"grad_norm": 0.07314470110332796, |
|
"learning_rate": 1.3592035980871954e-06, |
|
"loss": 0.3664, |
|
"mean_token_accuracy": 0.8791009918268561, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 8.481687014428413, |
|
"grad_norm": 0.07223228849997157, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 0.3509, |
|
"mean_token_accuracy": 0.8839586558883811, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 8.492785793562708, |
|
"grad_norm": 0.069403514773057, |
|
"learning_rate": 1.3204186008379926e-06, |
|
"loss": 0.3706, |
|
"mean_token_accuracy": 0.8774691970570467, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 8.503884572697004, |
|
"grad_norm": 0.07233134301819387, |
|
"learning_rate": 1.3012218048722858e-06, |
|
"loss": 0.3716, |
|
"mean_token_accuracy": 0.8770607873907654, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 8.5149833518313, |
|
"grad_norm": 0.07609229799981544, |
|
"learning_rate": 1.282155863032377e-06, |
|
"loss": 0.3806, |
|
"mean_token_accuracy": 0.8743671333344023, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 8.526082130965595, |
|
"grad_norm": 0.07475884278608393, |
|
"learning_rate": 1.2632210621237329e-06, |
|
"loss": 0.3785, |
|
"mean_token_accuracy": 0.8742028469705245, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 8.537180910099888, |
|
"grad_norm": 0.07033717924572155, |
|
"learning_rate": 1.2444176869790925e-06, |
|
"loss": 0.3695, |
|
"mean_token_accuracy": 0.8776175277300983, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 8.548279689234183, |
|
"grad_norm": 0.07239360928547944, |
|
"learning_rate": 1.2257460204541793e-06, |
|
"loss": 0.3874, |
|
"mean_token_accuracy": 0.8717868736796097, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 8.559378468368479, |
|
"grad_norm": 0.07174016953600017, |
|
"learning_rate": 1.207206343423456e-06, |
|
"loss": 0.384, |
|
"mean_token_accuracy": 0.8728361979192087, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 8.570477247502774, |
|
"grad_norm": 0.07129464090092877, |
|
"learning_rate": 1.188798934775881e-06, |
|
"loss": 0.3655, |
|
"mean_token_accuracy": 0.879167498582583, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 8.58157602663707, |
|
"grad_norm": 0.07073053510147226, |
|
"learning_rate": 1.1705240714107301e-06, |
|
"loss": 0.3825, |
|
"mean_token_accuracy": 0.8730660416822736, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 8.592674805771365, |
|
"grad_norm": 0.07250361995881191, |
|
"learning_rate": 1.152382028233422e-06, |
|
"loss": 0.3829, |
|
"mean_token_accuracy": 0.8733264398074165, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 8.60377358490566, |
|
"grad_norm": 0.071804460982185, |
|
"learning_rate": 1.1343730781513896e-06, |
|
"loss": 0.3887, |
|
"mean_token_accuracy": 0.8710065827782969, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 8.614872364039956, |
|
"grad_norm": 0.07472671484346735, |
|
"learning_rate": 1.1164974920699611e-06, |
|
"loss": 0.3782, |
|
"mean_token_accuracy": 0.8746941554136797, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 8.625971143174251, |
|
"grad_norm": 0.07311763408887324, |
|
"learning_rate": 1.0987555388883042e-06, |
|
"loss": 0.3773, |
|
"mean_token_accuracy": 0.8753191156569005, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 8.637069922308546, |
|
"grad_norm": 0.0718876150770474, |
|
"learning_rate": 1.0811474854953708e-06, |
|
"loss": 0.3744, |
|
"mean_token_accuracy": 0.8761765009973879, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 8.648168701442842, |
|
"grad_norm": 0.0723882381184508, |
|
"learning_rate": 1.0636735967658785e-06, |
|
"loss": 0.3602, |
|
"mean_token_accuracy": 0.8808158833620467, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 8.648168701442842, |
|
"eval_loss": 0.9418078064918518, |
|
"eval_mean_token_accuracy": 0.7587906580527934, |
|
"eval_runtime": 2.5083, |
|
"eval_samples_per_second": 51.429, |
|
"eval_steps_per_second": 4.385, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 8.659267480577137, |
|
"grad_norm": 0.0704714071422713, |
|
"learning_rate": 1.0463341355563318e-06, |
|
"loss": 0.3748, |
|
"mean_token_accuracy": 0.8759210630008264, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 8.670366259711432, |
|
"grad_norm": 0.0723912490097062, |
|
"learning_rate": 1.0291293627010678e-06, |
|
"loss": 0.3748, |
|
"mean_token_accuracy": 0.8756913392135388, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 8.681465038845728, |
|
"grad_norm": 0.07363633481469684, |
|
"learning_rate": 1.012059537008332e-06, |
|
"loss": 0.3729, |
|
"mean_token_accuracy": 0.8765285287087201, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 8.692563817980023, |
|
"grad_norm": 0.07295013091365997, |
|
"learning_rate": 9.95124915256378e-07, |
|
"loss": 0.3735, |
|
"mean_token_accuracy": 0.8767417582495198, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 8.703662597114317, |
|
"grad_norm": 0.07168690933716926, |
|
"learning_rate": 9.783257521896228e-07, |
|
"loss": 0.3649, |
|
"mean_token_accuracy": 0.8790620620466859, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 8.714761376248612, |
|
"grad_norm": 0.07210660521074372, |
|
"learning_rate": 9.616623005147952e-07, |
|
"loss": 0.392, |
|
"mean_token_accuracy": 0.8699732825725949, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 8.725860155382907, |
|
"grad_norm": 0.07265661223655243, |
|
"learning_rate": 9.451348108971425e-07, |
|
"loss": 0.3923, |
|
"mean_token_accuracy": 0.8697471915224153, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 8.736958934517203, |
|
"grad_norm": 0.07360746515128512, |
|
"learning_rate": 9.287435319566618e-07, |
|
"loss": 0.3894, |
|
"mean_token_accuracy": 0.871409389950205, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 8.748057713651498, |
|
"grad_norm": 0.07031615395240165, |
|
"learning_rate": 9.124887102643576e-07, |
|
"loss": 0.3652, |
|
"mean_token_accuracy": 0.8781901781375167, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 8.759156492785793, |
|
"grad_norm": 0.07463303052325941, |
|
"learning_rate": 8.963705903385344e-07, |
|
"loss": 0.3797, |
|
"mean_token_accuracy": 0.8744689688569356, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 8.770255271920089, |
|
"grad_norm": 0.07076393949841339, |
|
"learning_rate": 8.803894146411118e-07, |
|
"loss": 0.3843, |
|
"mean_token_accuracy": 0.8729947995264962, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 8.781354051054384, |
|
"grad_norm": 0.0763471702517904, |
|
"learning_rate": 8.645454235739903e-07, |
|
"loss": 0.354, |
|
"mean_token_accuracy": 0.8833278525225567, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 8.79245283018868, |
|
"grad_norm": 0.07382769897849192, |
|
"learning_rate": 8.488388554754223e-07, |
|
"loss": 0.3972, |
|
"mean_token_accuracy": 0.8683902843534799, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 8.803551609322975, |
|
"grad_norm": 0.07341021239100626, |
|
"learning_rate": 8.332699466164307e-07, |
|
"loss": 0.3724, |
|
"mean_token_accuracy": 0.8769335032073728, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 8.81465038845727, |
|
"grad_norm": 0.07084973072946958, |
|
"learning_rate": 8.178389311972612e-07, |
|
"loss": 0.3739, |
|
"mean_token_accuracy": 0.8766542744127456, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 8.825749167591566, |
|
"grad_norm": 0.0748288898335603, |
|
"learning_rate": 8.025460413438457e-07, |
|
"loss": 0.3753, |
|
"mean_token_accuracy": 0.876294181154862, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 8.836847946725861, |
|
"grad_norm": 0.07189696727103217, |
|
"learning_rate": 7.873915071043248e-07, |
|
"loss": 0.3832, |
|
"mean_token_accuracy": 0.8733729707285276, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 8.847946725860155, |
|
"grad_norm": 0.07359083102051164, |
|
"learning_rate": 7.723755564455771e-07, |
|
"loss": 0.3698, |
|
"mean_token_accuracy": 0.8773191235829898, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 8.85904550499445, |
|
"grad_norm": 0.07127519773440091, |
|
"learning_rate": 7.574984152497988e-07, |
|
"loss": 0.3702, |
|
"mean_token_accuracy": 0.8777848908337162, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 8.870144284128745, |
|
"grad_norm": 0.07660045354772042, |
|
"learning_rate": 7.427603073110967e-07, |
|
"loss": 0.3687, |
|
"mean_token_accuracy": 0.8779125901278991, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.870144284128745, |
|
"eval_loss": 0.9427998661994934, |
|
"eval_mean_token_accuracy": 0.7590286003467458, |
|
"eval_runtime": 2.5139, |
|
"eval_samples_per_second": 51.316, |
|
"eval_steps_per_second": 4.376, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.88124306326304, |
|
"grad_norm": 0.07081398095943368, |
|
"learning_rate": 7.281614543321269e-07, |
|
"loss": 0.3722, |
|
"mean_token_accuracy": 0.8771854375405537, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 8.892341842397336, |
|
"grad_norm": 0.07087618891479863, |
|
"learning_rate": 7.13702075920758e-07, |
|
"loss": 0.3682, |
|
"mean_token_accuracy": 0.8779584769791494, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 8.903440621531631, |
|
"grad_norm": 0.0717888807240299, |
|
"learning_rate": 6.99382389586769e-07, |
|
"loss": 0.3717, |
|
"mean_token_accuracy": 0.8771199704414011, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 8.914539400665927, |
|
"grad_norm": 0.07423804991496352, |
|
"learning_rate": 6.852026107385756e-07, |
|
"loss": 0.378, |
|
"mean_token_accuracy": 0.8751318821537319, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 8.925638179800222, |
|
"grad_norm": 0.07693603325948843, |
|
"learning_rate": 6.711629526799946e-07, |
|
"loss": 0.37, |
|
"mean_token_accuracy": 0.877517280827164, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 8.936736958934517, |
|
"grad_norm": 0.0711413133732257, |
|
"learning_rate": 6.572636266070265e-07, |
|
"loss": 0.3747, |
|
"mean_token_accuracy": 0.8761638578721318, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 8.947835738068813, |
|
"grad_norm": 0.07342329249702224, |
|
"learning_rate": 6.435048416046863e-07, |
|
"loss": 0.3743, |
|
"mean_token_accuracy": 0.8757618608773102, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 8.958934517203108, |
|
"grad_norm": 0.07445700415992246, |
|
"learning_rate": 6.298868046438533e-07, |
|
"loss": 0.3817, |
|
"mean_token_accuracy": 0.8735115567669782, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 8.970033296337403, |
|
"grad_norm": 0.07393330425473023, |
|
"learning_rate": 6.164097205781616e-07, |
|
"loss": 0.3776, |
|
"mean_token_accuracy": 0.8752504462904687, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 8.981132075471699, |
|
"grad_norm": 0.07192510715946791, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.3742, |
|
"mean_token_accuracy": 0.8764858951863983, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 8.992230854605994, |
|
"grad_norm": 0.07378669584037137, |
|
"learning_rate": 5.898792199420445e-07, |
|
"loss": 0.3814, |
|
"mean_token_accuracy": 0.8738473459261946, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 9.002219755826859, |
|
"grad_norm": 0.11321983827426473, |
|
"learning_rate": 5.768262024650773e-07, |
|
"loss": 0.3831, |
|
"mean_token_accuracy": 0.8741038114540909, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 9.013318534961154, |
|
"grad_norm": 0.06955479623267266, |
|
"learning_rate": 5.63914936064165e-07, |
|
"loss": 0.3751, |
|
"mean_token_accuracy": 0.876432982412313, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 9.02441731409545, |
|
"grad_norm": 0.07002975386890714, |
|
"learning_rate": 5.511456149611194e-07, |
|
"loss": 0.368, |
|
"mean_token_accuracy": 0.8784786449055064, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 9.035516093229745, |
|
"grad_norm": 0.07426559705057928, |
|
"learning_rate": 5.385184312424973e-07, |
|
"loss": 0.3493, |
|
"mean_token_accuracy": 0.8844570839583301, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 9.04661487236404, |
|
"grad_norm": 0.07328109369066403, |
|
"learning_rate": 5.26033574856708e-07, |
|
"loss": 0.3516, |
|
"mean_token_accuracy": 0.8844177074732104, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 9.057713651498336, |
|
"grad_norm": 0.07007036388790919, |
|
"learning_rate": 5.136912336111599e-07, |
|
"loss": 0.365, |
|
"mean_token_accuracy": 0.8789461900681786, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 9.068812430632631, |
|
"grad_norm": 0.06902457824440712, |
|
"learning_rate": 5.014915931694253e-07, |
|
"loss": 0.3557, |
|
"mean_token_accuracy": 0.8822892955015667, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 9.079911209766925, |
|
"grad_norm": 0.07023725613632394, |
|
"learning_rate": 4.894348370484648e-07, |
|
"loss": 0.3668, |
|
"mean_token_accuracy": 0.8788434835781784, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 9.09100998890122, |
|
"grad_norm": 0.06843363288085369, |
|
"learning_rate": 4.775211466158469e-07, |
|
"loss": 0.3572, |
|
"mean_token_accuracy": 0.8819544770759362, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 9.09100998890122, |
|
"eval_loss": 0.9615139961242676, |
|
"eval_mean_token_accuracy": 0.7577368010204709, |
|
"eval_runtime": 2.5049, |
|
"eval_samples_per_second": 51.499, |
|
"eval_steps_per_second": 4.391, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 9.102108768035515, |
|
"grad_norm": 0.07099833327070085, |
|
"learning_rate": 4.6575070108703433e-07, |
|
"loss": 0.3569, |
|
"mean_token_accuracy": 0.8820014538736366, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 9.11320754716981, |
|
"grad_norm": 0.07199267346067889, |
|
"learning_rate": 4.5412367752268094e-07, |
|
"loss": 0.3605, |
|
"mean_token_accuracy": 0.8809590109620006, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 9.124306326304106, |
|
"grad_norm": 0.07090954996655408, |
|
"learning_rate": 4.4264025082597084e-07, |
|
"loss": 0.368, |
|
"mean_token_accuracy": 0.8785202173573573, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 9.135405105438402, |
|
"grad_norm": 0.07176215249132896, |
|
"learning_rate": 4.313005937399861e-07, |
|
"loss": 0.3532, |
|
"mean_token_accuracy": 0.8833904371817389, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 9.146503884572697, |
|
"grad_norm": 0.07295746294927272, |
|
"learning_rate": 4.2010487684511105e-07, |
|
"loss": 0.3608, |
|
"mean_token_accuracy": 0.8812105612763963, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 9.157602663706992, |
|
"grad_norm": 0.07080889187808312, |
|
"learning_rate": 4.0905326855646186e-07, |
|
"loss": 0.3479, |
|
"mean_token_accuracy": 0.8851608707330438, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 9.168701442841288, |
|
"grad_norm": 0.07276807335314621, |
|
"learning_rate": 3.981459351213568e-07, |
|
"loss": 0.3777, |
|
"mean_token_accuracy": 0.875140887125394, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 9.179800221975583, |
|
"grad_norm": 0.07224696890024185, |
|
"learning_rate": 3.8738304061681107e-07, |
|
"loss": 0.3708, |
|
"mean_token_accuracy": 0.8769411028535119, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 9.190899001109878, |
|
"grad_norm": 0.07110354177350459, |
|
"learning_rate": 3.7676474694707697e-07, |
|
"loss": 0.3623, |
|
"mean_token_accuracy": 0.8798909253279312, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 9.201997780244174, |
|
"grad_norm": 0.07248736421213088, |
|
"learning_rate": 3.662912138411967e-07, |
|
"loss": 0.3626, |
|
"mean_token_accuracy": 0.8803352453391436, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 9.213096559378469, |
|
"grad_norm": 0.07027410169677895, |
|
"learning_rate": 3.55962598850611e-07, |
|
"loss": 0.3562, |
|
"mean_token_accuracy": 0.8819391769764504, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 9.224195338512764, |
|
"grad_norm": 0.07317371460252099, |
|
"learning_rate": 3.457790573467812e-07, |
|
"loss": 0.3556, |
|
"mean_token_accuracy": 0.8822487731331142, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 9.235294117647058, |
|
"grad_norm": 0.07476687816670186, |
|
"learning_rate": 3.357407425188541e-07, |
|
"loss": 0.3668, |
|
"mean_token_accuracy": 0.8782191688237292, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 9.246392896781353, |
|
"grad_norm": 0.07118871381012193, |
|
"learning_rate": 3.2584780537136206e-07, |
|
"loss": 0.365, |
|
"mean_token_accuracy": 0.8795225550635518, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 9.257491675915649, |
|
"grad_norm": 0.07290781684699486, |
|
"learning_rate": 3.161003947219421e-07, |
|
"loss": 0.3565, |
|
"mean_token_accuracy": 0.8823807741440511, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 9.268590455049944, |
|
"grad_norm": 0.07402119674237229, |
|
"learning_rate": 3.06498657199108e-07, |
|
"loss": 0.3588, |
|
"mean_token_accuracy": 0.881556779460829, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 9.27968923418424, |
|
"grad_norm": 0.07281681233758444, |
|
"learning_rate": 2.970427372400353e-07, |
|
"loss": 0.3707, |
|
"mean_token_accuracy": 0.8775587538856486, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 9.290788013318535, |
|
"grad_norm": 0.07176258029954591, |
|
"learning_rate": 2.877327770883964e-07, |
|
"loss": 0.3556, |
|
"mean_token_accuracy": 0.8826148656597541, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 9.30188679245283, |
|
"grad_norm": 0.07116410644119966, |
|
"learning_rate": 2.7856891679221565e-07, |
|
"loss": 0.353, |
|
"mean_token_accuracy": 0.8834335910667945, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 9.312985571587125, |
|
"grad_norm": 0.07142192428482035, |
|
"learning_rate": 2.6955129420176193e-07, |
|
"loss": 0.353, |
|
"mean_token_accuracy": 0.8831818252941022, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 9.312985571587125, |
|
"eval_loss": 0.9637966752052307, |
|
"eval_mean_token_accuracy": 0.7577037486164723, |
|
"eval_runtime": 2.5041, |
|
"eval_samples_per_second": 51.515, |
|
"eval_steps_per_second": 4.393, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 9.32408435072142, |
|
"grad_norm": 0.06866435467347672, |
|
"learning_rate": 2.606800449674796e-07, |
|
"loss": 0.355, |
|
"mean_token_accuracy": 0.8828792506438695, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 9.335183129855716, |
|
"grad_norm": 0.07266126009595562, |
|
"learning_rate": 2.51955302537944e-07, |
|
"loss": 0.3682, |
|
"mean_token_accuracy": 0.8779147885127332, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 9.346281908990012, |
|
"grad_norm": 0.07054897167424838, |
|
"learning_rate": 2.433771981578581e-07, |
|
"loss": 0.368, |
|
"mean_token_accuracy": 0.8782042065862056, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 9.357380688124307, |
|
"grad_norm": 0.07285199787679796, |
|
"learning_rate": 2.349458608660704e-07, |
|
"loss": 0.367, |
|
"mean_token_accuracy": 0.878290004869226, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 9.368479467258602, |
|
"grad_norm": 0.07059019804792459, |
|
"learning_rate": 2.2666141749364434e-07, |
|
"loss": 0.3639, |
|
"mean_token_accuracy": 0.8798083702482842, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 9.379578246392898, |
|
"grad_norm": 0.07188436340923393, |
|
"learning_rate": 2.1852399266194312e-07, |
|
"loss": 0.3651, |
|
"mean_token_accuracy": 0.8792633871506889, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 9.390677025527191, |
|
"grad_norm": 0.07006294910861763, |
|
"learning_rate": 2.1053370878075685e-07, |
|
"loss": 0.3717, |
|
"mean_token_accuracy": 0.8777365849907172, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 9.401775804661487, |
|
"grad_norm": 0.07329004738303971, |
|
"learning_rate": 2.0269068604646058e-07, |
|
"loss": 0.3783, |
|
"mean_token_accuracy": 0.8749171892288341, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 9.412874583795782, |
|
"grad_norm": 0.06897691821001979, |
|
"learning_rate": 1.9499504244020694e-07, |
|
"loss": 0.3657, |
|
"mean_token_accuracy": 0.8788341299355134, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 9.423973362930077, |
|
"grad_norm": 0.07254613553732685, |
|
"learning_rate": 1.874468937261531e-07, |
|
"loss": 0.3595, |
|
"mean_token_accuracy": 0.8813002625000752, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 9.435072142064373, |
|
"grad_norm": 0.06889561787998928, |
|
"learning_rate": 1.8004635344971656e-07, |
|
"loss": 0.3661, |
|
"mean_token_accuracy": 0.8790216325364879, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 9.446170921198668, |
|
"grad_norm": 0.0730126004749205, |
|
"learning_rate": 1.7279353293586765e-07, |
|
"loss": 0.3679, |
|
"mean_token_accuracy": 0.878868187083347, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 9.457269700332963, |
|
"grad_norm": 0.072922212674908, |
|
"learning_rate": 1.6568854128745537e-07, |
|
"loss": 0.3733, |
|
"mean_token_accuracy": 0.8763594368603579, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 9.468368479467259, |
|
"grad_norm": 0.07046125285873713, |
|
"learning_rate": 1.5873148538356752e-07, |
|
"loss": 0.3657, |
|
"mean_token_accuracy": 0.8794734514599737, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 9.479467258601554, |
|
"grad_norm": 0.07257942568751115, |
|
"learning_rate": 1.519224698779198e-07, |
|
"loss": 0.3667, |
|
"mean_token_accuracy": 0.8792038821818163, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 9.49056603773585, |
|
"grad_norm": 0.07002558706899382, |
|
"learning_rate": 1.4526159719728595e-07, |
|
"loss": 0.3502, |
|
"mean_token_accuracy": 0.8844099641821337, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 9.501664816870145, |
|
"grad_norm": 0.07126622603987796, |
|
"learning_rate": 1.3874896753995005e-07, |
|
"loss": 0.3664, |
|
"mean_token_accuracy": 0.8787627277974227, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 9.51276359600444, |
|
"grad_norm": 0.0709690326726039, |
|
"learning_rate": 1.323846788742078e-07, |
|
"loss": 0.3685, |
|
"mean_token_accuracy": 0.8786089736859786, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 9.523862375138735, |
|
"grad_norm": 0.07128432924368934, |
|
"learning_rate": 1.261688269368877e-07, |
|
"loss": 0.3598, |
|
"mean_token_accuracy": 0.8812220314344532, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 9.53496115427303, |
|
"grad_norm": 0.06985707014893448, |
|
"learning_rate": 1.201015052319099e-07, |
|
"loss": 0.3584, |
|
"mean_token_accuracy": 0.8816790571082956, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 9.53496115427303, |
|
"eval_loss": 0.9642364978790283, |
|
"eval_mean_token_accuracy": 0.757674836302265, |
|
"eval_runtime": 2.5035, |
|
"eval_samples_per_second": 51.528, |
|
"eval_steps_per_second": 4.394, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 9.546059933407324, |
|
"grad_norm": 0.07087098387168629, |
|
"learning_rate": 1.1418280502888401e-07, |
|
"loss": 0.3655, |
|
"mean_token_accuracy": 0.8790343017131386, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 9.55715871254162, |
|
"grad_norm": 0.07337993821243646, |
|
"learning_rate": 1.084128153617292e-07, |
|
"loss": 0.3737, |
|
"mean_token_accuracy": 0.8764109415276697, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 9.568257491675915, |
|
"grad_norm": 0.07046222496074872, |
|
"learning_rate": 1.0279162302734624e-07, |
|
"loss": 0.3512, |
|
"mean_token_accuracy": 0.8840578313854823, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 9.57935627081021, |
|
"grad_norm": 0.07091688704810993, |
|
"learning_rate": 9.731931258429638e-08, |
|
"loss": 0.3665, |
|
"mean_token_accuracy": 0.878253043199692, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 9.590455049944506, |
|
"grad_norm": 0.0711174537499222, |
|
"learning_rate": 9.199596635154684e-08, |
|
"loss": 0.3496, |
|
"mean_token_accuracy": 0.8851259983663382, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 9.601553829078801, |
|
"grad_norm": 0.07024701155741234, |
|
"learning_rate": 8.682166440721729e-08, |
|
"loss": 0.3553, |
|
"mean_token_accuracy": 0.8824724160982246, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 9.612652608213097, |
|
"grad_norm": 0.07121538410184143, |
|
"learning_rate": 8.179648458738309e-08, |
|
"loss": 0.3728, |
|
"mean_token_accuracy": 0.8766866038661515, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 9.623751387347392, |
|
"grad_norm": 0.07201171778030711, |
|
"learning_rate": 7.692050248490291e-08, |
|
"loss": 0.378, |
|
"mean_token_accuracy": 0.8749676272479154, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 9.634850166481687, |
|
"grad_norm": 0.07230796757518289, |
|
"learning_rate": 7.219379144828287e-08, |
|
"loss": 0.3513, |
|
"mean_token_accuracy": 0.8840763261643472, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 9.645948945615983, |
|
"grad_norm": 0.07273499475598692, |
|
"learning_rate": 6.761642258056977e-08, |
|
"loss": 0.362, |
|
"mean_token_accuracy": 0.8804394627971558, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 9.657047724750278, |
|
"grad_norm": 0.0719345384206557, |
|
"learning_rate": 6.318846473828522e-08, |
|
"loss": 0.3723, |
|
"mean_token_accuracy": 0.8766675696523365, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 9.668146503884573, |
|
"grad_norm": 0.07291104365357333, |
|
"learning_rate": 5.890998453038643e-08, |
|
"loss": 0.3535, |
|
"mean_token_accuracy": 0.8831918369682384, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 9.679245283018869, |
|
"grad_norm": 0.0710777472196142, |
|
"learning_rate": 5.4781046317267103e-08, |
|
"loss": 0.3594, |
|
"mean_token_accuracy": 0.8811254334090007, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 9.690344062153162, |
|
"grad_norm": 0.06910781062183935, |
|
"learning_rate": 5.080171220978813e-08, |
|
"loss": 0.3724, |
|
"mean_token_accuracy": 0.877210756344294, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 9.701442841287458, |
|
"grad_norm": 0.0703723425128348, |
|
"learning_rate": 4.6972042068341714e-08, |
|
"loss": 0.3597, |
|
"mean_token_accuracy": 0.8807625164330958, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 9.712541620421753, |
|
"grad_norm": 0.07312653321865854, |
|
"learning_rate": 4.329209350195651e-08, |
|
"loss": 0.3604, |
|
"mean_token_accuracy": 0.881274793861834, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 9.723640399556048, |
|
"grad_norm": 0.07219178723430214, |
|
"learning_rate": 3.976192186742167e-08, |
|
"loss": 0.3785, |
|
"mean_token_accuracy": 0.8746446415354814, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 9.734739178690344, |
|
"grad_norm": 0.06955222309447355, |
|
"learning_rate": 3.6381580268463056e-08, |
|
"loss": 0.3462, |
|
"mean_token_accuracy": 0.8857586073047825, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 9.745837957824639, |
|
"grad_norm": 0.07234203443294888, |
|
"learning_rate": 3.315111955493944e-08, |
|
"loss": 0.3575, |
|
"mean_token_accuracy": 0.8819603747530635, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 9.756936736958934, |
|
"grad_norm": 0.07240545514975298, |
|
"learning_rate": 3.0070588322079765e-08, |
|
"loss": 0.3679, |
|
"mean_token_accuracy": 0.878275429419619, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 9.756936736958934, |
|
"eval_loss": 0.9646347165107727, |
|
"eval_mean_token_accuracy": 0.7575733471521328, |
|
"eval_runtime": 2.5005, |
|
"eval_samples_per_second": 51.591, |
|
"eval_steps_per_second": 4.399, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 9.76803551609323, |
|
"grad_norm": 0.07064125611324511, |
|
"learning_rate": 2.7140032909749315e-08, |
|
"loss": 0.3593, |
|
"mean_token_accuracy": 0.8812070170201969, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 9.779134295227525, |
|
"grad_norm": 0.07058249125627476, |
|
"learning_rate": 2.4359497401758026e-08, |
|
"loss": 0.3645, |
|
"mean_token_accuracy": 0.8799202650266269, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 9.79023307436182, |
|
"grad_norm": 0.07207301212981171, |
|
"learning_rate": 2.1729023625189916e-08, |
|
"loss": 0.3585, |
|
"mean_token_accuracy": 0.8818855771506551, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 9.801331853496116, |
|
"grad_norm": 0.07281322729220804, |
|
"learning_rate": 1.924865114978025e-08, |
|
"loss": 0.3645, |
|
"mean_token_accuracy": 0.8795988855911571, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 9.812430632630411, |
|
"grad_norm": 0.07133068454580137, |
|
"learning_rate": 1.6918417287318245e-08, |
|
"loss": 0.3659, |
|
"mean_token_accuracy": 0.879352471475125, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 9.823529411764707, |
|
"grad_norm": 0.07106856199855119, |
|
"learning_rate": 1.4738357091084177e-08, |
|
"loss": 0.358, |
|
"mean_token_accuracy": 0.8820182276270513, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 9.834628190899002, |
|
"grad_norm": 0.06905146093383048, |
|
"learning_rate": 1.2708503355323143e-08, |
|
"loss": 0.3545, |
|
"mean_token_accuracy": 0.8827259452975497, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 9.845726970033297, |
|
"grad_norm": 0.06976933586547189, |
|
"learning_rate": 1.0828886614754342e-08, |
|
"loss": 0.3538, |
|
"mean_token_accuracy": 0.8825828210361928, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 9.85682574916759, |
|
"grad_norm": 0.07034227993170239, |
|
"learning_rate": 9.099535144108107e-09, |
|
"loss": 0.3553, |
|
"mean_token_accuracy": 0.8825321351350404, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 9.867924528301886, |
|
"grad_norm": 0.07172489561097325, |
|
"learning_rate": 7.520474957699586e-09, |
|
"loss": 0.3619, |
|
"mean_token_accuracy": 0.8805824854864788, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 9.879023307436182, |
|
"grad_norm": 0.07089188813728774, |
|
"learning_rate": 6.091729809042379e-09, |
|
"loss": 0.3645, |
|
"mean_token_accuracy": 0.8790294765903954, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 9.890122086570477, |
|
"grad_norm": 0.07130667742928198, |
|
"learning_rate": 4.813321190488829e-09, |
|
"loss": 0.3806, |
|
"mean_token_accuracy": 0.8740528261639089, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 9.901220865704772, |
|
"grad_norm": 0.06980589275633439, |
|
"learning_rate": 3.6852683329058336e-09, |
|
"loss": 0.3561, |
|
"mean_token_accuracy": 0.8826797263283062, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 9.912319644839068, |
|
"grad_norm": 0.0725782887050822, |
|
"learning_rate": 2.7075882053828605e-09, |
|
"loss": 0.3606, |
|
"mean_token_accuracy": 0.8807255274320044, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 9.923418423973363, |
|
"grad_norm": 0.06911521073255959, |
|
"learning_rate": 1.8802955149865854e-09, |
|
"loss": 0.3554, |
|
"mean_token_accuracy": 0.8826477537420596, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 9.934517203107658, |
|
"grad_norm": 0.07297882233653312, |
|
"learning_rate": 1.203402706525525e-09, |
|
"loss": 0.3746, |
|
"mean_token_accuracy": 0.8760996963828843, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 9.945615982241954, |
|
"grad_norm": 0.07059910387480436, |
|
"learning_rate": 6.769199623779532e-10, |
|
"loss": 0.3555, |
|
"mean_token_accuracy": 0.882608104296593, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 9.956714761376249, |
|
"grad_norm": 0.07197783726350207, |
|
"learning_rate": 3.008552023242572e-10, |
|
"loss": 0.3649, |
|
"mean_token_accuracy": 0.8792620243758646, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 9.967813540510544, |
|
"grad_norm": 0.07151420803500014, |
|
"learning_rate": 7.521408343924564e-11, |
|
"loss": 0.3752, |
|
"mean_token_accuracy": 0.8761361486307025, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 9.97891231964484, |
|
"grad_norm": 0.07835980656994077, |
|
"learning_rate": 0.0, |
|
"loss": 0.3581, |
|
"mean_token_accuracy": 0.8817789333478083, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.97891231964484, |
|
"eval_loss": 0.9643924832344055, |
|
"eval_mean_token_accuracy": 0.7574227854751178, |
|
"eval_runtime": 2.5228, |
|
"eval_samples_per_second": 51.135, |
|
"eval_steps_per_second": 4.36, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.97891231964484, |
|
"step": 4500, |
|
"total_flos": 6.0804602683856e+18, |
|
"train_loss": 0.5375538207954831, |
|
"train_runtime": 15535.6447, |
|
"train_samples_per_second": 13.918, |
|
"train_steps_per_second": 0.29 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 4500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.0804602683856e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|