Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
jeff-gao's picture
Model save
cf064f3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.97891231964484,
"eval_steps": 100,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011098779134295227,
"grad_norm": 0.32656678886781504,
"learning_rate": 2.2222222222222224e-07,
"loss": 1.1062,
"mean_token_accuracy": 0.7074863796661243,
"step": 5
},
{
"epoch": 0.022197558268590455,
"grad_norm": 0.29460412354793364,
"learning_rate": 4.444444444444445e-07,
"loss": 1.0905,
"mean_token_accuracy": 0.7132027888622114,
"step": 10
},
{
"epoch": 0.033296337402885685,
"grad_norm": 0.2926845798768026,
"learning_rate": 6.666666666666667e-07,
"loss": 1.1025,
"mean_token_accuracy": 0.7111899019997314,
"step": 15
},
{
"epoch": 0.04439511653718091,
"grad_norm": 0.2947376842618273,
"learning_rate": 8.88888888888889e-07,
"loss": 1.0792,
"mean_token_accuracy": 0.7154833115402078,
"step": 20
},
{
"epoch": 0.05549389567147614,
"grad_norm": 0.26594989197086916,
"learning_rate": 1.111111111111111e-06,
"loss": 1.1065,
"mean_token_accuracy": 0.7056332765917197,
"step": 25
},
{
"epoch": 0.06659267480577137,
"grad_norm": 0.19302469306737396,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.0611,
"mean_token_accuracy": 0.7162843674231184,
"step": 30
},
{
"epoch": 0.07769145394006659,
"grad_norm": 0.18651122869654144,
"learning_rate": 1.5555555555555558e-06,
"loss": 1.0486,
"mean_token_accuracy": 0.7172663487082396,
"step": 35
},
{
"epoch": 0.08879023307436182,
"grad_norm": 0.14595121865836222,
"learning_rate": 1.777777777777778e-06,
"loss": 1.0363,
"mean_token_accuracy": 0.7196147071311284,
"step": 40
},
{
"epoch": 0.09988901220865705,
"grad_norm": 0.11721804821745417,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9939,
"mean_token_accuracy": 0.7276444950188313,
"step": 45
},
{
"epoch": 0.11098779134295228,
"grad_norm": 0.11362901801908676,
"learning_rate": 2.222222222222222e-06,
"loss": 0.9603,
"mean_token_accuracy": 0.7342363903629822,
"step": 50
},
{
"epoch": 0.1220865704772475,
"grad_norm": 0.09796836866809913,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.9519,
"mean_token_accuracy": 0.7358896327215562,
"step": 55
},
{
"epoch": 0.13318534961154274,
"grad_norm": 0.0957862317703024,
"learning_rate": 2.666666666666667e-06,
"loss": 0.9453,
"mean_token_accuracy": 0.7352604908987554,
"step": 60
},
{
"epoch": 0.14428412874583796,
"grad_norm": 0.08550990149349075,
"learning_rate": 2.888888888888889e-06,
"loss": 0.9156,
"mean_token_accuracy": 0.7417218485729853,
"step": 65
},
{
"epoch": 0.15538290788013318,
"grad_norm": 0.08055037389952527,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.9174,
"mean_token_accuracy": 0.74028441707153,
"step": 70
},
{
"epoch": 0.16648168701442842,
"grad_norm": 0.07404585575455314,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8997,
"mean_token_accuracy": 0.7443911962497833,
"step": 75
},
{
"epoch": 0.17758046614872364,
"grad_norm": 0.0755168094448169,
"learning_rate": 3.555555555555556e-06,
"loss": 0.8946,
"mean_token_accuracy": 0.7450205354457197,
"step": 80
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.06918137028638209,
"learning_rate": 3.777777777777778e-06,
"loss": 0.8819,
"mean_token_accuracy": 0.7460465509561359,
"step": 85
},
{
"epoch": 0.1997780244173141,
"grad_norm": 0.06649356448882256,
"learning_rate": 4.000000000000001e-06,
"loss": 0.865,
"mean_token_accuracy": 0.7505584314816246,
"step": 90
},
{
"epoch": 0.21087680355160932,
"grad_norm": 0.07241837843867385,
"learning_rate": 4.222222222222223e-06,
"loss": 0.8683,
"mean_token_accuracy": 0.7501267207853098,
"step": 95
},
{
"epoch": 0.22197558268590456,
"grad_norm": 0.0688273015748006,
"learning_rate": 4.444444444444444e-06,
"loss": 0.8554,
"mean_token_accuracy": 0.7520447614565848,
"step": 100
},
{
"epoch": 0.22197558268590456,
"eval_loss": 0.8836105465888977,
"eval_mean_token_accuracy": 0.7428216802026476,
"eval_runtime": 2.9492,
"eval_samples_per_second": 43.741,
"eval_steps_per_second": 3.73,
"step": 100
},
{
"epoch": 0.23307436182019978,
"grad_norm": 0.06672497320617116,
"learning_rate": 4.666666666666667e-06,
"loss": 0.8547,
"mean_token_accuracy": 0.7530808688116643,
"step": 105
},
{
"epoch": 0.244173140954495,
"grad_norm": 0.06588821630374943,
"learning_rate": 4.888888888888889e-06,
"loss": 0.8493,
"mean_token_accuracy": 0.7530783561789567,
"step": 110
},
{
"epoch": 0.25527192008879024,
"grad_norm": 0.06573805910703882,
"learning_rate": 5.1111111111111115e-06,
"loss": 0.8481,
"mean_token_accuracy": 0.7527923082243844,
"step": 115
},
{
"epoch": 0.2663706992230855,
"grad_norm": 0.06765053288672747,
"learning_rate": 5.333333333333334e-06,
"loss": 0.8345,
"mean_token_accuracy": 0.7570082248902316,
"step": 120
},
{
"epoch": 0.27746947835738067,
"grad_norm": 0.06423926782724641,
"learning_rate": 5.555555555555557e-06,
"loss": 0.8334,
"mean_token_accuracy": 0.7570059881760228,
"step": 125
},
{
"epoch": 0.2885682574916759,
"grad_norm": 0.06544256761084138,
"learning_rate": 5.777777777777778e-06,
"loss": 0.8306,
"mean_token_accuracy": 0.7560762771467135,
"step": 130
},
{
"epoch": 0.29966703662597116,
"grad_norm": 0.06689028220574877,
"learning_rate": 6e-06,
"loss": 0.8431,
"mean_token_accuracy": 0.7535974296049089,
"step": 135
},
{
"epoch": 0.31076581576026635,
"grad_norm": 0.06896235600250016,
"learning_rate": 6.222222222222223e-06,
"loss": 0.8164,
"mean_token_accuracy": 0.7605102065970375,
"step": 140
},
{
"epoch": 0.3218645948945616,
"grad_norm": 0.06652473163537684,
"learning_rate": 6.444444444444445e-06,
"loss": 0.8354,
"mean_token_accuracy": 0.7546830001786541,
"step": 145
},
{
"epoch": 0.33296337402885684,
"grad_norm": 0.0656998409953361,
"learning_rate": 6.666666666666667e-06,
"loss": 0.8204,
"mean_token_accuracy": 0.7590995752154607,
"step": 150
},
{
"epoch": 0.34406215316315203,
"grad_norm": 0.06721736905388327,
"learning_rate": 6.88888888888889e-06,
"loss": 0.8188,
"mean_token_accuracy": 0.7586616076789785,
"step": 155
},
{
"epoch": 0.3551609322974473,
"grad_norm": 0.06906077440788903,
"learning_rate": 7.111111111111112e-06,
"loss": 0.8418,
"mean_token_accuracy": 0.7528522141841469,
"step": 160
},
{
"epoch": 0.3662597114317425,
"grad_norm": 0.07152076555649528,
"learning_rate": 7.333333333333333e-06,
"loss": 0.8266,
"mean_token_accuracy": 0.7559661015692364,
"step": 165
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.07237554138006554,
"learning_rate": 7.555555555555556e-06,
"loss": 0.8124,
"mean_token_accuracy": 0.7609189512133896,
"step": 170
},
{
"epoch": 0.38845726970033295,
"grad_norm": 0.0824656116508997,
"learning_rate": 7.77777777777778e-06,
"loss": 0.8115,
"mean_token_accuracy": 0.759490100022586,
"step": 175
},
{
"epoch": 0.3995560488346282,
"grad_norm": 0.06954623757963556,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8207,
"mean_token_accuracy": 0.757588037268629,
"step": 180
},
{
"epoch": 0.41065482796892344,
"grad_norm": 0.06873958713395736,
"learning_rate": 8.222222222222222e-06,
"loss": 0.7891,
"mean_token_accuracy": 0.766508490603462,
"step": 185
},
{
"epoch": 0.42175360710321863,
"grad_norm": 0.06515019023084734,
"learning_rate": 8.444444444444446e-06,
"loss": 0.7942,
"mean_token_accuracy": 0.7636216832625793,
"step": 190
},
{
"epoch": 0.4328523862375139,
"grad_norm": 0.07127344424022096,
"learning_rate": 8.666666666666668e-06,
"loss": 0.8117,
"mean_token_accuracy": 0.7599510101102652,
"step": 195
},
{
"epoch": 0.4439511653718091,
"grad_norm": 0.06750508464116467,
"learning_rate": 8.888888888888888e-06,
"loss": 0.794,
"mean_token_accuracy": 0.7647805117972197,
"step": 200
},
{
"epoch": 0.4439511653718091,
"eval_loss": 0.8218581080436707,
"eval_mean_token_accuracy": 0.7549547265212898,
"eval_runtime": 2.508,
"eval_samples_per_second": 51.434,
"eval_steps_per_second": 4.386,
"step": 200
},
{
"epoch": 0.4550499445061043,
"grad_norm": 0.07169806108493063,
"learning_rate": 9.111111111111112e-06,
"loss": 0.7796,
"mean_token_accuracy": 0.7678073917643145,
"step": 205
},
{
"epoch": 0.46614872364039955,
"grad_norm": 0.06834755097234334,
"learning_rate": 9.333333333333334e-06,
"loss": 0.8232,
"mean_token_accuracy": 0.7559485442022151,
"step": 210
},
{
"epoch": 0.4772475027746948,
"grad_norm": 0.07175829230991215,
"learning_rate": 9.555555555555556e-06,
"loss": 0.7999,
"mean_token_accuracy": 0.7625316682567083,
"step": 215
},
{
"epoch": 0.48834628190899,
"grad_norm": 0.07570550308844941,
"learning_rate": 9.777777777777779e-06,
"loss": 0.7903,
"mean_token_accuracy": 0.7654010521520587,
"step": 220
},
{
"epoch": 0.49944506104328523,
"grad_norm": 0.07216277014829452,
"learning_rate": 1e-05,
"loss": 0.8053,
"mean_token_accuracy": 0.7603297556858186,
"step": 225
},
{
"epoch": 0.5105438401775805,
"grad_norm": 0.06896541991818567,
"learning_rate": 1.0222222222222223e-05,
"loss": 0.8027,
"mean_token_accuracy": 0.7611568746863343,
"step": 230
},
{
"epoch": 0.5216426193118757,
"grad_norm": 0.07269082277182284,
"learning_rate": 1.0444444444444445e-05,
"loss": 0.7901,
"mean_token_accuracy": 0.7643428675558843,
"step": 235
},
{
"epoch": 0.532741398446171,
"grad_norm": 0.07000824171875175,
"learning_rate": 1.0666666666666667e-05,
"loss": 0.7816,
"mean_token_accuracy": 0.7676277365390342,
"step": 240
},
{
"epoch": 0.5438401775804661,
"grad_norm": 0.06721455285792531,
"learning_rate": 1.088888888888889e-05,
"loss": 0.7829,
"mean_token_accuracy": 0.7671494048062515,
"step": 245
},
{
"epoch": 0.5549389567147613,
"grad_norm": 0.06540009584572047,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.7934,
"mean_token_accuracy": 0.7629588729696943,
"step": 250
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.07281032144781761,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.7757,
"mean_token_accuracy": 0.7677620348353081,
"step": 255
},
{
"epoch": 0.5771365149833518,
"grad_norm": 0.06942748506092133,
"learning_rate": 1.1555555555555556e-05,
"loss": 0.7564,
"mean_token_accuracy": 0.7731927834900716,
"step": 260
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.0662825869216575,
"learning_rate": 1.177777777777778e-05,
"loss": 0.7783,
"mean_token_accuracy": 0.7668050606016081,
"step": 265
},
{
"epoch": 0.5993340732519423,
"grad_norm": 0.06693471426836645,
"learning_rate": 1.2e-05,
"loss": 0.7731,
"mean_token_accuracy": 0.768561360245379,
"step": 270
},
{
"epoch": 0.6104328523862376,
"grad_norm": 0.07214761991106813,
"learning_rate": 1.2222222222222224e-05,
"loss": 0.775,
"mean_token_accuracy": 0.7678224396480112,
"step": 275
},
{
"epoch": 0.6215316315205327,
"grad_norm": 0.07941094979893487,
"learning_rate": 1.2444444444444446e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7695845557519624,
"step": 280
},
{
"epoch": 0.632630410654828,
"grad_norm": 0.07862635171047101,
"learning_rate": 1.2666666666666667e-05,
"loss": 0.8048,
"mean_token_accuracy": 0.7594086424792449,
"step": 285
},
{
"epoch": 0.6437291897891232,
"grad_norm": 0.06739330160330582,
"learning_rate": 1.288888888888889e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7669548215148532,
"step": 290
},
{
"epoch": 0.6548279689234184,
"grad_norm": 0.07180391852563321,
"learning_rate": 1.3111111111111113e-05,
"loss": 0.7864,
"mean_token_accuracy": 0.7643820889959978,
"step": 295
},
{
"epoch": 0.6659267480577137,
"grad_norm": 0.07358894359497874,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.778,
"mean_token_accuracy": 0.7664805782566748,
"step": 300
},
{
"epoch": 0.6659267480577137,
"eval_loss": 0.7948585748672485,
"eval_mean_token_accuracy": 0.7601977645003797,
"eval_runtime": 2.5061,
"eval_samples_per_second": 51.474,
"eval_steps_per_second": 4.389,
"step": 300
},
{
"epoch": 0.6770255271920089,
"grad_norm": 0.07176558672546726,
"learning_rate": 1.3555555555555557e-05,
"loss": 0.762,
"mean_token_accuracy": 0.7715469734324651,
"step": 305
},
{
"epoch": 0.6881243063263041,
"grad_norm": 0.07661120954462695,
"learning_rate": 1.377777777777778e-05,
"loss": 0.774,
"mean_token_accuracy": 0.7688521861239055,
"step": 310
},
{
"epoch": 0.6992230854605993,
"grad_norm": 0.08099285112089856,
"learning_rate": 1.4e-05,
"loss": 0.7621,
"mean_token_accuracy": 0.7709743741953946,
"step": 315
},
{
"epoch": 0.7103218645948945,
"grad_norm": 0.07635780227021953,
"learning_rate": 1.4222222222222224e-05,
"loss": 0.741,
"mean_token_accuracy": 0.7771059346697895,
"step": 320
},
{
"epoch": 0.7214206437291898,
"grad_norm": 0.07916685859429015,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.7655,
"mean_token_accuracy": 0.7704483939834843,
"step": 325
},
{
"epoch": 0.732519422863485,
"grad_norm": 0.0679326809173189,
"learning_rate": 1.4666666666666666e-05,
"loss": 0.7657,
"mean_token_accuracy": 0.7706814426497193,
"step": 330
},
{
"epoch": 0.7436182019977803,
"grad_norm": 0.0779819303278401,
"learning_rate": 1.488888888888889e-05,
"loss": 0.7744,
"mean_token_accuracy": 0.766929925481714,
"step": 335
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.0912911418984695,
"learning_rate": 1.5111111111111112e-05,
"loss": 0.7617,
"mean_token_accuracy": 0.7714522958687495,
"step": 340
},
{
"epoch": 0.7658157602663707,
"grad_norm": 0.07861319372730714,
"learning_rate": 1.5333333333333334e-05,
"loss": 0.7761,
"mean_token_accuracy": 0.7662400340052155,
"step": 345
},
{
"epoch": 0.7769145394006659,
"grad_norm": 0.07302424137610575,
"learning_rate": 1.555555555555556e-05,
"loss": 0.7657,
"mean_token_accuracy": 0.7694574502400165,
"step": 350
},
{
"epoch": 0.7880133185349611,
"grad_norm": 0.07529267936879339,
"learning_rate": 1.577777777777778e-05,
"loss": 0.7726,
"mean_token_accuracy": 0.7683270370750246,
"step": 355
},
{
"epoch": 0.7991120976692564,
"grad_norm": 0.07476756089459828,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7621,
"mean_token_accuracy": 0.7702739762275538,
"step": 360
},
{
"epoch": 0.8102108768035516,
"grad_norm": 0.06840012057538632,
"learning_rate": 1.6222222222222223e-05,
"loss": 0.7537,
"mean_token_accuracy": 0.772770864897998,
"step": 365
},
{
"epoch": 0.8213096559378469,
"grad_norm": 0.08097322392843904,
"learning_rate": 1.6444444444444444e-05,
"loss": 0.7747,
"mean_token_accuracy": 0.7670298646020149,
"step": 370
},
{
"epoch": 0.832408435072142,
"grad_norm": 0.06862947612315898,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7571,
"mean_token_accuracy": 0.7722756690046501,
"step": 375
},
{
"epoch": 0.8435072142064373,
"grad_norm": 0.06985899518061597,
"learning_rate": 1.688888888888889e-05,
"loss": 0.7592,
"mean_token_accuracy": 0.7716363522611088,
"step": 380
},
{
"epoch": 0.8546059933407325,
"grad_norm": 0.07236936825950281,
"learning_rate": 1.7111111111111112e-05,
"loss": 0.7623,
"mean_token_accuracy": 0.7704382284365983,
"step": 385
},
{
"epoch": 0.8657047724750278,
"grad_norm": 0.06927292766824548,
"learning_rate": 1.7333333333333336e-05,
"loss": 0.752,
"mean_token_accuracy": 0.7727048211606123,
"step": 390
},
{
"epoch": 0.876803551609323,
"grad_norm": 0.07436093330373601,
"learning_rate": 1.7555555555555556e-05,
"loss": 0.7332,
"mean_token_accuracy": 0.7787908082397291,
"step": 395
},
{
"epoch": 0.8879023307436182,
"grad_norm": 0.07000272665969048,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.7595,
"mean_token_accuracy": 0.7709150293562277,
"step": 400
},
{
"epoch": 0.8879023307436182,
"eval_loss": 0.7792394161224365,
"eval_mean_token_accuracy": 0.7637743885515423,
"eval_runtime": 2.5025,
"eval_samples_per_second": 51.549,
"eval_steps_per_second": 4.396,
"step": 400
},
{
"epoch": 0.8990011098779135,
"grad_norm": 0.07828773197571512,
"learning_rate": 1.8e-05,
"loss": 0.75,
"mean_token_accuracy": 0.7742035226119346,
"step": 405
},
{
"epoch": 0.9100998890122086,
"grad_norm": 0.0786800574481835,
"learning_rate": 1.8222222222222224e-05,
"loss": 0.7458,
"mean_token_accuracy": 0.7740958458322968,
"step": 410
},
{
"epoch": 0.9211986681465039,
"grad_norm": 0.06723168272062767,
"learning_rate": 1.8444444444444448e-05,
"loss": 0.7309,
"mean_token_accuracy": 0.7794692081483484,
"step": 415
},
{
"epoch": 0.9322974472807991,
"grad_norm": 0.07897130908429875,
"learning_rate": 1.866666666666667e-05,
"loss": 0.7431,
"mean_token_accuracy": 0.7757018190238035,
"step": 420
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.07871638828751878,
"learning_rate": 1.888888888888889e-05,
"loss": 0.7489,
"mean_token_accuracy": 0.7737513410604485,
"step": 425
},
{
"epoch": 0.9544950055493896,
"grad_norm": 0.0766181271179252,
"learning_rate": 1.9111111111111113e-05,
"loss": 0.7394,
"mean_token_accuracy": 0.7765881795651786,
"step": 430
},
{
"epoch": 0.9655937846836848,
"grad_norm": 0.07235155791983158,
"learning_rate": 1.9333333333333333e-05,
"loss": 0.7359,
"mean_token_accuracy": 0.7773230673527788,
"step": 435
},
{
"epoch": 0.97669256381798,
"grad_norm": 0.06718122861403932,
"learning_rate": 1.9555555555555557e-05,
"loss": 0.7451,
"mean_token_accuracy": 0.7741639861854092,
"step": 440
},
{
"epoch": 0.9877913429522752,
"grad_norm": 0.06979726690486535,
"learning_rate": 1.977777777777778e-05,
"loss": 0.7446,
"mean_token_accuracy": 0.7742892173350467,
"step": 445
},
{
"epoch": 0.9988901220865705,
"grad_norm": 0.08187524092621244,
"learning_rate": 2e-05,
"loss": 0.7618,
"mean_token_accuracy": 0.7693789734522902,
"step": 450
},
{
"epoch": 1.0088790233074363,
"grad_norm": 0.07032602395899457,
"learning_rate": 1.9999924785916563e-05,
"loss": 0.7212,
"mean_token_accuracy": 0.7807686469529602,
"step": 455
},
{
"epoch": 1.0199778024417314,
"grad_norm": 0.07847380211681673,
"learning_rate": 1.999969914479768e-05,
"loss": 0.7193,
"mean_token_accuracy": 0.7805523329120769,
"step": 460
},
{
"epoch": 1.0310765815760266,
"grad_norm": 0.07307725873740153,
"learning_rate": 1.9999323080037623e-05,
"loss": 0.7265,
"mean_token_accuracy": 0.7779903005243214,
"step": 465
},
{
"epoch": 1.042175360710322,
"grad_norm": 0.07669795019824106,
"learning_rate": 1.9998796597293477e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.779993979104892,
"step": 470
},
{
"epoch": 1.053274139844617,
"grad_norm": 0.0792411711933711,
"learning_rate": 1.9998119704485016e-05,
"loss": 0.72,
"mean_token_accuracy": 0.7801120305583913,
"step": 475
},
{
"epoch": 1.0643729189789124,
"grad_norm": 0.07886279995421645,
"learning_rate": 1.999729241179462e-05,
"loss": 0.7138,
"mean_token_accuracy": 0.781635200170369,
"step": 480
},
{
"epoch": 1.0754716981132075,
"grad_norm": 0.07435886102846764,
"learning_rate": 1.9996314731667096e-05,
"loss": 0.7076,
"mean_token_accuracy": 0.7830717794390655,
"step": 485
},
{
"epoch": 1.0865704772475029,
"grad_norm": 0.07230226563190868,
"learning_rate": 1.9995186678809513e-05,
"loss": 0.7033,
"mean_token_accuracy": 0.7850300188021543,
"step": 490
},
{
"epoch": 1.097669256381798,
"grad_norm": 0.07003963622013061,
"learning_rate": 1.999390827019096e-05,
"loss": 0.7044,
"mean_token_accuracy": 0.7842776950780501,
"step": 495
},
{
"epoch": 1.1087680355160932,
"grad_norm": 0.07508778652915178,
"learning_rate": 1.9992479525042305e-05,
"loss": 0.7372,
"mean_token_accuracy": 0.774935390675619,
"step": 500
},
{
"epoch": 1.1087680355160932,
"eval_loss": 0.7690628170967102,
"eval_mean_token_accuracy": 0.7655783087569432,
"eval_runtime": 2.4973,
"eval_samples_per_second": 51.656,
"eval_steps_per_second": 4.405,
"step": 500
},
{
"epoch": 1.1198668146503885,
"grad_norm": 0.07918378464816227,
"learning_rate": 1.9990900464855895e-05,
"loss": 0.7006,
"mean_token_accuracy": 0.7857579260565195,
"step": 505
},
{
"epoch": 1.1309655937846836,
"grad_norm": 0.07328427894357584,
"learning_rate": 1.998917111338525e-05,
"loss": 0.725,
"mean_token_accuracy": 0.7775817733453466,
"step": 510
},
{
"epoch": 1.142064372918979,
"grad_norm": 0.07204080045449197,
"learning_rate": 1.998729149664468e-05,
"loss": 0.7016,
"mean_token_accuracy": 0.7855126398407342,
"step": 515
},
{
"epoch": 1.1531631520532741,
"grad_norm": 0.08533628416371716,
"learning_rate": 1.9985261642908917e-05,
"loss": 0.7292,
"mean_token_accuracy": 0.7774274235941383,
"step": 520
},
{
"epoch": 1.1642619311875695,
"grad_norm": 0.07520433809088832,
"learning_rate": 1.9983081582712684e-05,
"loss": 0.7181,
"mean_token_accuracy": 0.780462785723534,
"step": 525
},
{
"epoch": 1.1753607103218646,
"grad_norm": 0.07190307670380537,
"learning_rate": 1.998075134885022e-05,
"loss": 0.7116,
"mean_token_accuracy": 0.7825697671237573,
"step": 530
},
{
"epoch": 1.1864594894561598,
"grad_norm": 0.06491993297695452,
"learning_rate": 1.9978270976374813e-05,
"loss": 0.6703,
"mean_token_accuracy": 0.793881569205478,
"step": 535
},
{
"epoch": 1.197558268590455,
"grad_norm": 0.06911870719514304,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.7082,
"mean_token_accuracy": 0.7836861901228346,
"step": 540
},
{
"epoch": 1.2086570477247502,
"grad_norm": 0.07437166520951717,
"learning_rate": 1.9972859967090253e-05,
"loss": 0.7264,
"mean_token_accuracy": 0.7775489198837382,
"step": 545
},
{
"epoch": 1.2197558268590456,
"grad_norm": 0.06775894355173918,
"learning_rate": 1.996992941167792e-05,
"loss": 0.7133,
"mean_token_accuracy": 0.7810329813258085,
"step": 550
},
{
"epoch": 1.2308546059933407,
"grad_norm": 0.07314137750187506,
"learning_rate": 1.996684888044506e-05,
"loss": 0.7037,
"mean_token_accuracy": 0.7839758348181621,
"step": 555
},
{
"epoch": 1.2419533851276359,
"grad_norm": 0.07831492126639587,
"learning_rate": 1.996361841973154e-05,
"loss": 0.7126,
"mean_token_accuracy": 0.7816916858828595,
"step": 560
},
{
"epoch": 1.2530521642619312,
"grad_norm": 0.07517018503795618,
"learning_rate": 1.996023807813258e-05,
"loss": 0.6941,
"mean_token_accuracy": 0.7867524535551568,
"step": 565
},
{
"epoch": 1.2641509433962264,
"grad_norm": 0.0704338781552015,
"learning_rate": 1.9956707906498046e-05,
"loss": 0.6903,
"mean_token_accuracy": 0.7879095356059462,
"step": 570
},
{
"epoch": 1.2752497225305217,
"grad_norm": 0.06995474295091916,
"learning_rate": 1.9953027957931658e-05,
"loss": 0.7222,
"mean_token_accuracy": 0.7794008881208037,
"step": 575
},
{
"epoch": 1.2863485016648168,
"grad_norm": 0.07694713494942496,
"learning_rate": 1.9949198287790215e-05,
"loss": 0.6943,
"mean_token_accuracy": 0.7870831177107039,
"step": 580
},
{
"epoch": 1.297447280799112,
"grad_norm": 0.08903403872621385,
"learning_rate": 1.9945218953682736e-05,
"loss": 0.7206,
"mean_token_accuracy": 0.779562023576681,
"step": 585
},
{
"epoch": 1.3085460599334073,
"grad_norm": 0.07363661521999505,
"learning_rate": 1.9941090015469614e-05,
"loss": 0.724,
"mean_token_accuracy": 0.7783052999023876,
"step": 590
},
{
"epoch": 1.3196448390677027,
"grad_norm": 0.07544765033521425,
"learning_rate": 1.9936811535261714e-05,
"loss": 0.712,
"mean_token_accuracy": 0.7817552803483336,
"step": 595
},
{
"epoch": 1.3307436182019978,
"grad_norm": 0.07392484759536327,
"learning_rate": 1.9932383577419432e-05,
"loss": 0.7227,
"mean_token_accuracy": 0.7788691622542909,
"step": 600
},
{
"epoch": 1.3307436182019978,
"eval_loss": 0.7600793838500977,
"eval_mean_token_accuracy": 0.7673947304714178,
"eval_runtime": 2.5058,
"eval_samples_per_second": 51.48,
"eval_steps_per_second": 4.39,
"step": 600
},
{
"epoch": 1.341842397336293,
"grad_norm": 0.07911913461484033,
"learning_rate": 1.9927806208551718e-05,
"loss": 0.7093,
"mean_token_accuracy": 0.782579034246759,
"step": 605
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.0642347895827473,
"learning_rate": 1.99230794975151e-05,
"loss": 0.7212,
"mean_token_accuracy": 0.7787558227928784,
"step": 610
},
{
"epoch": 1.3640399556048834,
"grad_norm": 0.067833068280622,
"learning_rate": 1.9918203515412616e-05,
"loss": 0.7185,
"mean_token_accuracy": 0.7798822363644213,
"step": 615
},
{
"epoch": 1.3751387347391786,
"grad_norm": 0.0688915478969926,
"learning_rate": 1.9913178335592784e-05,
"loss": 0.7249,
"mean_token_accuracy": 0.7776480990593553,
"step": 620
},
{
"epoch": 1.386237513873474,
"grad_norm": 0.06744773171181109,
"learning_rate": 1.9908004033648452e-05,
"loss": 0.6984,
"mean_token_accuracy": 0.785427300541276,
"step": 625
},
{
"epoch": 1.397336293007769,
"grad_norm": 0.07602324779465816,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.7138,
"mean_token_accuracy": 0.7809259935769812,
"step": 630
},
{
"epoch": 1.4084350721420644,
"grad_norm": 0.07420737656548172,
"learning_rate": 1.9897208376972655e-05,
"loss": 0.7208,
"mean_token_accuracy": 0.7793231378215638,
"step": 635
},
{
"epoch": 1.4195338512763596,
"grad_norm": 0.07343563458085059,
"learning_rate": 1.9891587184638274e-05,
"loss": 0.7221,
"mean_token_accuracy": 0.7778519855839254,
"step": 640
},
{
"epoch": 1.430632630410655,
"grad_norm": 0.07698384093927461,
"learning_rate": 1.9885817194971116e-05,
"loss": 0.7137,
"mean_token_accuracy": 0.7805180143823278,
"step": 645
},
{
"epoch": 1.44173140954495,
"grad_norm": 0.07449182501302304,
"learning_rate": 1.9879898494768093e-05,
"loss": 0.7051,
"mean_token_accuracy": 0.7831931362540656,
"step": 650
},
{
"epoch": 1.4528301886792452,
"grad_norm": 0.07581066932689472,
"learning_rate": 1.9873831173063113e-05,
"loss": 0.7049,
"mean_token_accuracy": 0.7832020777520368,
"step": 655
},
{
"epoch": 1.4639289678135405,
"grad_norm": 0.06949135831136043,
"learning_rate": 1.9867615321125796e-05,
"loss": 0.7056,
"mean_token_accuracy": 0.7838479455915908,
"step": 660
},
{
"epoch": 1.4750277469478357,
"grad_norm": 0.07439968483839138,
"learning_rate": 1.9861251032460053e-05,
"loss": 0.7081,
"mean_token_accuracy": 0.7832292936410306,
"step": 665
},
{
"epoch": 1.486126526082131,
"grad_norm": 0.06607770608293857,
"learning_rate": 1.9854738402802715e-05,
"loss": 0.6932,
"mean_token_accuracy": 0.786992797754289,
"step": 670
},
{
"epoch": 1.4972253052164262,
"grad_norm": 0.07053844606053124,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.6965,
"mean_token_accuracy": 0.7860283399251268,
"step": 675
},
{
"epoch": 1.5083240843507215,
"grad_norm": 0.0651242671351587,
"learning_rate": 1.9841268514616434e-05,
"loss": 0.7181,
"mean_token_accuracy": 0.7788762916398921,
"step": 680
},
{
"epoch": 1.5194228634850167,
"grad_norm": 0.06916533407392902,
"learning_rate": 1.9834311458712547e-05,
"loss": 0.7076,
"mean_token_accuracy": 0.7832109400921368,
"step": 685
},
{
"epoch": 1.5305216426193118,
"grad_norm": 0.07466081988250255,
"learning_rate": 1.9827206467064133e-05,
"loss": 0.7074,
"mean_token_accuracy": 0.7828159386074006,
"step": 690
},
{
"epoch": 1.5416204217536071,
"grad_norm": 0.06967673384995965,
"learning_rate": 1.9819953646550286e-05,
"loss": 0.7003,
"mean_token_accuracy": 0.7843127898869101,
"step": 695
},
{
"epoch": 1.5527192008879025,
"grad_norm": 0.07129452144368828,
"learning_rate": 1.9812553106273848e-05,
"loss": 0.6914,
"mean_token_accuracy": 0.787352826911644,
"step": 700
},
{
"epoch": 1.5527192008879025,
"eval_loss": 0.7521222829818726,
"eval_mean_token_accuracy": 0.7691004239798882,
"eval_runtime": 2.4983,
"eval_samples_per_second": 51.635,
"eval_steps_per_second": 4.403,
"step": 700
},
{
"epoch": 1.5638179800221974,
"grad_norm": 0.07293730779046935,
"learning_rate": 1.9805004957559795e-05,
"loss": 0.6921,
"mean_token_accuracy": 0.7870082715054341,
"step": 705
},
{
"epoch": 1.5749167591564928,
"grad_norm": 0.06857400596374212,
"learning_rate": 1.979730931395354e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.7806777176754099,
"step": 710
},
{
"epoch": 1.5860155382907881,
"grad_norm": 0.06787546605095558,
"learning_rate": 1.9789466291219246e-05,
"loss": 0.7006,
"mean_token_accuracy": 0.7845560051830258,
"step": 715
},
{
"epoch": 1.5971143174250833,
"grad_norm": 0.06978512332784513,
"learning_rate": 1.9781476007338058e-05,
"loss": 0.7063,
"mean_token_accuracy": 0.7831825376761976,
"step": 720
},
{
"epoch": 1.6082130965593784,
"grad_norm": 0.06573978556995788,
"learning_rate": 1.9773338582506357e-05,
"loss": 0.6974,
"mean_token_accuracy": 0.7848375942024458,
"step": 725
},
{
"epoch": 1.6193118756936737,
"grad_norm": 0.06846729263039855,
"learning_rate": 1.976505413913393e-05,
"loss": 0.7304,
"mean_token_accuracy": 0.7762347996924202,
"step": 730
},
{
"epoch": 1.6304106548279689,
"grad_norm": 0.07292716460344072,
"learning_rate": 1.9756622801842144e-05,
"loss": 0.6945,
"mean_token_accuracy": 0.785635234248747,
"step": 735
},
{
"epoch": 1.641509433962264,
"grad_norm": 0.06788764902071766,
"learning_rate": 1.974804469746206e-05,
"loss": 0.6871,
"mean_token_accuracy": 0.7883116176033117,
"step": 740
},
{
"epoch": 1.6526082130965594,
"grad_norm": 0.06990606289840068,
"learning_rate": 1.9739319955032522e-05,
"loss": 0.7105,
"mean_token_accuracy": 0.7815165621962155,
"step": 745
},
{
"epoch": 1.6637069922308547,
"grad_norm": 0.06745401830931097,
"learning_rate": 1.973044870579824e-05,
"loss": 0.6901,
"mean_token_accuracy": 0.7868986592436943,
"step": 750
},
{
"epoch": 1.6748057713651499,
"grad_norm": 0.06884276535878785,
"learning_rate": 1.9721431083207786e-05,
"loss": 0.7096,
"mean_token_accuracy": 0.7823803797267934,
"step": 755
},
{
"epoch": 1.685904550499445,
"grad_norm": 0.06879555042165823,
"learning_rate": 1.9712267222911605e-05,
"loss": 0.6971,
"mean_token_accuracy": 0.7852152497634741,
"step": 760
},
{
"epoch": 1.6970033296337403,
"grad_norm": 0.07979933050534435,
"learning_rate": 1.9702957262759964e-05,
"loss": 0.714,
"mean_token_accuracy": 0.7805189578063543,
"step": 765
},
{
"epoch": 1.7081021087680355,
"grad_norm": 0.07357866691629775,
"learning_rate": 1.9693501342800895e-05,
"loss": 0.6874,
"mean_token_accuracy": 0.7879884917047848,
"step": 770
},
{
"epoch": 1.7192008879023306,
"grad_norm": 0.0701902920875424,
"learning_rate": 1.9683899605278062e-05,
"loss": 0.7159,
"mean_token_accuracy": 0.7792839420183199,
"step": 775
},
{
"epoch": 1.730299667036626,
"grad_norm": 0.06842656933800775,
"learning_rate": 1.967415219462864e-05,
"loss": 0.6942,
"mean_token_accuracy": 0.7865705838505347,
"step": 780
},
{
"epoch": 1.7413984461709213,
"grad_norm": 0.06540618097934706,
"learning_rate": 1.966425925748115e-05,
"loss": 0.6898,
"mean_token_accuracy": 0.7874069573881058,
"step": 785
},
{
"epoch": 1.7524972253052165,
"grad_norm": 0.06700381366809494,
"learning_rate": 1.9654220942653223e-05,
"loss": 0.7148,
"mean_token_accuracy": 0.7804010777085539,
"step": 790
},
{
"epoch": 1.7635960044395116,
"grad_norm": 0.06251703724168588,
"learning_rate": 1.964403740114939e-05,
"loss": 0.7042,
"mean_token_accuracy": 0.7826519820274497,
"step": 795
},
{
"epoch": 1.774694783573807,
"grad_norm": 0.06965212134485024,
"learning_rate": 1.9633708786158803e-05,
"loss": 0.7041,
"mean_token_accuracy": 0.7828340916547577,
"step": 800
},
{
"epoch": 1.774694783573807,
"eval_loss": 0.7456536293029785,
"eval_mean_token_accuracy": 0.7707645481418179,
"eval_runtime": 2.5019,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 4.397,
"step": 800
},
{
"epoch": 1.785793562708102,
"grad_norm": 0.06629236648953662,
"learning_rate": 1.9623235253052924e-05,
"loss": 0.7031,
"mean_token_accuracy": 0.783023400482328,
"step": 805
},
{
"epoch": 1.7968923418423972,
"grad_norm": 0.0700851146503613,
"learning_rate": 1.961261695938319e-05,
"loss": 0.6879,
"mean_token_accuracy": 0.7871533891419487,
"step": 810
},
{
"epoch": 1.8079911209766926,
"grad_norm": 0.06481275430876884,
"learning_rate": 1.9601854064878645e-05,
"loss": 0.6976,
"mean_token_accuracy": 0.7848978893968511,
"step": 815
},
{
"epoch": 1.819089900110988,
"grad_norm": 0.06713656020683173,
"learning_rate": 1.959094673144354e-05,
"loss": 0.6972,
"mean_token_accuracy": 0.7849619060835862,
"step": 820
},
{
"epoch": 1.830188679245283,
"grad_norm": 0.06943806825337467,
"learning_rate": 1.957989512315489e-05,
"loss": 0.7057,
"mean_token_accuracy": 0.7820006305173784,
"step": 825
},
{
"epoch": 1.8412874583795782,
"grad_norm": 0.07046613499341978,
"learning_rate": 1.9568699406260016e-05,
"loss": 0.6993,
"mean_token_accuracy": 0.7842056521277757,
"step": 830
},
{
"epoch": 1.8523862375138735,
"grad_norm": 0.06742551538991974,
"learning_rate": 1.9557359749174033e-05,
"loss": 0.6937,
"mean_token_accuracy": 0.7860851352389128,
"step": 835
},
{
"epoch": 1.8634850166481687,
"grad_norm": 0.06568715165754793,
"learning_rate": 1.954587632247732e-05,
"loss": 0.7206,
"mean_token_accuracy": 0.7770859098863055,
"step": 840
},
{
"epoch": 1.8745837957824638,
"grad_norm": 0.07469838406522963,
"learning_rate": 1.9534249298912968e-05,
"loss": 0.7003,
"mean_token_accuracy": 0.7843834053290605,
"step": 845
},
{
"epoch": 1.8856825749167592,
"grad_norm": 0.07128217656144081,
"learning_rate": 1.9522478853384154e-05,
"loss": 0.7094,
"mean_token_accuracy": 0.780810941894696,
"step": 850
},
{
"epoch": 1.8967813540510545,
"grad_norm": 0.06973978771104394,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.6922,
"mean_token_accuracy": 0.7863895170050037,
"step": 855
},
{
"epoch": 1.9078801331853497,
"grad_norm": 0.0670621119728851,
"learning_rate": 1.9498508406830577e-05,
"loss": 0.7063,
"mean_token_accuracy": 0.7823195111378769,
"step": 860
},
{
"epoch": 1.9189789123196448,
"grad_norm": 0.07134711572023682,
"learning_rate": 1.9486308766388843e-05,
"loss": 0.7145,
"mean_token_accuracy": 0.7796630421809072,
"step": 865
},
{
"epoch": 1.9300776914539401,
"grad_norm": 0.06724287084560662,
"learning_rate": 1.9473966425143292e-05,
"loss": 0.6996,
"mean_token_accuracy": 0.7833654062751905,
"step": 870
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.07372692352709145,
"learning_rate": 1.946148156875751e-05,
"loss": 0.7108,
"mean_token_accuracy": 0.7810817039400682,
"step": 875
},
{
"epoch": 1.9522752497225304,
"grad_norm": 0.06690981599243406,
"learning_rate": 1.944885438503888e-05,
"loss": 0.7219,
"mean_token_accuracy": 0.778134444662938,
"step": 880
},
{
"epoch": 1.9633740288568258,
"grad_norm": 0.0706861843071515,
"learning_rate": 1.9436085063935837e-05,
"loss": 0.6837,
"mean_token_accuracy": 0.7875059272645717,
"step": 885
},
{
"epoch": 1.9744728079911211,
"grad_norm": 0.06607352029909119,
"learning_rate": 1.9423173797534924e-05,
"loss": 0.7144,
"mean_token_accuracy": 0.7800783446443771,
"step": 890
},
{
"epoch": 1.9855715871254163,
"grad_norm": 0.06497044410128139,
"learning_rate": 1.9410120780057958e-05,
"loss": 0.6997,
"mean_token_accuracy": 0.7838718452599522,
"step": 895
},
{
"epoch": 1.9966703662597114,
"grad_norm": 0.0669833105277842,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.7017,
"mean_token_accuracy": 0.783079952098619,
"step": 900
},
{
"epoch": 1.9966703662597114,
"eval_loss": 0.7399606108665466,
"eval_mean_token_accuracy": 0.7714698001679475,
"eval_runtime": 2.502,
"eval_samples_per_second": 51.559,
"eval_steps_per_second": 4.397,
"step": 900
},
{
"epoch": 2.006659267480577,
"grad_norm": 0.06752952481322469,
"learning_rate": 1.938359027942184e-05,
"loss": 0.6904,
"mean_token_accuracy": 0.7866080115747632,
"step": 905
},
{
"epoch": 2.0177580466148726,
"grad_norm": 0.07337066846619034,
"learning_rate": 1.937011319535615e-05,
"loss": 0.6648,
"mean_token_accuracy": 0.7914878464945215,
"step": 910
},
{
"epoch": 2.0288568257491675,
"grad_norm": 0.07482816766070267,
"learning_rate": 1.9356495158395317e-05,
"loss": 0.6491,
"mean_token_accuracy": 0.7960834519099566,
"step": 915
},
{
"epoch": 2.039955604883463,
"grad_norm": 0.07602684827791824,
"learning_rate": 1.9342736373392976e-05,
"loss": 0.6328,
"mean_token_accuracy": 0.8012870703759848,
"step": 920
},
{
"epoch": 2.051054384017758,
"grad_norm": 0.06687084853107801,
"learning_rate": 1.932883704732001e-05,
"loss": 0.6382,
"mean_token_accuracy": 0.7995388906965585,
"step": 925
},
{
"epoch": 2.062153163152053,
"grad_norm": 0.0674027612147169,
"learning_rate": 1.9314797389261426e-05,
"loss": 0.6398,
"mean_token_accuracy": 0.7987728519453021,
"step": 930
},
{
"epoch": 2.0732519422863485,
"grad_norm": 0.06627889722925441,
"learning_rate": 1.9300617610413232e-05,
"loss": 0.6379,
"mean_token_accuracy": 0.7998474325494318,
"step": 935
},
{
"epoch": 2.084350721420644,
"grad_norm": 0.06945568860354903,
"learning_rate": 1.9286297924079244e-05,
"loss": 0.6191,
"mean_token_accuracy": 0.8055909240777206,
"step": 940
},
{
"epoch": 2.095449500554939,
"grad_norm": 0.07041836195613017,
"learning_rate": 1.9271838545667876e-05,
"loss": 0.6556,
"mean_token_accuracy": 0.7946697171561378,
"step": 945
},
{
"epoch": 2.106548279689234,
"grad_norm": 0.06688242487342114,
"learning_rate": 1.9257239692688907e-05,
"loss": 0.6574,
"mean_token_accuracy": 0.7937854444231515,
"step": 950
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.06439046861743944,
"learning_rate": 1.92425015847502e-05,
"loss": 0.6403,
"mean_token_accuracy": 0.7991636690728392,
"step": 955
},
{
"epoch": 2.128745837957825,
"grad_norm": 0.06720995663335921,
"learning_rate": 1.9227624443554425e-05,
"loss": 0.6453,
"mean_token_accuracy": 0.7970826233622977,
"step": 960
},
{
"epoch": 2.1398446170921197,
"grad_norm": 0.06581057685199704,
"learning_rate": 1.921260849289568e-05,
"loss": 0.6232,
"mean_token_accuracy": 0.8048729584925922,
"step": 965
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.06501306496885344,
"learning_rate": 1.9197453958656157e-05,
"loss": 0.6399,
"mean_token_accuracy": 0.7984237345184382,
"step": 970
},
{
"epoch": 2.1620421753607104,
"grad_norm": 0.06827894352599473,
"learning_rate": 1.9182161068802742e-05,
"loss": 0.6495,
"mean_token_accuracy": 0.795851682529419,
"step": 975
},
{
"epoch": 2.1731409544950058,
"grad_norm": 0.06461558807425297,
"learning_rate": 1.916673005338357e-05,
"loss": 0.656,
"mean_token_accuracy": 0.794067004713519,
"step": 980
},
{
"epoch": 2.1842397336293007,
"grad_norm": 0.07270995750344725,
"learning_rate": 1.915116114452458e-05,
"loss": 0.6385,
"mean_token_accuracy": 0.7996120848528411,
"step": 985
},
{
"epoch": 2.195338512763596,
"grad_norm": 0.06469442541114857,
"learning_rate": 1.913545457642601e-05,
"loss": 0.6479,
"mean_token_accuracy": 0.7971225711022437,
"step": 990
},
{
"epoch": 2.2064372918978914,
"grad_norm": 0.06869297358951969,
"learning_rate": 1.911961058535889e-05,
"loss": 0.6184,
"mean_token_accuracy": 0.8051116219349698,
"step": 995
},
{
"epoch": 2.2175360710321863,
"grad_norm": 0.06879744937055413,
"learning_rate": 1.9103629409661468e-05,
"loss": 0.6619,
"mean_token_accuracy": 0.7926388777759388,
"step": 1000
},
{
"epoch": 2.2175360710321863,
"eval_loss": 0.7441337704658508,
"eval_mean_token_accuracy": 0.771140236457519,
"eval_runtime": 2.5113,
"eval_samples_per_second": 51.369,
"eval_steps_per_second": 4.38,
"step": 1000
},
{
"epoch": 2.2286348501664817,
"grad_norm": 0.06942438285553687,
"learning_rate": 1.9087511289735646e-05,
"loss": 0.6319,
"mean_token_accuracy": 0.8006943272195789,
"step": 1005
},
{
"epoch": 2.239733629300777,
"grad_norm": 0.07156492009528506,
"learning_rate": 1.907125646804334e-05,
"loss": 0.66,
"mean_token_accuracy": 0.793213293023644,
"step": 1010
},
{
"epoch": 2.2508324084350724,
"grad_norm": 0.0725060557022843,
"learning_rate": 1.905486518910286e-05,
"loss": 0.6675,
"mean_token_accuracy": 0.7907495163202596,
"step": 1015
},
{
"epoch": 2.2619311875693673,
"grad_norm": 0.06665264760259867,
"learning_rate": 1.9038337699485207e-05,
"loss": 0.6574,
"mean_token_accuracy": 0.7942814820609995,
"step": 1020
},
{
"epoch": 2.2730299667036626,
"grad_norm": 0.06471591371274955,
"learning_rate": 1.902167424781038e-05,
"loss": 0.6518,
"mean_token_accuracy": 0.7957625783260307,
"step": 1025
},
{
"epoch": 2.284128745837958,
"grad_norm": 0.06342644925790648,
"learning_rate": 1.9004875084743624e-05,
"loss": 0.6441,
"mean_token_accuracy": 0.7975496945585403,
"step": 1030
},
{
"epoch": 2.295227524972253,
"grad_norm": 0.06887554911088101,
"learning_rate": 1.8987940462991673e-05,
"loss": 0.6354,
"mean_token_accuracy": 0.8001512281519416,
"step": 1035
},
{
"epoch": 2.3063263041065483,
"grad_norm": 0.06901162427572825,
"learning_rate": 1.8970870637298936e-05,
"loss": 0.6553,
"mean_token_accuracy": 0.7942396405885372,
"step": 1040
},
{
"epoch": 2.3174250832408436,
"grad_norm": 0.06479294329676767,
"learning_rate": 1.895366586444367e-05,
"loss": 0.6583,
"mean_token_accuracy": 0.7935195245732686,
"step": 1045
},
{
"epoch": 2.328523862375139,
"grad_norm": 0.06989659802389134,
"learning_rate": 1.8936326403234125e-05,
"loss": 0.652,
"mean_token_accuracy": 0.7949510683065234,
"step": 1050
},
{
"epoch": 2.339622641509434,
"grad_norm": 0.06954996933553559,
"learning_rate": 1.8918852514504632e-05,
"loss": 0.6385,
"mean_token_accuracy": 0.799856158538445,
"step": 1055
},
{
"epoch": 2.3507214206437292,
"grad_norm": 0.07092170466098426,
"learning_rate": 1.8901244461111697e-05,
"loss": 0.6269,
"mean_token_accuracy": 0.8034097121418204,
"step": 1060
},
{
"epoch": 2.3618201997780246,
"grad_norm": 0.06217401891589138,
"learning_rate": 1.8883502507930044e-05,
"loss": 0.6395,
"mean_token_accuracy": 0.7991042823069772,
"step": 1065
},
{
"epoch": 2.3729189789123195,
"grad_norm": 0.0662172520804478,
"learning_rate": 1.8865626921848615e-05,
"loss": 0.6451,
"mean_token_accuracy": 0.7974116719314959,
"step": 1070
},
{
"epoch": 2.384017758046615,
"grad_norm": 0.07112233857552575,
"learning_rate": 1.8847617971766577e-05,
"loss": 0.653,
"mean_token_accuracy": 0.7955020926307707,
"step": 1075
},
{
"epoch": 2.39511653718091,
"grad_norm": 0.07214209721888336,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.649,
"mean_token_accuracy": 0.7957132739895204,
"step": 1080
},
{
"epoch": 2.4062153163152056,
"grad_norm": 0.07301580591387644,
"learning_rate": 1.8811201065224122e-05,
"loss": 0.6622,
"mean_token_accuracy": 0.7918022645898184,
"step": 1085
},
{
"epoch": 2.4173140954495005,
"grad_norm": 0.06822261694746347,
"learning_rate": 1.8792793656576544e-05,
"loss": 0.6502,
"mean_token_accuracy": 0.795963781486279,
"step": 1090
},
{
"epoch": 2.428412874583796,
"grad_norm": 0.06893747302221427,
"learning_rate": 1.877425397954582e-05,
"loss": 0.6587,
"mean_token_accuracy": 0.7934014423473963,
"step": 1095
},
{
"epoch": 2.439511653718091,
"grad_norm": 0.0674145014890421,
"learning_rate": 1.8755582313020912e-05,
"loss": 0.6257,
"mean_token_accuracy": 0.8032613355575334,
"step": 1100
},
{
"epoch": 2.439511653718091,
"eval_loss": 0.7397039532661438,
"eval_mean_token_accuracy": 0.7719497194970454,
"eval_runtime": 2.5114,
"eval_samples_per_second": 51.367,
"eval_steps_per_second": 4.38,
"step": 1100
},
{
"epoch": 2.450610432852386,
"grad_norm": 0.06748775143884418,
"learning_rate": 1.873677893787627e-05,
"loss": 0.6545,
"mean_token_accuracy": 0.7947147115927367,
"step": 1105
},
{
"epoch": 2.4617092119866815,
"grad_norm": 0.06642880789873921,
"learning_rate": 1.8717844136967626e-05,
"loss": 0.657,
"mean_token_accuracy": 0.7933624891798241,
"step": 1110
},
{
"epoch": 2.472807991120977,
"grad_norm": 0.072992943391947,
"learning_rate": 1.8698778195127715e-05,
"loss": 0.6569,
"mean_token_accuracy": 0.7930798499718653,
"step": 1115
},
{
"epoch": 2.4839067702552717,
"grad_norm": 0.0676737777720738,
"learning_rate": 1.8679581399162008e-05,
"loss": 0.6461,
"mean_token_accuracy": 0.7969763117340545,
"step": 1120
},
{
"epoch": 2.495005549389567,
"grad_norm": 0.0711336163435986,
"learning_rate": 1.866025403784439e-05,
"loss": 0.6202,
"mean_token_accuracy": 0.8044824253292406,
"step": 1125
},
{
"epoch": 2.5061043285238624,
"grad_norm": 0.0697843091991907,
"learning_rate": 1.8640796401912805e-05,
"loss": 0.6353,
"mean_token_accuracy": 0.8006583544576105,
"step": 1130
},
{
"epoch": 2.5172031076581574,
"grad_norm": 0.066545120562253,
"learning_rate": 1.8621208784064913e-05,
"loss": 0.6652,
"mean_token_accuracy": 0.7910918076425002,
"step": 1135
},
{
"epoch": 2.5283018867924527,
"grad_norm": 0.06454289193586986,
"learning_rate": 1.860149147895366e-05,
"loss": 0.6276,
"mean_token_accuracy": 0.8020308318948542,
"step": 1140
},
{
"epoch": 2.539400665926748,
"grad_norm": 0.06648253339239633,
"learning_rate": 1.8581644783182837e-05,
"loss": 0.634,
"mean_token_accuracy": 0.8003396054716794,
"step": 1145
},
{
"epoch": 2.5504994450610434,
"grad_norm": 0.060198174794412355,
"learning_rate": 1.8561668995302668e-05,
"loss": 0.6469,
"mean_token_accuracy": 0.7973357638528167,
"step": 1150
},
{
"epoch": 2.561598224195339,
"grad_norm": 0.06659840562861344,
"learning_rate": 1.854156441580526e-05,
"loss": 0.6467,
"mean_token_accuracy": 0.7969091757435878,
"step": 1155
},
{
"epoch": 2.5726970033296337,
"grad_norm": 0.06563698320336359,
"learning_rate": 1.8521331347120116e-05,
"loss": 0.6381,
"mean_token_accuracy": 0.7992970667738716,
"step": 1160
},
{
"epoch": 2.583795782463929,
"grad_norm": 0.06478226257486022,
"learning_rate": 1.850097009360958e-05,
"loss": 0.6426,
"mean_token_accuracy": 0.7983039033131962,
"step": 1165
},
{
"epoch": 2.594894561598224,
"grad_norm": 0.0664676569567307,
"learning_rate": 1.848048096156426e-05,
"loss": 0.6347,
"mean_token_accuracy": 0.8004106791529967,
"step": 1170
},
{
"epoch": 2.6059933407325193,
"grad_norm": 0.07085311611322727,
"learning_rate": 1.845986425919841e-05,
"loss": 0.6606,
"mean_token_accuracy": 0.7914169985202615,
"step": 1175
},
{
"epoch": 2.6170921198668147,
"grad_norm": 0.07197033028656963,
"learning_rate": 1.843912029664531e-05,
"loss": 0.6492,
"mean_token_accuracy": 0.7963753925899613,
"step": 1180
},
{
"epoch": 2.62819089900111,
"grad_norm": 0.06511918055148407,
"learning_rate": 1.8418249385952575e-05,
"loss": 0.622,
"mean_token_accuracy": 0.8041274702036085,
"step": 1185
},
{
"epoch": 2.6392896781354054,
"grad_norm": 0.06339181942451623,
"learning_rate": 1.83972518410775e-05,
"loss": 0.6403,
"mean_token_accuracy": 0.7988095012565483,
"step": 1190
},
{
"epoch": 2.6503884572697003,
"grad_norm": 0.06237858298480496,
"learning_rate": 1.8376127977882294e-05,
"loss": 0.6225,
"mean_token_accuracy": 0.8035680254950377,
"step": 1195
},
{
"epoch": 2.6614872364039956,
"grad_norm": 0.06430635783125142,
"learning_rate": 1.8354878114129368e-05,
"loss": 0.6359,
"mean_token_accuracy": 0.7994956540644247,
"step": 1200
},
{
"epoch": 2.6614872364039956,
"eval_loss": 0.7367499470710754,
"eval_mean_token_accuracy": 0.7725689509903593,
"eval_runtime": 2.5066,
"eval_samples_per_second": 51.465,
"eval_steps_per_second": 4.388,
"step": 1200
},
{
"epoch": 2.6725860155382906,
"grad_norm": 0.07033787900057323,
"learning_rate": 1.8333502569476516e-05,
"loss": 0.6474,
"mean_token_accuracy": 0.7964749010659795,
"step": 1205
},
{
"epoch": 2.683684794672586,
"grad_norm": 0.06484833686602164,
"learning_rate": 1.8312001665472146e-05,
"loss": 0.6496,
"mean_token_accuracy": 0.7958858397960623,
"step": 1210
},
{
"epoch": 2.6947835738068813,
"grad_norm": 0.0626772586359051,
"learning_rate": 1.8290375725550417e-05,
"loss": 0.6255,
"mean_token_accuracy": 0.8037525321584809,
"step": 1215
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.06159318683983137,
"learning_rate": 1.8268625075026375e-05,
"loss": 0.6501,
"mean_token_accuracy": 0.7965229989957979,
"step": 1220
},
{
"epoch": 2.7169811320754715,
"grad_norm": 0.0618598047924171,
"learning_rate": 1.824675004109107e-05,
"loss": 0.6513,
"mean_token_accuracy": 0.7947231244403284,
"step": 1225
},
{
"epoch": 2.728079911209767,
"grad_norm": 0.06568273055121288,
"learning_rate": 1.8224750952806626e-05,
"loss": 0.646,
"mean_token_accuracy": 0.7970477905076125,
"step": 1230
},
{
"epoch": 2.7391786903440623,
"grad_norm": 0.06819286679772224,
"learning_rate": 1.8202628141101294e-05,
"loss": 0.6422,
"mean_token_accuracy": 0.7986810100724087,
"step": 1235
},
{
"epoch": 2.750277469478357,
"grad_norm": 0.0711841520608327,
"learning_rate": 1.818038193876448e-05,
"loss": 0.6174,
"mean_token_accuracy": 0.8060896278306409,
"step": 1240
},
{
"epoch": 2.7613762486126525,
"grad_norm": 0.06971065510993014,
"learning_rate": 1.8158012680441723e-05,
"loss": 0.6432,
"mean_token_accuracy": 0.7975791405172538,
"step": 1245
},
{
"epoch": 2.772475027746948,
"grad_norm": 0.06624207898108019,
"learning_rate": 1.8135520702629677e-05,
"loss": 0.6695,
"mean_token_accuracy": 0.7898385773474513,
"step": 1250
},
{
"epoch": 2.7835738068812432,
"grad_norm": 0.06340268367764915,
"learning_rate": 1.8112906343671045e-05,
"loss": 0.6448,
"mean_token_accuracy": 0.7969842824020124,
"step": 1255
},
{
"epoch": 2.794672586015538,
"grad_norm": 0.06802225275898736,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.6352,
"mean_token_accuracy": 0.7999616056672751,
"step": 1260
},
{
"epoch": 2.8057713651498335,
"grad_norm": 0.0621959926252905,
"learning_rate": 1.806731184488447e-05,
"loss": 0.6511,
"mean_token_accuracy": 0.7956011385580143,
"step": 1265
},
{
"epoch": 2.816870144284129,
"grad_norm": 0.06958322307757032,
"learning_rate": 1.8044332390926224e-05,
"loss": 0.6546,
"mean_token_accuracy": 0.7944194847521491,
"step": 1270
},
{
"epoch": 2.8279689234184238,
"grad_norm": 0.06499832024961039,
"learning_rate": 1.802123192755044e-05,
"loss": 0.6615,
"mean_token_accuracy": 0.7928298407723101,
"step": 1275
},
{
"epoch": 2.839067702552719,
"grad_norm": 0.07307630365245506,
"learning_rate": 1.799801080225316e-05,
"loss": 0.6332,
"mean_token_accuracy": 0.8002019958315876,
"step": 1280
},
{
"epoch": 2.8501664816870145,
"grad_norm": 0.07081063389589305,
"learning_rate": 1.7974669364345518e-05,
"loss": 0.6436,
"mean_token_accuracy": 0.7975661678829284,
"step": 1285
},
{
"epoch": 2.86126526082131,
"grad_norm": 0.06364104487158771,
"learning_rate": 1.795120796494848e-05,
"loss": 0.6417,
"mean_token_accuracy": 0.798216180882557,
"step": 1290
},
{
"epoch": 2.8723640399556047,
"grad_norm": 0.07022725039220128,
"learning_rate": 1.7927626956987577e-05,
"loss": 0.6193,
"mean_token_accuracy": 0.8051732373945985,
"step": 1295
},
{
"epoch": 2.8834628190899,
"grad_norm": 0.07407116375551813,
"learning_rate": 1.7903926695187595e-05,
"loss": 0.6288,
"mean_token_accuracy": 0.8017575332765929,
"step": 1300
},
{
"epoch": 2.8834628190899,
"eval_loss": 0.7337623834609985,
"eval_mean_token_accuracy": 0.7733067253190368,
"eval_runtime": 2.5114,
"eval_samples_per_second": 51.366,
"eval_steps_per_second": 4.38,
"step": 1300
},
{
"epoch": 2.8945615982241955,
"grad_norm": 0.0683657078818875,
"learning_rate": 1.788010753606722e-05,
"loss": 0.6437,
"mean_token_accuracy": 0.7979937366466249,
"step": 1305
},
{
"epoch": 2.9056603773584904,
"grad_norm": 0.06401144030251774,
"learning_rate": 1.78561698379337e-05,
"loss": 0.6525,
"mean_token_accuracy": 0.7952514850245388,
"step": 1310
},
{
"epoch": 2.9167591564927857,
"grad_norm": 0.06939812406872137,
"learning_rate": 1.7832113960877445e-05,
"loss": 0.6326,
"mean_token_accuracy": 0.8011824622832678,
"step": 1315
},
{
"epoch": 2.927857935627081,
"grad_norm": 0.06390511262490556,
"learning_rate": 1.7807940266766595e-05,
"loss": 0.6491,
"mean_token_accuracy": 0.7960628539263179,
"step": 1320
},
{
"epoch": 2.9389567147613764,
"grad_norm": 0.0679922767459088,
"learning_rate": 1.7783649119241603e-05,
"loss": 0.6343,
"mean_token_accuracy": 0.8002581088236139,
"step": 1325
},
{
"epoch": 2.9500554938956713,
"grad_norm": 0.06579108848028142,
"learning_rate": 1.7759240883709745e-05,
"loss": 0.6244,
"mean_token_accuracy": 0.8036198689328673,
"step": 1330
},
{
"epoch": 2.9611542730299667,
"grad_norm": 0.07202792613693204,
"learning_rate": 1.7734715927339642e-05,
"loss": 0.6553,
"mean_token_accuracy": 0.7938320456683176,
"step": 1335
},
{
"epoch": 2.972253052164262,
"grad_norm": 0.0669733486087782,
"learning_rate": 1.7710074619055707e-05,
"loss": 0.6518,
"mean_token_accuracy": 0.7945248169339653,
"step": 1340
},
{
"epoch": 2.983351831298557,
"grad_norm": 0.06844420691058574,
"learning_rate": 1.7685317329532633e-05,
"loss": 0.6561,
"mean_token_accuracy": 0.7932978083640501,
"step": 1345
},
{
"epoch": 2.9944506104328523,
"grad_norm": 0.06735395164638033,
"learning_rate": 1.766044443118978e-05,
"loss": 0.6451,
"mean_token_accuracy": 0.7970097680754145,
"step": 1350
},
{
"epoch": 3.004439511653718,
"grad_norm": 0.07064058723031082,
"learning_rate": 1.7635456298185607e-05,
"loss": 0.6223,
"mean_token_accuracy": 0.8053257851051954,
"step": 1355
},
{
"epoch": 3.0155382907880135,
"grad_norm": 0.07521256769387163,
"learning_rate": 1.761035330641201e-05,
"loss": 0.568,
"mean_token_accuracy": 0.8179713362423978,
"step": 1360
},
{
"epoch": 3.0266370699223084,
"grad_norm": 0.06470087882605292,
"learning_rate": 1.7585135833488692e-05,
"loss": 0.5777,
"mean_token_accuracy": 0.8148876311929595,
"step": 1365
},
{
"epoch": 3.0377358490566038,
"grad_norm": 0.06747297389090014,
"learning_rate": 1.755980425875748e-05,
"loss": 0.595,
"mean_token_accuracy": 0.8092302889910499,
"step": 1370
},
{
"epoch": 3.048834628190899,
"grad_norm": 0.06590547459656546,
"learning_rate": 1.7534358963276606e-05,
"loss": 0.5944,
"mean_token_accuracy": 0.810452184015394,
"step": 1375
},
{
"epoch": 3.059933407325194,
"grad_norm": 0.06771978622523486,
"learning_rate": 1.7508800329814993e-05,
"loss": 0.5857,
"mean_token_accuracy": 0.8124715930556723,
"step": 1380
},
{
"epoch": 3.0710321864594894,
"grad_norm": 0.06878565172783498,
"learning_rate": 1.748312874284647e-05,
"loss": 0.5818,
"mean_token_accuracy": 0.8135985202900582,
"step": 1385
},
{
"epoch": 3.0821309655937847,
"grad_norm": 0.06402876021976343,
"learning_rate": 1.7457344588544018e-05,
"loss": 0.5769,
"mean_token_accuracy": 0.8156846142367662,
"step": 1390
},
{
"epoch": 3.09322974472808,
"grad_norm": 0.0646784982224439,
"learning_rate": 1.7431448254773943e-05,
"loss": 0.5798,
"mean_token_accuracy": 0.8142970258740082,
"step": 1395
},
{
"epoch": 3.104328523862375,
"grad_norm": 0.06597855288274974,
"learning_rate": 1.740544013109005e-05,
"loss": 0.5884,
"mean_token_accuracy": 0.8115392162041799,
"step": 1400
},
{
"epoch": 3.104328523862375,
"eval_loss": 0.7474381923675537,
"eval_mean_token_accuracy": 0.7717004443797748,
"eval_runtime": 2.5041,
"eval_samples_per_second": 51.515,
"eval_steps_per_second": 4.393,
"step": 1400
},
{
"epoch": 3.1154273029966704,
"grad_norm": 0.06425734459230836,
"learning_rate": 1.7379320608727766e-05,
"loss": 0.6018,
"mean_token_accuracy": 0.8074577019491473,
"step": 1405
},
{
"epoch": 3.1265260821309657,
"grad_norm": 0.07122678407183575,
"learning_rate": 1.735309008059829e-05,
"loss": 0.5787,
"mean_token_accuracy": 0.8148202615560042,
"step": 1410
},
{
"epoch": 3.1376248612652606,
"grad_norm": 0.07472861362175476,
"learning_rate": 1.7326748941282638e-05,
"loss": 0.5773,
"mean_token_accuracy": 0.8159486725956425,
"step": 1415
},
{
"epoch": 3.148723640399556,
"grad_norm": 0.06940334372384507,
"learning_rate": 1.7300297587025748e-05,
"loss": 0.5751,
"mean_token_accuracy": 0.8157392354437368,
"step": 1420
},
{
"epoch": 3.1598224195338513,
"grad_norm": 0.06891927963787495,
"learning_rate": 1.7273736415730488e-05,
"loss": 0.604,
"mean_token_accuracy": 0.8073721273984527,
"step": 1425
},
{
"epoch": 3.1709211986681467,
"grad_norm": 0.06448868374468762,
"learning_rate": 1.7247065826951694e-05,
"loss": 0.5968,
"mean_token_accuracy": 0.8095589722183332,
"step": 1430
},
{
"epoch": 3.1820199778024416,
"grad_norm": 0.06475524602325504,
"learning_rate": 1.7220286221890137e-05,
"loss": 0.6004,
"mean_token_accuracy": 0.8077030508730166,
"step": 1435
},
{
"epoch": 3.193118756936737,
"grad_norm": 0.06875614933754029,
"learning_rate": 1.7193398003386514e-05,
"loss": 0.5803,
"mean_token_accuracy": 0.8144615967330615,
"step": 1440
},
{
"epoch": 3.2042175360710323,
"grad_norm": 0.07191758410618443,
"learning_rate": 1.716640157591536e-05,
"loss": 0.5729,
"mean_token_accuracy": 0.8167040368980463,
"step": 1445
},
{
"epoch": 3.2153163152053272,
"grad_norm": 0.07506881347861384,
"learning_rate": 1.7139297345578992e-05,
"loss": 0.5904,
"mean_token_accuracy": 0.8116413687428394,
"step": 1450
},
{
"epoch": 3.2264150943396226,
"grad_norm": 0.06683185958843758,
"learning_rate": 1.711208572010137e-05,
"loss": 0.5879,
"mean_token_accuracy": 0.8117346893203707,
"step": 1455
},
{
"epoch": 3.237513873473918,
"grad_norm": 0.06384815369613044,
"learning_rate": 1.7084767108822e-05,
"loss": 0.5786,
"mean_token_accuracy": 0.8152051484844588,
"step": 1460
},
{
"epoch": 3.2486126526082133,
"grad_norm": 0.07113482761300648,
"learning_rate": 1.7057341922689733e-05,
"loss": 0.5833,
"mean_token_accuracy": 0.8136391983287101,
"step": 1465
},
{
"epoch": 3.259711431742508,
"grad_norm": 0.06875329966774961,
"learning_rate": 1.702981057425662e-05,
"loss": 0.6055,
"mean_token_accuracy": 0.8064521514912204,
"step": 1470
},
{
"epoch": 3.2708102108768036,
"grad_norm": 0.07129076660764946,
"learning_rate": 1.7002173477671685e-05,
"loss": 0.5795,
"mean_token_accuracy": 0.8147054150827504,
"step": 1475
},
{
"epoch": 3.281908990011099,
"grad_norm": 0.073218043150243,
"learning_rate": 1.6974431048674714e-05,
"loss": 0.5838,
"mean_token_accuracy": 0.8133932009098818,
"step": 1480
},
{
"epoch": 3.293007769145394,
"grad_norm": 0.06689255732673634,
"learning_rate": 1.6946583704589973e-05,
"loss": 0.597,
"mean_token_accuracy": 0.8088926481283882,
"step": 1485
},
{
"epoch": 3.304106548279689,
"grad_norm": 0.06374243658625131,
"learning_rate": 1.691863186431996e-05,
"loss": 0.5905,
"mean_token_accuracy": 0.8113178827657815,
"step": 1490
},
{
"epoch": 3.3152053274139845,
"grad_norm": 0.06295813244992231,
"learning_rate": 1.689057594833908e-05,
"loss": 0.6032,
"mean_token_accuracy": 0.8069019652730617,
"step": 1495
},
{
"epoch": 3.32630410654828,
"grad_norm": 0.06579736493219757,
"learning_rate": 1.686241637868734e-05,
"loss": 0.5826,
"mean_token_accuracy": 0.8140389465289355,
"step": 1500
},
{
"epoch": 3.32630410654828,
"eval_loss": 0.7457160353660583,
"eval_mean_token_accuracy": 0.7719505953380342,
"eval_runtime": 2.5024,
"eval_samples_per_second": 51.551,
"eval_steps_per_second": 4.396,
"step": 1500
},
{
"epoch": 3.337402885682575,
"grad_norm": 0.07091243083602151,
"learning_rate": 1.683415357896397e-05,
"loss": 0.5857,
"mean_token_accuracy": 0.8123794127619786,
"step": 1505
},
{
"epoch": 3.34850166481687,
"grad_norm": 0.061307383634218,
"learning_rate": 1.6805787974321107e-05,
"loss": 0.5746,
"mean_token_accuracy": 0.8166298304759364,
"step": 1510
},
{
"epoch": 3.3596004439511655,
"grad_norm": 0.07258491640438769,
"learning_rate": 1.6777319991457325e-05,
"loss": 0.597,
"mean_token_accuracy": 0.8087377760474664,
"step": 1515
},
{
"epoch": 3.3706992230854604,
"grad_norm": 0.07268211516228792,
"learning_rate": 1.674875005861128e-05,
"loss": 0.5722,
"mean_token_accuracy": 0.8174179468241606,
"step": 1520
},
{
"epoch": 3.381798002219756,
"grad_norm": 0.07425718968092598,
"learning_rate": 1.6720078605555227e-05,
"loss": 0.5878,
"mean_token_accuracy": 0.8117901692302955,
"step": 1525
},
{
"epoch": 3.392896781354051,
"grad_norm": 0.06477589781728992,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.5807,
"mean_token_accuracy": 0.8140918874113428,
"step": 1530
},
{
"epoch": 3.4039955604883465,
"grad_norm": 0.06662810742289851,
"learning_rate": 1.6662432865531428e-05,
"loss": 0.585,
"mean_token_accuracy": 0.8134061399415382,
"step": 1535
},
{
"epoch": 3.4150943396226414,
"grad_norm": 0.06459536165537999,
"learning_rate": 1.6633459445717973e-05,
"loss": 0.5978,
"mean_token_accuracy": 0.8095315062509003,
"step": 1540
},
{
"epoch": 3.4261931187569368,
"grad_norm": 0.06639712291679585,
"learning_rate": 1.6604386239990077e-05,
"loss": 0.589,
"mean_token_accuracy": 0.8114936681679807,
"step": 1545
},
{
"epoch": 3.437291897891232,
"grad_norm": 0.06853852815958374,
"learning_rate": 1.657521368569064e-05,
"loss": 0.6109,
"mean_token_accuracy": 0.8049504842638907,
"step": 1550
},
{
"epoch": 3.448390677025527,
"grad_norm": 0.06982303363818694,
"learning_rate": 1.6545942221657042e-05,
"loss": 0.5911,
"mean_token_accuracy": 0.8112879302127055,
"step": 1555
},
{
"epoch": 3.4594894561598224,
"grad_norm": 0.06760713569034003,
"learning_rate": 1.6516572288214555e-05,
"loss": 0.5852,
"mean_token_accuracy": 0.8129200675672401,
"step": 1560
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.07217456505648073,
"learning_rate": 1.6487104327169702e-05,
"loss": 0.5976,
"mean_token_accuracy": 0.8093511038402598,
"step": 1565
},
{
"epoch": 3.481687014428413,
"grad_norm": 0.070935181965617,
"learning_rate": 1.6457538781803625e-05,
"loss": 0.5905,
"mean_token_accuracy": 0.8113604696928229,
"step": 1570
},
{
"epoch": 3.492785793562708,
"grad_norm": 0.06078018778473591,
"learning_rate": 1.6427876096865394e-05,
"loss": 0.5784,
"mean_token_accuracy": 0.8158070281321645,
"step": 1575
},
{
"epoch": 3.5038845726970034,
"grad_norm": 0.06704102646198273,
"learning_rate": 1.639811671856535e-05,
"loss": 0.6082,
"mean_token_accuracy": 0.8057792572972777,
"step": 1580
},
{
"epoch": 3.5149833518312983,
"grad_norm": 0.06759521898261396,
"learning_rate": 1.636826109456836e-05,
"loss": 0.5926,
"mean_token_accuracy": 0.8109007114764084,
"step": 1585
},
{
"epoch": 3.5260821309655936,
"grad_norm": 0.06645230810548192,
"learning_rate": 1.63383096739871e-05,
"loss": 0.582,
"mean_token_accuracy": 0.8141154963928526,
"step": 1590
},
{
"epoch": 3.537180910099889,
"grad_norm": 0.07053272601617808,
"learning_rate": 1.6308262907375314e-05,
"loss": 0.5831,
"mean_token_accuracy": 0.8134722581791063,
"step": 1595
},
{
"epoch": 3.5482796892341844,
"grad_norm": 0.06509128182629838,
"learning_rate": 1.627812124672099e-05,
"loss": 0.5788,
"mean_token_accuracy": 0.8153766770371117,
"step": 1600
},
{
"epoch": 3.5482796892341844,
"eval_loss": 0.7433986067771912,
"eval_mean_token_accuracy": 0.7729161029544169,
"eval_runtime": 2.5059,
"eval_samples_per_second": 51.478,
"eval_steps_per_second": 4.39,
"step": 1600
},
{
"epoch": 3.5593784683684797,
"grad_norm": 0.06880331086378055,
"learning_rate": 1.6247885145439602e-05,
"loss": 0.5922,
"mean_token_accuracy": 0.8106637998812584,
"step": 1605
},
{
"epoch": 3.5704772475027746,
"grad_norm": 0.06368796917129804,
"learning_rate": 1.6217555058367288e-05,
"loss": 0.5927,
"mean_token_accuracy": 0.8103974761600462,
"step": 1610
},
{
"epoch": 3.58157602663707,
"grad_norm": 0.07182991192738422,
"learning_rate": 1.618713144175399e-05,
"loss": 0.6022,
"mean_token_accuracy": 0.8067329885490656,
"step": 1615
},
{
"epoch": 3.592674805771365,
"grad_norm": 0.06863230039491855,
"learning_rate": 1.6156614753256583e-05,
"loss": 0.6132,
"mean_token_accuracy": 0.8041375083354169,
"step": 1620
},
{
"epoch": 3.6037735849056602,
"grad_norm": 0.06734776872058247,
"learning_rate": 1.6126005451932028e-05,
"loss": 0.5986,
"mean_token_accuracy": 0.8086936598937127,
"step": 1625
},
{
"epoch": 3.6148723640399556,
"grad_norm": 0.06691389231458741,
"learning_rate": 1.6095303998230432e-05,
"loss": 0.5903,
"mean_token_accuracy": 0.8117445609970693,
"step": 1630
},
{
"epoch": 3.625971143174251,
"grad_norm": 0.0626209547447361,
"learning_rate": 1.6064510853988137e-05,
"loss": 0.6066,
"mean_token_accuracy": 0.8063172615190451,
"step": 1635
},
{
"epoch": 3.6370699223085463,
"grad_norm": 0.06374216425499483,
"learning_rate": 1.603362648242076e-05,
"loss": 0.5901,
"mean_token_accuracy": 0.8116106105320405,
"step": 1640
},
{
"epoch": 3.648168701442841,
"grad_norm": 0.0648674771391545,
"learning_rate": 1.6002651348116248e-05,
"loss": 0.5944,
"mean_token_accuracy": 0.8101063167096149,
"step": 1645
},
{
"epoch": 3.6592674805771366,
"grad_norm": 0.06499753351633389,
"learning_rate": 1.5971585917027864e-05,
"loss": 0.5915,
"mean_token_accuracy": 0.8101395566228055,
"step": 1650
},
{
"epoch": 3.6703662597114315,
"grad_norm": 0.06831724829782981,
"learning_rate": 1.5940430656467193e-05,
"loss": 0.5992,
"mean_token_accuracy": 0.8083434092585243,
"step": 1655
},
{
"epoch": 3.681465038845727,
"grad_norm": 0.06790573799595492,
"learning_rate": 1.5909186035097114e-05,
"loss": 0.5785,
"mean_token_accuracy": 0.8147185760230325,
"step": 1660
},
{
"epoch": 3.692563817980022,
"grad_norm": 0.06427088903327956,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.5794,
"mean_token_accuracy": 0.8144371602569551,
"step": 1665
},
{
"epoch": 3.7036625971143176,
"grad_norm": 0.06239985639848319,
"learning_rate": 1.5846430591294334e-05,
"loss": 0.5718,
"mean_token_accuracy": 0.8172256989894547,
"step": 1670
},
{
"epoch": 3.714761376248613,
"grad_norm": 0.06568843799685702,
"learning_rate": 1.5814920712880267e-05,
"loss": 0.5925,
"mean_token_accuracy": 0.8102354089861006,
"step": 1675
},
{
"epoch": 3.725860155382908,
"grad_norm": 0.0651721851556958,
"learning_rate": 1.5783323361679865e-05,
"loss": 0.5727,
"mean_token_accuracy": 0.8159369895094459,
"step": 1680
},
{
"epoch": 3.736958934517203,
"grad_norm": 0.0690220199124441,
"learning_rate": 1.575163901300629e-05,
"loss": 0.5906,
"mean_token_accuracy": 0.8114687402753267,
"step": 1685
},
{
"epoch": 3.748057713651498,
"grad_norm": 0.07248322781827103,
"learning_rate": 1.5719868143481385e-05,
"loss": 0.5884,
"mean_token_accuracy": 0.8114747809234526,
"step": 1690
},
{
"epoch": 3.7591564927857934,
"grad_norm": 0.06344203324482209,
"learning_rate": 1.568801123102852e-05,
"loss": 0.6063,
"mean_token_accuracy": 0.8063411988778503,
"step": 1695
},
{
"epoch": 3.770255271920089,
"grad_norm": 0.06819373858730451,
"learning_rate": 1.5656068754865388e-05,
"loss": 0.6054,
"mean_token_accuracy": 0.8064807145119008,
"step": 1700
},
{
"epoch": 3.770255271920089,
"eval_loss": 0.7394784688949585,
"eval_mean_token_accuracy": 0.7736173949838345,
"eval_runtime": 2.5098,
"eval_samples_per_second": 51.398,
"eval_steps_per_second": 4.383,
"step": 1700
},
{
"epoch": 3.781354051054384,
"grad_norm": 0.06974470044867787,
"learning_rate": 1.56240411954968e-05,
"loss": 0.6052,
"mean_token_accuracy": 0.8067864576933559,
"step": 1705
},
{
"epoch": 3.7924528301886795,
"grad_norm": 0.06783881626076696,
"learning_rate": 1.5591929034707468e-05,
"loss": 0.6029,
"mean_token_accuracy": 0.8078461112558084,
"step": 1710
},
{
"epoch": 3.8035516093229744,
"grad_norm": 0.07199172367413192,
"learning_rate": 1.5559732755554734e-05,
"loss": 0.6165,
"mean_token_accuracy": 0.8034011148204714,
"step": 1715
},
{
"epoch": 3.81465038845727,
"grad_norm": 0.0627264927528077,
"learning_rate": 1.552745284236133e-05,
"loss": 0.581,
"mean_token_accuracy": 0.8138149750532312,
"step": 1720
},
{
"epoch": 3.8257491675915647,
"grad_norm": 0.0666022446878339,
"learning_rate": 1.5495089780708062e-05,
"loss": 0.5853,
"mean_token_accuracy": 0.8127597504780851,
"step": 1725
},
{
"epoch": 3.83684794672586,
"grad_norm": 0.06651837216095642,
"learning_rate": 1.546264405742654e-05,
"loss": 0.5901,
"mean_token_accuracy": 0.8110268386574733,
"step": 1730
},
{
"epoch": 3.8479467258601554,
"grad_norm": 0.06551396537364293,
"learning_rate": 1.5430116160591836e-05,
"loss": 0.5879,
"mean_token_accuracy": 0.8116350609023744,
"step": 1735
},
{
"epoch": 3.8590455049944508,
"grad_norm": 0.06347646901521764,
"learning_rate": 1.539750657951513e-05,
"loss": 0.582,
"mean_token_accuracy": 0.8141941271798734,
"step": 1740
},
{
"epoch": 3.870144284128746,
"grad_norm": 0.06581227966023888,
"learning_rate": 1.536481580473638e-05,
"loss": 0.6041,
"mean_token_accuracy": 0.8068086974339712,
"step": 1745
},
{
"epoch": 3.881243063263041,
"grad_norm": 0.06485201916018662,
"learning_rate": 1.5332044328016916e-05,
"loss": 0.5827,
"mean_token_accuracy": 0.8135174002384069,
"step": 1750
},
{
"epoch": 3.8923418423973364,
"grad_norm": 0.07023014750305509,
"learning_rate": 1.529919264233205e-05,
"loss": 0.5781,
"mean_token_accuracy": 0.8148667595200646,
"step": 1755
},
{
"epoch": 3.9034406215316313,
"grad_norm": 0.06788151535669468,
"learning_rate": 1.5266261241863675e-05,
"loss": 0.6046,
"mean_token_accuracy": 0.8066485319988587,
"step": 1760
},
{
"epoch": 3.9145394006659266,
"grad_norm": 0.06540773197792465,
"learning_rate": 1.523325062199281e-05,
"loss": 0.5912,
"mean_token_accuracy": 0.810199155848963,
"step": 1765
},
{
"epoch": 3.925638179800222,
"grad_norm": 0.06385079909541433,
"learning_rate": 1.5200161279292154e-05,
"loss": 0.606,
"mean_token_accuracy": 0.806549376246009,
"step": 1770
},
{
"epoch": 3.9367369589345174,
"grad_norm": 0.06405640899804474,
"learning_rate": 1.5166993711518631e-05,
"loss": 0.6074,
"mean_token_accuracy": 0.805698963377606,
"step": 1775
},
{
"epoch": 3.9478357380688123,
"grad_norm": 0.0656239778913707,
"learning_rate": 1.5133748417605878e-05,
"loss": 0.6042,
"mean_token_accuracy": 0.8072386929802121,
"step": 1780
},
{
"epoch": 3.9589345172031076,
"grad_norm": 0.06537373035601372,
"learning_rate": 1.5100425897656754e-05,
"loss": 0.5914,
"mean_token_accuracy": 0.8114940759127753,
"step": 1785
},
{
"epoch": 3.970033296337403,
"grad_norm": 0.07228070817170734,
"learning_rate": 1.5067026652935823e-05,
"loss": 0.591,
"mean_token_accuracy": 0.8108245554957352,
"step": 1790
},
{
"epoch": 3.981132075471698,
"grad_norm": 0.06997011004537755,
"learning_rate": 1.50335511858618e-05,
"loss": 0.6117,
"mean_token_accuracy": 0.8037908682685642,
"step": 1795
},
{
"epoch": 3.9922308546059933,
"grad_norm": 0.06257709842877483,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.5861,
"mean_token_accuracy": 0.8126290702777871,
"step": 1800
},
{
"epoch": 3.9922308546059933,
"eval_loss": 0.7382122874259949,
"eval_mean_token_accuracy": 0.7738011020672627,
"eval_runtime": 2.5042,
"eval_samples_per_second": 51.514,
"eval_steps_per_second": 4.393,
"step": 1800
},
{
"epoch": 4.002219755826859,
"grad_norm": 0.0730797652438335,
"learning_rate": 1.4966373600054763e-05,
"loss": 0.5779,
"mean_token_accuracy": 0.8164720629136107,
"step": 1805
},
{
"epoch": 4.013318534961154,
"grad_norm": 0.08672989514268875,
"learning_rate": 1.4932672491861855e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.8303200814447649,
"step": 1810
},
{
"epoch": 4.02441731409545,
"grad_norm": 0.08072906911113291,
"learning_rate": 1.4898897182380872e-05,
"loss": 0.5357,
"mean_token_accuracy": 0.825473979487524,
"step": 1815
},
{
"epoch": 4.035516093229745,
"grad_norm": 0.07018425634528604,
"learning_rate": 1.48650481796876e-05,
"loss": 0.5307,
"mean_token_accuracy": 0.8279411218844428,
"step": 1820
},
{
"epoch": 4.04661487236404,
"grad_norm": 0.068007941882165,
"learning_rate": 1.4831125992966386e-05,
"loss": 0.5196,
"mean_token_accuracy": 0.831187564208158,
"step": 1825
},
{
"epoch": 4.057713651498335,
"grad_norm": 0.0689958997732716,
"learning_rate": 1.4797131132502464e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8231890900584835,
"step": 1830
},
{
"epoch": 4.06881243063263,
"grad_norm": 0.07171455936324264,
"learning_rate": 1.476306410967429e-05,
"loss": 0.5345,
"mean_token_accuracy": 0.8263741901210816,
"step": 1835
},
{
"epoch": 4.079911209766926,
"grad_norm": 0.06804956275279826,
"learning_rate": 1.4728925436945838e-05,
"loss": 0.5207,
"mean_token_accuracy": 0.8303962847835349,
"step": 1840
},
{
"epoch": 4.091009988901221,
"grad_norm": 0.07083430472325235,
"learning_rate": 1.469471562785891e-05,
"loss": 0.5158,
"mean_token_accuracy": 0.8320593088222301,
"step": 1845
},
{
"epoch": 4.102108768035516,
"grad_norm": 0.06503541249215597,
"learning_rate": 1.4660435197025391e-05,
"loss": 0.525,
"mean_token_accuracy": 0.8300507388211443,
"step": 1850
},
{
"epoch": 4.113207547169812,
"grad_norm": 0.06971798835469743,
"learning_rate": 1.4626084660119515e-05,
"loss": 0.5295,
"mean_token_accuracy": 0.8280530516037452,
"step": 1855
},
{
"epoch": 4.124306326304106,
"grad_norm": 0.06603173419848209,
"learning_rate": 1.4591664533870118e-05,
"loss": 0.5266,
"mean_token_accuracy": 0.8291048758745919,
"step": 1860
},
{
"epoch": 4.135405105438402,
"grad_norm": 0.06938457936618332,
"learning_rate": 1.4557175336052844e-05,
"loss": 0.536,
"mean_token_accuracy": 0.8256845911513351,
"step": 1865
},
{
"epoch": 4.146503884572697,
"grad_norm": 0.06883650062666277,
"learning_rate": 1.4522617585482377e-05,
"loss": 0.5204,
"mean_token_accuracy": 0.8310854137446115,
"step": 1870
},
{
"epoch": 4.157602663706992,
"grad_norm": 0.07063831352536717,
"learning_rate": 1.4487991802004625e-05,
"loss": 0.5433,
"mean_token_accuracy": 0.8236911002393945,
"step": 1875
},
{
"epoch": 4.168701442841288,
"grad_norm": 0.06752461645830253,
"learning_rate": 1.4453298506488896e-05,
"loss": 0.538,
"mean_token_accuracy": 0.8248770954289781,
"step": 1880
},
{
"epoch": 4.179800221975583,
"grad_norm": 0.06690086776158738,
"learning_rate": 1.441853822082008e-05,
"loss": 0.5423,
"mean_token_accuracy": 0.8239579652997131,
"step": 1885
},
{
"epoch": 4.190899001109878,
"grad_norm": 0.0669777882641211,
"learning_rate": 1.4383711467890776e-05,
"loss": 0.5364,
"mean_token_accuracy": 0.8257569366739599,
"step": 1890
},
{
"epoch": 4.201997780244173,
"grad_norm": 0.0669780263721949,
"learning_rate": 1.4348818771593452e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.8289039644335492,
"step": 1895
},
{
"epoch": 4.213096559378468,
"grad_norm": 0.06926763124785959,
"learning_rate": 1.4313860656812537e-05,
"loss": 0.5534,
"mean_token_accuracy": 0.820804290469477,
"step": 1900
},
{
"epoch": 4.213096559378468,
"eval_loss": 0.765135645866394,
"eval_mean_token_accuracy": 0.7707166053233169,
"eval_runtime": 2.5096,
"eval_samples_per_second": 51.402,
"eval_steps_per_second": 4.383,
"step": 1900
},
{
"epoch": 4.2241953385127635,
"grad_norm": 0.07023846548990398,
"learning_rate": 1.4278837649416543e-05,
"loss": 0.5302,
"mean_token_accuracy": 0.8275532628167696,
"step": 1905
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.06738468385547651,
"learning_rate": 1.4243750276250154e-05,
"loss": 0.5429,
"mean_token_accuracy": 0.8233446568456619,
"step": 1910
},
{
"epoch": 4.246392896781354,
"grad_norm": 0.06482738400996359,
"learning_rate": 1.4208599065126292e-05,
"loss": 0.5189,
"mean_token_accuracy": 0.8316966637591312,
"step": 1915
},
{
"epoch": 4.25749167591565,
"grad_norm": 0.06396002535376463,
"learning_rate": 1.417338454481818e-05,
"loss": 0.5278,
"mean_token_accuracy": 0.8282958367271627,
"step": 1920
},
{
"epoch": 4.268590455049944,
"grad_norm": 0.06689652898165803,
"learning_rate": 1.4138107245051394e-05,
"loss": 0.5427,
"mean_token_accuracy": 0.8237128012084165,
"step": 1925
},
{
"epoch": 4.279689234184239,
"grad_norm": 0.06899145550235058,
"learning_rate": 1.4102767696495885e-05,
"loss": 0.5416,
"mean_token_accuracy": 0.8238680752683756,
"step": 1930
},
{
"epoch": 4.290788013318535,
"grad_norm": 0.06653016304820043,
"learning_rate": 1.4067366430758004e-05,
"loss": 0.5412,
"mean_token_accuracy": 0.824632569309992,
"step": 1935
},
{
"epoch": 4.30188679245283,
"grad_norm": 0.07135924894684956,
"learning_rate": 1.4031903980372503e-05,
"loss": 0.5588,
"mean_token_accuracy": 0.8186822026511017,
"step": 1940
},
{
"epoch": 4.3129855715871255,
"grad_norm": 0.06901506479682018,
"learning_rate": 1.3996380878794524e-05,
"loss": 0.5307,
"mean_token_accuracy": 0.8278875389394228,
"step": 1945
},
{
"epoch": 4.324084350721421,
"grad_norm": 0.06828083288970177,
"learning_rate": 1.396079766039157e-05,
"loss": 0.5365,
"mean_token_accuracy": 0.8254911164406096,
"step": 1950
},
{
"epoch": 4.335183129855716,
"grad_norm": 0.06770085673952302,
"learning_rate": 1.3925154860435473e-05,
"loss": 0.5457,
"mean_token_accuracy": 0.8232314349438562,
"step": 1955
},
{
"epoch": 4.3462819089900115,
"grad_norm": 0.0659269142449662,
"learning_rate": 1.3889453015094338e-05,
"loss": 0.546,
"mean_token_accuracy": 0.823248835683452,
"step": 1960
},
{
"epoch": 4.357380688124306,
"grad_norm": 0.06685910108488621,
"learning_rate": 1.3853692661424485e-05,
"loss": 0.5313,
"mean_token_accuracy": 0.8275431625307241,
"step": 1965
},
{
"epoch": 4.368479467258601,
"grad_norm": 0.0699277758009232,
"learning_rate": 1.3817874337362351e-05,
"loss": 0.5435,
"mean_token_accuracy": 0.8237799596940005,
"step": 1970
},
{
"epoch": 4.379578246392897,
"grad_norm": 0.06463358677008339,
"learning_rate": 1.3781998581716427e-05,
"loss": 0.5307,
"mean_token_accuracy": 0.8274417707074175,
"step": 1975
},
{
"epoch": 4.390677025527192,
"grad_norm": 0.06736039165720337,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.5296,
"mean_token_accuracy": 0.8283872712315985,
"step": 1980
},
{
"epoch": 4.401775804661487,
"grad_norm": 0.06597768478568991,
"learning_rate": 1.3710076935218671e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8272544528484194,
"step": 1985
},
{
"epoch": 4.412874583795783,
"grad_norm": 0.06441872232432956,
"learning_rate": 1.3674032126270982e-05,
"loss": 0.5349,
"mean_token_accuracy": 0.8267573283926994,
"step": 1990
},
{
"epoch": 4.423973362930077,
"grad_norm": 0.06605167933787068,
"learning_rate": 1.3637932049531517e-05,
"loss": 0.5307,
"mean_token_accuracy": 0.8282462134771424,
"step": 1995
},
{
"epoch": 4.435072142064373,
"grad_norm": 0.0665118285232646,
"learning_rate": 1.3601777248047105e-05,
"loss": 0.5449,
"mean_token_accuracy": 0.8234090069569587,
"step": 2000
},
{
"epoch": 4.435072142064373,
"eval_loss": 0.7611222267150879,
"eval_mean_token_accuracy": 0.7715752214807218,
"eval_runtime": 2.5096,
"eval_samples_per_second": 51.403,
"eval_steps_per_second": 4.383,
"step": 2000
},
{
"epoch": 4.446170921198668,
"grad_norm": 0.0633952053347362,
"learning_rate": 1.3565568265687802e-05,
"loss": 0.5441,
"mean_token_accuracy": 0.8236108702838326,
"step": 2005
},
{
"epoch": 4.457269700332963,
"grad_norm": 0.06396077346644374,
"learning_rate": 1.3529305647138689e-05,
"loss": 0.5254,
"mean_token_accuracy": 0.8295401585160572,
"step": 2010
},
{
"epoch": 4.468368479467259,
"grad_norm": 0.0656532841801477,
"learning_rate": 1.3492989937891694e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8266195612283573,
"step": 2015
},
{
"epoch": 4.479467258601554,
"grad_norm": 0.06837988228446373,
"learning_rate": 1.3456621684237367e-05,
"loss": 0.5341,
"mean_token_accuracy": 0.8274741807935527,
"step": 2020
},
{
"epoch": 4.490566037735849,
"grad_norm": 0.0667039963365344,
"learning_rate": 1.342020143325669e-05,
"loss": 0.5466,
"mean_token_accuracy": 0.8229245927993392,
"step": 2025
},
{
"epoch": 4.501664816870145,
"grad_norm": 0.0673181621735528,
"learning_rate": 1.3383729732812814e-05,
"loss": 0.5539,
"mean_token_accuracy": 0.8201497596550249,
"step": 2030
},
{
"epoch": 4.512763596004439,
"grad_norm": 0.06895598701514556,
"learning_rate": 1.3347207131542847e-05,
"loss": 0.5553,
"mean_token_accuracy": 0.8203718930059478,
"step": 2035
},
{
"epoch": 4.523862375138735,
"grad_norm": 0.06570681535103308,
"learning_rate": 1.3310634178849583e-05,
"loss": 0.526,
"mean_token_accuracy": 0.8288809425485892,
"step": 2040
},
{
"epoch": 4.53496115427303,
"grad_norm": 0.0701180297067736,
"learning_rate": 1.3274011424893245e-05,
"loss": 0.5408,
"mean_token_accuracy": 0.8247304133164256,
"step": 2045
},
{
"epoch": 4.546059933407325,
"grad_norm": 0.06791266083149289,
"learning_rate": 1.3237339420583213e-05,
"loss": 0.543,
"mean_token_accuracy": 0.8240936918076546,
"step": 2050
},
{
"epoch": 4.557158712541621,
"grad_norm": 0.06711140536306563,
"learning_rate": 1.3200618717569716e-05,
"loss": 0.5511,
"mean_token_accuracy": 0.8213156804488854,
"step": 2055
},
{
"epoch": 4.568257491675916,
"grad_norm": 0.06793298802768283,
"learning_rate": 1.3163849868235566e-05,
"loss": 0.5363,
"mean_token_accuracy": 0.825724493844356,
"step": 2060
},
{
"epoch": 4.5793562708102105,
"grad_norm": 0.06892866532615381,
"learning_rate": 1.312703342568782e-05,
"loss": 0.534,
"mean_token_accuracy": 0.8264291321992333,
"step": 2065
},
{
"epoch": 4.590455049944506,
"grad_norm": 0.06664285296724572,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.5348,
"mean_token_accuracy": 0.8258681376303734,
"step": 2070
},
{
"epoch": 4.601553829078801,
"grad_norm": 0.0654483232429745,
"learning_rate": 1.3053259976951134e-05,
"loss": 0.5596,
"mean_token_accuracy": 0.8185533510882992,
"step": 2075
},
{
"epoch": 4.6126526082130965,
"grad_norm": 0.07041916180841737,
"learning_rate": 1.3016304080522657e-05,
"loss": 0.5443,
"mean_token_accuracy": 0.8240659035373834,
"step": 2080
},
{
"epoch": 4.623751387347392,
"grad_norm": 0.06702813062704534,
"learning_rate": 1.297930281038482e-05,
"loss": 0.5491,
"mean_token_accuracy": 0.8222027550986537,
"step": 2085
},
{
"epoch": 4.634850166481687,
"grad_norm": 0.06248394102983872,
"learning_rate": 1.2942256723140951e-05,
"loss": 0.5336,
"mean_token_accuracy": 0.8269264702799436,
"step": 2090
},
{
"epoch": 4.645948945615983,
"grad_norm": 0.0643747533736713,
"learning_rate": 1.290516637606855e-05,
"loss": 0.5354,
"mean_token_accuracy": 0.826579599216724,
"step": 2095
},
{
"epoch": 4.657047724750278,
"grad_norm": 0.06547506463619156,
"learning_rate": 1.2868032327110904e-05,
"loss": 0.5261,
"mean_token_accuracy": 0.8294653993974614,
"step": 2100
},
{
"epoch": 4.657047724750278,
"eval_loss": 0.7608128190040588,
"eval_mean_token_accuracy": 0.7713916980015862,
"eval_runtime": 2.5115,
"eval_samples_per_second": 51.364,
"eval_steps_per_second": 4.38,
"step": 2100
},
{
"epoch": 4.668146503884572,
"grad_norm": 0.06586818079405167,
"learning_rate": 1.2830855134868705e-05,
"loss": 0.5334,
"mean_token_accuracy": 0.8269698798269095,
"step": 2105
},
{
"epoch": 4.679245283018868,
"grad_norm": 0.06919497157074231,
"learning_rate": 1.2793635358591645e-05,
"loss": 0.5262,
"mean_token_accuracy": 0.8291748631685987,
"step": 2110
},
{
"epoch": 4.690344062153163,
"grad_norm": 0.0638134236313798,
"learning_rate": 1.2756373558169992e-05,
"loss": 0.5451,
"mean_token_accuracy": 0.823209000051959,
"step": 2115
},
{
"epoch": 4.7014428412874585,
"grad_norm": 0.0673456304701257,
"learning_rate": 1.2719070294126183e-05,
"loss": 0.5425,
"mean_token_accuracy": 0.824060788934791,
"step": 2120
},
{
"epoch": 4.712541620421754,
"grad_norm": 0.0628128288015867,
"learning_rate": 1.2681726127606374e-05,
"loss": 0.5371,
"mean_token_accuracy": 0.8254979634319394,
"step": 2125
},
{
"epoch": 4.723640399556049,
"grad_norm": 0.06717000875377617,
"learning_rate": 1.2644341620372025e-05,
"loss": 0.5437,
"mean_token_accuracy": 0.8236075508618403,
"step": 2130
},
{
"epoch": 4.734739178690344,
"grad_norm": 0.06808573080841533,
"learning_rate": 1.2606917334791415e-05,
"loss": 0.5493,
"mean_token_accuracy": 0.8218429272375017,
"step": 2135
},
{
"epoch": 4.745837957824639,
"grad_norm": 0.06914709442447933,
"learning_rate": 1.2569453833831222e-05,
"loss": 0.5431,
"mean_token_accuracy": 0.8241093534073686,
"step": 2140
},
{
"epoch": 4.756936736958934,
"grad_norm": 0.06334309954829766,
"learning_rate": 1.253195168104802e-05,
"loss": 0.5567,
"mean_token_accuracy": 0.8192453652654745,
"step": 2145
},
{
"epoch": 4.76803551609323,
"grad_norm": 0.06619495996123374,
"learning_rate": 1.2494411440579814e-05,
"loss": 0.5442,
"mean_token_accuracy": 0.8232290532177181,
"step": 2150
},
{
"epoch": 4.779134295227525,
"grad_norm": 0.07297705038597685,
"learning_rate": 1.2456833677137563e-05,
"loss": 0.5451,
"mean_token_accuracy": 0.8230713940589893,
"step": 2155
},
{
"epoch": 4.79023307436182,
"grad_norm": 0.0701375699397559,
"learning_rate": 1.2419218955996677e-05,
"loss": 0.5447,
"mean_token_accuracy": 0.8232547438813633,
"step": 2160
},
{
"epoch": 4.801331853496116,
"grad_norm": 0.06979220683803566,
"learning_rate": 1.238156784298851e-05,
"loss": 0.5414,
"mean_token_accuracy": 0.8242423296299666,
"step": 2165
},
{
"epoch": 4.812430632630411,
"grad_norm": 0.06538541811983827,
"learning_rate": 1.2343880904491846e-05,
"loss": 0.544,
"mean_token_accuracy": 0.8230058901259314,
"step": 2170
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.06323017444968718,
"learning_rate": 1.2306158707424402e-05,
"loss": 0.5289,
"mean_token_accuracy": 0.8285157998211631,
"step": 2175
},
{
"epoch": 4.834628190899001,
"grad_norm": 0.0630498338664204,
"learning_rate": 1.226840181923427e-05,
"loss": 0.5384,
"mean_token_accuracy": 0.8254811299839719,
"step": 2180
},
{
"epoch": 4.845726970033296,
"grad_norm": 0.0677977464932852,
"learning_rate": 1.2230610807891394e-05,
"loss": 0.5428,
"mean_token_accuracy": 0.8239020278215602,
"step": 2185
},
{
"epoch": 4.856825749167592,
"grad_norm": 0.06837957133766576,
"learning_rate": 1.2192786241879033e-05,
"loss": 0.5369,
"mean_token_accuracy": 0.8253965695150642,
"step": 2190
},
{
"epoch": 4.867924528301887,
"grad_norm": 0.06858490759021686,
"learning_rate": 1.2154928690185201e-05,
"loss": 0.5499,
"mean_token_accuracy": 0.8211856043294985,
"step": 2195
},
{
"epoch": 4.879023307436182,
"grad_norm": 0.07073949700021516,
"learning_rate": 1.211703872229411e-05,
"loss": 0.5564,
"mean_token_accuracy": 0.8193733641312193,
"step": 2200
},
{
"epoch": 4.879023307436182,
"eval_loss": 0.7586882710456848,
"eval_mean_token_accuracy": 0.7719546157599555,
"eval_runtime": 2.5069,
"eval_samples_per_second": 51.458,
"eval_steps_per_second": 4.388,
"step": 2200
},
{
"epoch": 4.890122086570477,
"grad_norm": 0.06682518985110458,
"learning_rate": 1.2079116908177592e-05,
"loss": 0.5503,
"mean_token_accuracy": 0.8219519923319263,
"step": 2205
},
{
"epoch": 4.901220865704772,
"grad_norm": 0.06878010685643873,
"learning_rate": 1.2041163818286558e-05,
"loss": 0.5412,
"mean_token_accuracy": 0.8244903210988521,
"step": 2210
},
{
"epoch": 4.912319644839068,
"grad_norm": 0.06859265775491848,
"learning_rate": 1.2003180023542375e-05,
"loss": 0.5482,
"mean_token_accuracy": 0.8223593399026905,
"step": 2215
},
{
"epoch": 4.923418423973363,
"grad_norm": 0.06582995971058746,
"learning_rate": 1.1965166095328302e-05,
"loss": 0.5444,
"mean_token_accuracy": 0.8238218612141541,
"step": 2220
},
{
"epoch": 4.934517203107658,
"grad_norm": 0.06844088791978743,
"learning_rate": 1.1927122605480899e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8245272129167904,
"step": 2225
},
{
"epoch": 4.945615982241954,
"grad_norm": 0.07288470554374063,
"learning_rate": 1.1889050126281405e-05,
"loss": 0.5352,
"mean_token_accuracy": 0.8261003602637317,
"step": 2230
},
{
"epoch": 4.956714761376249,
"grad_norm": 0.06671251103356769,
"learning_rate": 1.1850949230447146e-05,
"loss": 0.5306,
"mean_token_accuracy": 0.8276477690305077,
"step": 2235
},
{
"epoch": 4.9678135405105435,
"grad_norm": 0.06439515034329467,
"learning_rate": 1.1812820491122918e-05,
"loss": 0.5253,
"mean_token_accuracy": 0.8289079489294556,
"step": 2240
},
{
"epoch": 4.978912319644839,
"grad_norm": 0.06384436174886099,
"learning_rate": 1.1774664481872354e-05,
"loss": 0.533,
"mean_token_accuracy": 0.8269564316724558,
"step": 2245
},
{
"epoch": 4.990011098779134,
"grad_norm": 0.064914486533765,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.5337,
"mean_token_accuracy": 0.8273936231132242,
"step": 2250
},
{
"epoch": 5.0,
"grad_norm": 0.06525786189415514,
"learning_rate": 1.1698272949889206e-05,
"loss": 0.5516,
"mean_token_accuracy": 0.8209712929381258,
"step": 2255
},
{
"epoch": 5.011098779134295,
"grad_norm": 0.07325252213881911,
"learning_rate": 1.1660038576300444e-05,
"loss": 0.4878,
"mean_token_accuracy": 0.8401779370635157,
"step": 2260
},
{
"epoch": 5.022197558268591,
"grad_norm": 0.07048539425609023,
"learning_rate": 1.1621779231055677e-05,
"loss": 0.4684,
"mean_token_accuracy": 0.8463123294941672,
"step": 2265
},
{
"epoch": 5.033296337402886,
"grad_norm": 0.07119257946253367,
"learning_rate": 1.158349548968323e-05,
"loss": 0.4852,
"mean_token_accuracy": 0.8406624644516423,
"step": 2270
},
{
"epoch": 5.0443951165371805,
"grad_norm": 0.06758001059113193,
"learning_rate": 1.1545187928078407e-05,
"loss": 0.4851,
"mean_token_accuracy": 0.8410094757922542,
"step": 2275
},
{
"epoch": 5.055493895671476,
"grad_norm": 0.06748477290601286,
"learning_rate": 1.1506857122494832e-05,
"loss": 0.4666,
"mean_token_accuracy": 0.8467701105668647,
"step": 2280
},
{
"epoch": 5.066592674805771,
"grad_norm": 0.06670660224212706,
"learning_rate": 1.146850364953579e-05,
"loss": 0.4898,
"mean_token_accuracy": 0.839417112282745,
"step": 2285
},
{
"epoch": 5.077691453940067,
"grad_norm": 0.07087893469659705,
"learning_rate": 1.1430128086145542e-05,
"loss": 0.4791,
"mean_token_accuracy": 0.8429755949198621,
"step": 2290
},
{
"epoch": 5.088790233074362,
"grad_norm": 0.07478834139797155,
"learning_rate": 1.1391731009600655e-05,
"loss": 0.4892,
"mean_token_accuracy": 0.8396017922119625,
"step": 2295
},
{
"epoch": 5.099889012208657,
"grad_norm": 0.06766266447181689,
"learning_rate": 1.1353312997501313e-05,
"loss": 0.4691,
"mean_token_accuracy": 0.8461667825407023,
"step": 2300
},
{
"epoch": 5.099889012208657,
"eval_loss": 0.7955626845359802,
"eval_mean_token_accuracy": 0.768756609510345,
"eval_runtime": 2.5081,
"eval_samples_per_second": 51.432,
"eval_steps_per_second": 4.386,
"step": 2300
},
{
"epoch": 5.110987791342953,
"grad_norm": 0.06999143705646484,
"learning_rate": 1.1314874627762627e-05,
"loss": 0.4738,
"mean_token_accuracy": 0.8446440579759826,
"step": 2305
},
{
"epoch": 5.122086570477247,
"grad_norm": 0.0718794241969376,
"learning_rate": 1.127641647860595e-05,
"loss": 0.4841,
"mean_token_accuracy": 0.8411325365807377,
"step": 2310
},
{
"epoch": 5.1331853496115425,
"grad_norm": 0.06704940991855901,
"learning_rate": 1.1237939128550167e-05,
"loss": 0.4743,
"mean_token_accuracy": 0.8441928002045515,
"step": 2315
},
{
"epoch": 5.144284128745838,
"grad_norm": 0.06501577030440127,
"learning_rate": 1.1199443156402998e-05,
"loss": 0.4572,
"mean_token_accuracy": 0.8498967529805903,
"step": 2320
},
{
"epoch": 5.155382907880133,
"grad_norm": 0.06928626193910288,
"learning_rate": 1.1160929141252303e-05,
"loss": 0.4645,
"mean_token_accuracy": 0.8472524675665444,
"step": 2325
},
{
"epoch": 5.166481687014429,
"grad_norm": 0.06840759490883057,
"learning_rate": 1.1122397662457352e-05,
"loss": 0.4776,
"mean_token_accuracy": 0.8436041188428363,
"step": 2330
},
{
"epoch": 5.177580466148724,
"grad_norm": 0.07013395237638928,
"learning_rate": 1.1083849299640109e-05,
"loss": 0.4846,
"mean_token_accuracy": 0.8408465709590693,
"step": 2335
},
{
"epoch": 5.188679245283019,
"grad_norm": 0.07333313859905721,
"learning_rate": 1.1045284632676535e-05,
"loss": 0.4823,
"mean_token_accuracy": 0.8415907001724943,
"step": 2340
},
{
"epoch": 5.199778024417314,
"grad_norm": 0.07150128563971167,
"learning_rate": 1.1006704241687846e-05,
"loss": 0.4891,
"mean_token_accuracy": 0.8399067214225251,
"step": 2345
},
{
"epoch": 5.210876803551609,
"grad_norm": 0.06766056951417396,
"learning_rate": 1.0968108707031792e-05,
"loss": 0.4943,
"mean_token_accuracy": 0.8384693039607134,
"step": 2350
},
{
"epoch": 5.2219755826859044,
"grad_norm": 0.07097733536527409,
"learning_rate": 1.0929498609293925e-05,
"loss": 0.485,
"mean_token_accuracy": 0.8407270559505065,
"step": 2355
},
{
"epoch": 5.2330743618202,
"grad_norm": 0.06905091060781053,
"learning_rate": 1.0890874529278866e-05,
"loss": 0.4945,
"mean_token_accuracy": 0.8379440541935631,
"step": 2360
},
{
"epoch": 5.244173140954495,
"grad_norm": 0.06825070345988755,
"learning_rate": 1.0852237048001568e-05,
"loss": 0.4748,
"mean_token_accuracy": 0.8437304701186099,
"step": 2365
},
{
"epoch": 5.2552719200887905,
"grad_norm": 0.06920597616240777,
"learning_rate": 1.0813586746678584e-05,
"loss": 0.4821,
"mean_token_accuracy": 0.8413251705971904,
"step": 2370
},
{
"epoch": 5.266370699223086,
"grad_norm": 0.07014168297576887,
"learning_rate": 1.077492420671931e-05,
"loss": 0.4682,
"mean_token_accuracy": 0.84606039955115,
"step": 2375
},
{
"epoch": 5.27746947835738,
"grad_norm": 0.06768993844433478,
"learning_rate": 1.0736250009717249e-05,
"loss": 0.4732,
"mean_token_accuracy": 0.8445257386106653,
"step": 2380
},
{
"epoch": 5.288568257491676,
"grad_norm": 0.06679687620254271,
"learning_rate": 1.0697564737441254e-05,
"loss": 0.4916,
"mean_token_accuracy": 0.8387902210796765,
"step": 2385
},
{
"epoch": 5.299667036625971,
"grad_norm": 0.07076922691915866,
"learning_rate": 1.0658868971826785e-05,
"loss": 0.4897,
"mean_token_accuracy": 0.8391406358484407,
"step": 2390
},
{
"epoch": 5.310765815760266,
"grad_norm": 0.06748496250773908,
"learning_rate": 1.0620163294967155e-05,
"loss": 0.4701,
"mean_token_accuracy": 0.8459595898789651,
"step": 2395
},
{
"epoch": 5.321864594894562,
"grad_norm": 0.07124845733985237,
"learning_rate": 1.0581448289104759e-05,
"loss": 0.4846,
"mean_token_accuracy": 0.8411545447134726,
"step": 2400
},
{
"epoch": 5.321864594894562,
"eval_loss": 0.7953329682350159,
"eval_mean_token_accuracy": 0.7690350065166047,
"eval_runtime": 2.4994,
"eval_samples_per_second": 51.612,
"eval_steps_per_second": 4.401,
"step": 2400
},
{
"epoch": 5.332963374028857,
"grad_norm": 0.06547207880107556,
"learning_rate": 1.054272453662234e-05,
"loss": 0.4907,
"mean_token_accuracy": 0.839171047344597,
"step": 2405
},
{
"epoch": 5.3440621531631525,
"grad_norm": 0.07181124979794333,
"learning_rate": 1.0503992620034202e-05,
"loss": 0.4948,
"mean_token_accuracy": 0.8375239063233091,
"step": 2410
},
{
"epoch": 5.355160932297447,
"grad_norm": 0.06656783092915997,
"learning_rate": 1.046525312197747e-05,
"loss": 0.4857,
"mean_token_accuracy": 0.8407924111332499,
"step": 2415
},
{
"epoch": 5.366259711431742,
"grad_norm": 0.06794012917390951,
"learning_rate": 1.0426506625203308e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.8426411179712732,
"step": 2420
},
{
"epoch": 5.377358490566038,
"grad_norm": 0.07188722795683923,
"learning_rate": 1.038775371256817e-05,
"loss": 0.4877,
"mean_token_accuracy": 0.8399363439649579,
"step": 2425
},
{
"epoch": 5.388457269700333,
"grad_norm": 0.06594729610705517,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.4779,
"mean_token_accuracy": 0.84393559716034,
"step": 2430
},
{
"epoch": 5.399556048834628,
"grad_norm": 0.06794091373175933,
"learning_rate": 1.0310230971614538e-05,
"loss": 0.4855,
"mean_token_accuracy": 0.8409262512902004,
"step": 2435
},
{
"epoch": 5.410654827968924,
"grad_norm": 0.06808160516295343,
"learning_rate": 1.027146230945643e-05,
"loss": 0.4901,
"mean_token_accuracy": 0.8390278774503441,
"step": 2440
},
{
"epoch": 5.421753607103218,
"grad_norm": 0.06811284626620154,
"learning_rate": 1.0232689563740563e-05,
"loss": 0.4852,
"mean_token_accuracy": 0.8407756696746276,
"step": 2445
},
{
"epoch": 5.4328523862375135,
"grad_norm": 0.06846143690666025,
"learning_rate": 1.0193913317718245e-05,
"loss": 0.4984,
"mean_token_accuracy": 0.8360630667700921,
"step": 2450
},
{
"epoch": 5.443951165371809,
"grad_norm": 0.06588535272540232,
"learning_rate": 1.0155134154693434e-05,
"loss": 0.4714,
"mean_token_accuracy": 0.8450789528948887,
"step": 2455
},
{
"epoch": 5.455049944506104,
"grad_norm": 0.07326355147752138,
"learning_rate": 1.0116352658013973e-05,
"loss": 0.4844,
"mean_token_accuracy": 0.8414956570792598,
"step": 2460
},
{
"epoch": 5.4661487236404,
"grad_norm": 0.06613241730536085,
"learning_rate": 1.0077569411062804e-05,
"loss": 0.477,
"mean_token_accuracy": 0.8429474129481453,
"step": 2465
},
{
"epoch": 5.477247502774695,
"grad_norm": 0.07241995391150818,
"learning_rate": 1.0038784997249205e-05,
"loss": 0.4915,
"mean_token_accuracy": 0.8389679819932333,
"step": 2470
},
{
"epoch": 5.48834628190899,
"grad_norm": 0.07063682530565008,
"learning_rate": 1e-05,
"loss": 0.498,
"mean_token_accuracy": 0.8369453954053674,
"step": 2475
},
{
"epoch": 5.499445061043286,
"grad_norm": 0.07271110291337514,
"learning_rate": 9.961215002750799e-06,
"loss": 0.5087,
"mean_token_accuracy": 0.8337575984868868,
"step": 2480
},
{
"epoch": 5.51054384017758,
"grad_norm": 0.0683258143306416,
"learning_rate": 9.9224305889372e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.8407680099048033,
"step": 2485
},
{
"epoch": 5.5216426193118755,
"grad_norm": 0.06532011380188109,
"learning_rate": 9.883647341986032e-06,
"loss": 0.4658,
"mean_token_accuracy": 0.8468363678085075,
"step": 2490
},
{
"epoch": 5.532741398446171,
"grad_norm": 0.06900876165623743,
"learning_rate": 9.844865845306568e-06,
"loss": 0.4767,
"mean_token_accuracy": 0.8428589991860804,
"step": 2495
},
{
"epoch": 5.543840177580466,
"grad_norm": 0.0740986106472282,
"learning_rate": 9.806086682281759e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.8394784540835362,
"step": 2500
},
{
"epoch": 5.543840177580466,
"eval_loss": 0.7955297231674194,
"eval_mean_token_accuracy": 0.7687454143115147,
"eval_runtime": 2.5046,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 4.392,
"step": 2500
},
{
"epoch": 5.554938956714762,
"grad_norm": 0.06642030466005984,
"learning_rate": 9.767310436259438e-06,
"loss": 0.5008,
"mean_token_accuracy": 0.8361122903383699,
"step": 2505
},
{
"epoch": 5.566037735849057,
"grad_norm": 0.06895013221498675,
"learning_rate": 9.728537690543573e-06,
"loss": 0.505,
"mean_token_accuracy": 0.8343795016151843,
"step": 2510
},
{
"epoch": 5.577136514983351,
"grad_norm": 0.07055240437970804,
"learning_rate": 9.689769028385463e-06,
"loss": 0.4951,
"mean_token_accuracy": 0.837626593905515,
"step": 2515
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.06871556526072226,
"learning_rate": 9.651005032974994e-06,
"loss": 0.4886,
"mean_token_accuracy": 0.8398810214660981,
"step": 2520
},
{
"epoch": 5.599334073251942,
"grad_norm": 0.069682340612611,
"learning_rate": 9.612246287431832e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.8379422985314058,
"step": 2525
},
{
"epoch": 5.6104328523862375,
"grad_norm": 0.06833821495209477,
"learning_rate": 9.573493374796694e-06,
"loss": 0.4808,
"mean_token_accuracy": 0.8421461028149512,
"step": 2530
},
{
"epoch": 5.621531631520533,
"grad_norm": 0.06636646706954583,
"learning_rate": 9.534746878022533e-06,
"loss": 0.4929,
"mean_token_accuracy": 0.8380201477005709,
"step": 2535
},
{
"epoch": 5.632630410654828,
"grad_norm": 0.06698898046345116,
"learning_rate": 9.496007379965801e-06,
"loss": 0.4959,
"mean_token_accuracy": 0.837790504949783,
"step": 2540
},
{
"epoch": 5.6437291897891235,
"grad_norm": 0.06805236937676203,
"learning_rate": 9.457275463377665e-06,
"loss": 0.498,
"mean_token_accuracy": 0.8367196494144162,
"step": 2545
},
{
"epoch": 5.654827968923419,
"grad_norm": 0.06705492549315094,
"learning_rate": 9.418551710895243e-06,
"loss": 0.4864,
"mean_token_accuracy": 0.8409273808238578,
"step": 2550
},
{
"epoch": 5.665926748057713,
"grad_norm": 0.07076074969261581,
"learning_rate": 9.379836705032849e-06,
"loss": 0.4864,
"mean_token_accuracy": 0.8401968178066268,
"step": 2555
},
{
"epoch": 5.677025527192009,
"grad_norm": 0.06659007567798325,
"learning_rate": 9.341131028173215e-06,
"loss": 0.4882,
"mean_token_accuracy": 0.8395267262735404,
"step": 2560
},
{
"epoch": 5.688124306326304,
"grad_norm": 0.06977970775580196,
"learning_rate": 9.302435262558748e-06,
"loss": 0.5151,
"mean_token_accuracy": 0.8313035947119021,
"step": 2565
},
{
"epoch": 5.699223085460599,
"grad_norm": 0.07036821942399733,
"learning_rate": 9.263749990282753e-06,
"loss": 0.4877,
"mean_token_accuracy": 0.8402796498452879,
"step": 2570
},
{
"epoch": 5.710321864594895,
"grad_norm": 0.06710082302155221,
"learning_rate": 9.225075793280693e-06,
"loss": 0.5081,
"mean_token_accuracy": 0.8333654764490961,
"step": 2575
},
{
"epoch": 5.72142064372919,
"grad_norm": 0.07006409892288358,
"learning_rate": 9.18641325332142e-06,
"loss": 0.5041,
"mean_token_accuracy": 0.8346068076076595,
"step": 2580
},
{
"epoch": 5.732519422863485,
"grad_norm": 0.06625593193600583,
"learning_rate": 9.147762951998436e-06,
"loss": 0.4975,
"mean_token_accuracy": 0.8366931486178567,
"step": 2585
},
{
"epoch": 5.74361820199778,
"grad_norm": 0.06676241932683132,
"learning_rate": 9.109125470721141e-06,
"loss": 0.4928,
"mean_token_accuracy": 0.8382421368141244,
"step": 2590
},
{
"epoch": 5.754716981132075,
"grad_norm": 0.0657279064724614,
"learning_rate": 9.07050139070608e-06,
"loss": 0.4939,
"mean_token_accuracy": 0.8383761354079148,
"step": 2595
},
{
"epoch": 5.765815760266371,
"grad_norm": 0.06661508771228203,
"learning_rate": 9.03189129296821e-06,
"loss": 0.4834,
"mean_token_accuracy": 0.8413924101678466,
"step": 2600
},
{
"epoch": 5.765815760266371,
"eval_loss": 0.7937665581703186,
"eval_mean_token_accuracy": 0.7687533450104067,
"eval_runtime": 2.5047,
"eval_samples_per_second": 51.502,
"eval_steps_per_second": 4.392,
"step": 2600
},
{
"epoch": 5.776914539400666,
"grad_norm": 0.06972339544716691,
"learning_rate": 8.993295758312155e-06,
"loss": 0.4871,
"mean_token_accuracy": 0.8403621459374608,
"step": 2605
},
{
"epoch": 5.788013318534961,
"grad_norm": 0.06960404283948679,
"learning_rate": 8.954715367323468e-06,
"loss": 0.4793,
"mean_token_accuracy": 0.8426401722260557,
"step": 2610
},
{
"epoch": 5.799112097669257,
"grad_norm": 0.07020730896508283,
"learning_rate": 8.916150700359896e-06,
"loss": 0.4924,
"mean_token_accuracy": 0.8382286290026955,
"step": 2615
},
{
"epoch": 5.810210876803552,
"grad_norm": 0.06524987595853364,
"learning_rate": 8.877602337542655e-06,
"loss": 0.4835,
"mean_token_accuracy": 0.8416452873609049,
"step": 2620
},
{
"epoch": 5.8213096559378465,
"grad_norm": 0.06907599091717904,
"learning_rate": 8.839070858747697e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.8361772989154721,
"step": 2625
},
{
"epoch": 5.832408435072142,
"grad_norm": 0.06957920316225238,
"learning_rate": 8.800556843597002e-06,
"loss": 0.5004,
"mean_token_accuracy": 0.8360992024044208,
"step": 2630
},
{
"epoch": 5.843507214206437,
"grad_norm": 0.07274736715088309,
"learning_rate": 8.762060871449838e-06,
"loss": 0.5003,
"mean_token_accuracy": 0.8356531106891625,
"step": 2635
},
{
"epoch": 5.854605993340733,
"grad_norm": 0.0749025930532487,
"learning_rate": 8.723583521394054e-06,
"loss": 0.4962,
"mean_token_accuracy": 0.8370848915881123,
"step": 2640
},
{
"epoch": 5.865704772475028,
"grad_norm": 0.07120547137676199,
"learning_rate": 8.685125372237374e-06,
"loss": 0.5145,
"mean_token_accuracy": 0.8315248587134981,
"step": 2645
},
{
"epoch": 5.876803551609323,
"grad_norm": 0.06848724166463453,
"learning_rate": 8.646687002498692e-06,
"loss": 0.4863,
"mean_token_accuracy": 0.8405704017043709,
"step": 2650
},
{
"epoch": 5.887902330743618,
"grad_norm": 0.0704312988672066,
"learning_rate": 8.60826899039935e-06,
"loss": 0.4954,
"mean_token_accuracy": 0.8377861152224325,
"step": 2655
},
{
"epoch": 5.899001109877913,
"grad_norm": 0.06953670462472285,
"learning_rate": 8.569871913854458e-06,
"loss": 0.4823,
"mean_token_accuracy": 0.84187656853583,
"step": 2660
},
{
"epoch": 5.9100998890122085,
"grad_norm": 0.0662886535942646,
"learning_rate": 8.53149635046421e-06,
"loss": 0.4954,
"mean_token_accuracy": 0.8372553782052586,
"step": 2665
},
{
"epoch": 5.921198668146504,
"grad_norm": 0.06315234294777407,
"learning_rate": 8.49314287750517e-06,
"loss": 0.4757,
"mean_token_accuracy": 0.8433855066382703,
"step": 2670
},
{
"epoch": 5.932297447280799,
"grad_norm": 0.06942216407055792,
"learning_rate": 8.454812071921597e-06,
"loss": 0.4877,
"mean_token_accuracy": 0.8396264773743953,
"step": 2675
},
{
"epoch": 5.943396226415095,
"grad_norm": 0.0691582315900679,
"learning_rate": 8.416504510316774e-06,
"loss": 0.484,
"mean_token_accuracy": 0.8411113970529893,
"step": 2680
},
{
"epoch": 5.95449500554939,
"grad_norm": 0.06769801918255437,
"learning_rate": 8.378220768944328e-06,
"loss": 0.4908,
"mean_token_accuracy": 0.8392534567740544,
"step": 2685
},
{
"epoch": 5.965593784683685,
"grad_norm": 0.0659122470311469,
"learning_rate": 8.339961423699563e-06,
"loss": 0.4759,
"mean_token_accuracy": 0.843660765151712,
"step": 2690
},
{
"epoch": 5.97669256381798,
"grad_norm": 0.06888231081448111,
"learning_rate": 8.301727050110794e-06,
"loss": 0.4993,
"mean_token_accuracy": 0.8362201832443829,
"step": 2695
},
{
"epoch": 5.987791342952275,
"grad_norm": 0.06698188670444734,
"learning_rate": 8.263518223330698e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.8371192889929862,
"step": 2700
},
{
"epoch": 5.987791342952275,
"eval_loss": 0.7910561561584473,
"eval_mean_token_accuracy": 0.7692369949186039,
"eval_runtime": 2.5036,
"eval_samples_per_second": 51.527,
"eval_steps_per_second": 4.394,
"step": 2700
},
{
"epoch": 5.9988901220865705,
"grad_norm": 0.06672049278308162,
"learning_rate": 8.22533551812765e-06,
"loss": 0.4758,
"mean_token_accuracy": 0.8434463878747221,
"step": 2705
},
{
"epoch": 6.008879023307436,
"grad_norm": 0.07710909975015609,
"learning_rate": 8.187179508877086e-06,
"loss": 0.446,
"mean_token_accuracy": 0.8533357209356791,
"step": 2710
},
{
"epoch": 6.019977802441732,
"grad_norm": 0.0771418028863615,
"learning_rate": 8.149050769552856e-06,
"loss": 0.4547,
"mean_token_accuracy": 0.8494889642325584,
"step": 2715
},
{
"epoch": 6.031076581576027,
"grad_norm": 0.07240683362402148,
"learning_rate": 8.1109498737186e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.8554890372187032,
"step": 2720
},
{
"epoch": 6.0421753607103215,
"grad_norm": 0.07709077815156115,
"learning_rate": 8.072877394519103e-06,
"loss": 0.4496,
"mean_token_accuracy": 0.8512469701533624,
"step": 2725
},
{
"epoch": 6.053274139844617,
"grad_norm": 0.06941387115968416,
"learning_rate": 8.034833904671698e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8584763547579994,
"step": 2730
},
{
"epoch": 6.064372918978912,
"grad_norm": 0.07578900972120883,
"learning_rate": 7.996819976457626e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.855757515743538,
"step": 2735
},
{
"epoch": 6.0754716981132075,
"grad_norm": 0.06934693627289155,
"learning_rate": 7.958836181713445e-06,
"loss": 0.4367,
"mean_token_accuracy": 0.8556017916118088,
"step": 2740
},
{
"epoch": 6.086570477247503,
"grad_norm": 0.07210016497005568,
"learning_rate": 7.92088309182241e-06,
"loss": 0.429,
"mean_token_accuracy": 0.8580510382712909,
"step": 2745
},
{
"epoch": 6.097669256381798,
"grad_norm": 0.07577767471629779,
"learning_rate": 7.882961277705897e-06,
"loss": 0.4362,
"mean_token_accuracy": 0.8562088212016213,
"step": 2750
},
{
"epoch": 6.108768035516094,
"grad_norm": 0.07179031519158982,
"learning_rate": 7.845071309814802e-06,
"loss": 0.4415,
"mean_token_accuracy": 0.853879791220472,
"step": 2755
},
{
"epoch": 6.119866814650388,
"grad_norm": 0.07041898999257495,
"learning_rate": 7.807213758120965e-06,
"loss": 0.4212,
"mean_token_accuracy": 0.8610253997443506,
"step": 2760
},
{
"epoch": 6.130965593784683,
"grad_norm": 0.06879152875025886,
"learning_rate": 7.769389192108608e-06,
"loss": 0.4319,
"mean_token_accuracy": 0.8573816275209529,
"step": 2765
},
{
"epoch": 6.142064372918979,
"grad_norm": 0.07185324061137859,
"learning_rate": 7.731598180765732e-06,
"loss": 0.4455,
"mean_token_accuracy": 0.8524752572560315,
"step": 2770
},
{
"epoch": 6.153163152053274,
"grad_norm": 0.07033926941192889,
"learning_rate": 7.6938412925756e-06,
"loss": 0.4433,
"mean_token_accuracy": 0.853844047762491,
"step": 2775
},
{
"epoch": 6.1642619311875695,
"grad_norm": 0.06915140786615713,
"learning_rate": 7.656119095508155e-06,
"loss": 0.4388,
"mean_token_accuracy": 0.8552899831553875,
"step": 2780
},
{
"epoch": 6.175360710321865,
"grad_norm": 0.06646950976574176,
"learning_rate": 7.618432157011494e-06,
"loss": 0.4287,
"mean_token_accuracy": 0.8587684335286895,
"step": 2785
},
{
"epoch": 6.18645948945616,
"grad_norm": 0.06954390622394957,
"learning_rate": 7.580781044003324e-06,
"loss": 0.4295,
"mean_token_accuracy": 0.8582828818285677,
"step": 2790
},
{
"epoch": 6.197558268590455,
"grad_norm": 0.07254257043936811,
"learning_rate": 7.543166322862437e-06,
"loss": 0.4333,
"mean_token_accuracy": 0.8573370650863762,
"step": 2795
},
{
"epoch": 6.20865704772475,
"grad_norm": 0.07422334950870361,
"learning_rate": 7.505588559420188e-06,
"loss": 0.4399,
"mean_token_accuracy": 0.8540619797081618,
"step": 2800
},
{
"epoch": 6.20865704772475,
"eval_loss": 0.8441078662872314,
"eval_mean_token_accuracy": 0.7648808112159277,
"eval_runtime": 2.5051,
"eval_samples_per_second": 51.494,
"eval_steps_per_second": 4.391,
"step": 2800
},
{
"epoch": 6.219755826859045,
"grad_norm": 0.07396919663784399,
"learning_rate": 7.468048318951983e-06,
"loss": 0.4401,
"mean_token_accuracy": 0.8546519363654996,
"step": 2805
},
{
"epoch": 6.230854605993341,
"grad_norm": 0.0713243210685145,
"learning_rate": 7.430546166168781e-06,
"loss": 0.448,
"mean_token_accuracy": 0.8519195241739214,
"step": 2810
},
{
"epoch": 6.241953385127636,
"grad_norm": 0.07231708357633806,
"learning_rate": 7.393082665208587e-06,
"loss": 0.4448,
"mean_token_accuracy": 0.8527369511433074,
"step": 2815
},
{
"epoch": 6.2530521642619314,
"grad_norm": 0.06794319280745313,
"learning_rate": 7.355658379627981e-06,
"loss": 0.406,
"mean_token_accuracy": 0.8656115445513048,
"step": 2820
},
{
"epoch": 6.264150943396227,
"grad_norm": 0.07345780378767434,
"learning_rate": 7.3182738723936255e-06,
"loss": 0.4447,
"mean_token_accuracy": 0.8527881707515051,
"step": 2825
},
{
"epoch": 6.275249722530521,
"grad_norm": 0.06997362095232236,
"learning_rate": 7.280929705873818e-06,
"loss": 0.4238,
"mean_token_accuracy": 0.8601466268048498,
"step": 2830
},
{
"epoch": 6.286348501664817,
"grad_norm": 0.07131948856861685,
"learning_rate": 7.243626441830009e-06,
"loss": 0.4241,
"mean_token_accuracy": 0.859541994237266,
"step": 2835
},
{
"epoch": 6.297447280799112,
"grad_norm": 0.06944083932451703,
"learning_rate": 7.206364641408358e-06,
"loss": 0.4391,
"mean_token_accuracy": 0.8550273221116059,
"step": 2840
},
{
"epoch": 6.308546059933407,
"grad_norm": 0.07325056316103613,
"learning_rate": 7.169144865131297e-06,
"loss": 0.4455,
"mean_token_accuracy": 0.8528142965379706,
"step": 2845
},
{
"epoch": 6.319644839067703,
"grad_norm": 0.06840803745235291,
"learning_rate": 7.131967672889101e-06,
"loss": 0.4356,
"mean_token_accuracy": 0.8564159584911453,
"step": 2850
},
{
"epoch": 6.330743618201998,
"grad_norm": 0.07172434553638189,
"learning_rate": 7.094833623931455e-06,
"loss": 0.4474,
"mean_token_accuracy": 0.8522211836045799,
"step": 2855
},
{
"epoch": 6.341842397336293,
"grad_norm": 0.07657883060054264,
"learning_rate": 7.057743276859048e-06,
"loss": 0.4406,
"mean_token_accuracy": 0.8546443812030414,
"step": 2860
},
{
"epoch": 6.352941176470588,
"grad_norm": 0.07406808152539951,
"learning_rate": 7.02069718961518e-06,
"loss": 0.4506,
"mean_token_accuracy": 0.8510959785080605,
"step": 2865
},
{
"epoch": 6.364039955604883,
"grad_norm": 0.06875336053765568,
"learning_rate": 6.983695919477346e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8531396837999757,
"step": 2870
},
{
"epoch": 6.375138734739179,
"grad_norm": 0.07057356421532587,
"learning_rate": 6.94674002304887e-06,
"loss": 0.4501,
"mean_token_accuracy": 0.8510574126425563,
"step": 2875
},
{
"epoch": 6.386237513873474,
"grad_norm": 0.07071385124297858,
"learning_rate": 6.909830056250527e-06,
"loss": 0.4374,
"mean_token_accuracy": 0.8555903780776044,
"step": 2880
},
{
"epoch": 6.397336293007769,
"grad_norm": 0.07085634003180746,
"learning_rate": 6.872966574312182e-06,
"loss": 0.436,
"mean_token_accuracy": 0.8559481393760334,
"step": 2885
},
{
"epoch": 6.408435072142065,
"grad_norm": 0.06979867330158783,
"learning_rate": 6.836150131764434e-06,
"loss": 0.4486,
"mean_token_accuracy": 0.8521716226340693,
"step": 2890
},
{
"epoch": 6.41953385127636,
"grad_norm": 0.06904944422094117,
"learning_rate": 6.799381282430284e-06,
"loss": 0.4489,
"mean_token_accuracy": 0.8513674182168313,
"step": 2895
},
{
"epoch": 6.4306326304106545,
"grad_norm": 0.06827799167873423,
"learning_rate": 6.762660579416791e-06,
"loss": 0.4318,
"mean_token_accuracy": 0.8570870173989833,
"step": 2900
},
{
"epoch": 6.4306326304106545,
"eval_loss": 0.8432488441467285,
"eval_mean_token_accuracy": 0.7648142474990643,
"eval_runtime": 2.5092,
"eval_samples_per_second": 51.411,
"eval_steps_per_second": 4.384,
"step": 2900
},
{
"epoch": 6.44173140954495,
"grad_norm": 0.07038913878573082,
"learning_rate": 6.725988575106757e-06,
"loss": 0.435,
"mean_token_accuracy": 0.8566420371191622,
"step": 2905
},
{
"epoch": 6.452830188679245,
"grad_norm": 0.07021550296983327,
"learning_rate": 6.689365821150421e-06,
"loss": 0.4424,
"mean_token_accuracy": 0.8539050033549904,
"step": 2910
},
{
"epoch": 6.4639289678135405,
"grad_norm": 0.07022203073927909,
"learning_rate": 6.652792868457159e-06,
"loss": 0.4375,
"mean_token_accuracy": 0.8550374690978672,
"step": 2915
},
{
"epoch": 6.475027746947836,
"grad_norm": 0.07214969010434828,
"learning_rate": 6.61627026718719e-06,
"loss": 0.4453,
"mean_token_accuracy": 0.8527398643984709,
"step": 2920
},
{
"epoch": 6.486126526082131,
"grad_norm": 0.0718849792187647,
"learning_rate": 6.579798566743314e-06,
"loss": 0.4316,
"mean_token_accuracy": 0.8571454376520589,
"step": 2925
},
{
"epoch": 6.497225305216427,
"grad_norm": 0.0687959823292499,
"learning_rate": 6.543378315762634e-06,
"loss": 0.4457,
"mean_token_accuracy": 0.8522867903149901,
"step": 2930
},
{
"epoch": 6.508324084350721,
"grad_norm": 0.06939011310417309,
"learning_rate": 6.50701006210831e-06,
"loss": 0.418,
"mean_token_accuracy": 0.8624145286054207,
"step": 2935
},
{
"epoch": 6.519422863485016,
"grad_norm": 0.06919702181469813,
"learning_rate": 6.4706943528613135e-06,
"loss": 0.4405,
"mean_token_accuracy": 0.8544047091826522,
"step": 2940
},
{
"epoch": 6.530521642619312,
"grad_norm": 0.07090082072019381,
"learning_rate": 6.434431734312201e-06,
"loss": 0.451,
"mean_token_accuracy": 0.8512350567008408,
"step": 2945
},
{
"epoch": 6.541620421753607,
"grad_norm": 0.07311782278151045,
"learning_rate": 6.3982227519528986e-06,
"loss": 0.4372,
"mean_token_accuracy": 0.8551011624954604,
"step": 2950
},
{
"epoch": 6.5527192008879025,
"grad_norm": 0.06853793673243276,
"learning_rate": 6.362067950468489e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.8585236146309112,
"step": 2955
},
{
"epoch": 6.563817980022198,
"grad_norm": 0.07293011821392871,
"learning_rate": 6.3259678737290174e-06,
"loss": 0.4346,
"mean_token_accuracy": 0.8561082904244961,
"step": 2960
},
{
"epoch": 6.574916759156492,
"grad_norm": 0.06978086285931945,
"learning_rate": 6.2899230647813315e-06,
"loss": 0.4361,
"mean_token_accuracy": 0.8558985457340178,
"step": 2965
},
{
"epoch": 6.586015538290788,
"grad_norm": 0.07112662316350266,
"learning_rate": 6.25393406584088e-06,
"loss": 0.4413,
"mean_token_accuracy": 0.8543134405793346,
"step": 2970
},
{
"epoch": 6.597114317425083,
"grad_norm": 0.07262725739836273,
"learning_rate": 6.218001418283577e-06,
"loss": 0.4589,
"mean_token_accuracy": 0.8485135772896658,
"step": 2975
},
{
"epoch": 6.608213096559378,
"grad_norm": 0.07293978841730346,
"learning_rate": 6.18212566263765e-06,
"loss": 0.4462,
"mean_token_accuracy": 0.8527901022563297,
"step": 2980
},
{
"epoch": 6.619311875693674,
"grad_norm": 0.06869051011062992,
"learning_rate": 6.146307338575519e-06,
"loss": 0.4365,
"mean_token_accuracy": 0.8560732349752435,
"step": 2985
},
{
"epoch": 6.630410654827969,
"grad_norm": 0.07011481935789973,
"learning_rate": 6.110546984905661e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.8569647960292948,
"step": 2990
},
{
"epoch": 6.6415094339622645,
"grad_norm": 0.06867347376570304,
"learning_rate": 6.074845139564529e-06,
"loss": 0.4482,
"mean_token_accuracy": 0.8520339802204481,
"step": 2995
},
{
"epoch": 6.65260821309656,
"grad_norm": 0.07167935814663243,
"learning_rate": 6.039202339608432e-06,
"loss": 0.434,
"mean_token_accuracy": 0.8564132679138142,
"step": 3000
},
{
"epoch": 6.65260821309656,
"eval_loss": 0.8441133499145508,
"eval_mean_token_accuracy": 0.765247487379611,
"eval_runtime": 2.5087,
"eval_samples_per_second": 51.421,
"eval_steps_per_second": 4.385,
"step": 3000
},
{
"epoch": 6.663706992230854,
"grad_norm": 0.07186071828643208,
"learning_rate": 6.00361912120548e-06,
"loss": 0.4452,
"mean_token_accuracy": 0.8528585061113889,
"step": 3005
},
{
"epoch": 6.67480577136515,
"grad_norm": 0.06855121014435227,
"learning_rate": 5.9680960196274995e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.8572836863361127,
"step": 3010
},
{
"epoch": 6.685904550499445,
"grad_norm": 0.07110585699367894,
"learning_rate": 5.932633569242e-06,
"loss": 0.4384,
"mean_token_accuracy": 0.8553335978565515,
"step": 3015
},
{
"epoch": 6.69700332963374,
"grad_norm": 0.07085757772624833,
"learning_rate": 5.89723230350412e-06,
"loss": 0.4379,
"mean_token_accuracy": 0.8555847653374402,
"step": 3020
},
{
"epoch": 6.708102108768036,
"grad_norm": 0.07063511706597377,
"learning_rate": 5.8618927549486095e-06,
"loss": 0.4468,
"mean_token_accuracy": 0.8523510627335507,
"step": 3025
},
{
"epoch": 6.719200887902331,
"grad_norm": 0.07187844503905214,
"learning_rate": 5.8266154551818225e-06,
"loss": 0.4527,
"mean_token_accuracy": 0.8498687190225727,
"step": 3030
},
{
"epoch": 6.7302996670366255,
"grad_norm": 0.07058997469604424,
"learning_rate": 5.79140093487371e-06,
"loss": 0.4435,
"mean_token_accuracy": 0.8533467542770972,
"step": 3035
},
{
"epoch": 6.741398446170921,
"grad_norm": 0.0717038610864614,
"learning_rate": 5.756249723749847e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8533376800900824,
"step": 3040
},
{
"epoch": 6.752497225305216,
"grad_norm": 0.06801675759895583,
"learning_rate": 5.72116235058346e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.8569636338319049,
"step": 3045
},
{
"epoch": 6.763596004439512,
"grad_norm": 0.06801099401082193,
"learning_rate": 5.686139343187468e-06,
"loss": 0.4346,
"mean_token_accuracy": 0.8563081469335604,
"step": 3050
},
{
"epoch": 6.774694783573807,
"grad_norm": 0.06796075420482334,
"learning_rate": 5.651181228406554e-06,
"loss": 0.4511,
"mean_token_accuracy": 0.8507139912972462,
"step": 3055
},
{
"epoch": 6.785793562708102,
"grad_norm": 0.06858342437191409,
"learning_rate": 5.616288532109225e-06,
"loss": 0.4476,
"mean_token_accuracy": 0.85227963226098,
"step": 3060
},
{
"epoch": 6.796892341842398,
"grad_norm": 0.07033819528015756,
"learning_rate": 5.581461779179924e-06,
"loss": 0.4338,
"mean_token_accuracy": 0.8568131935312552,
"step": 3065
},
{
"epoch": 6.807991120976693,
"grad_norm": 0.0713035107971796,
"learning_rate": 5.5467014935111065e-06,
"loss": 0.4355,
"mean_token_accuracy": 0.8558951804357434,
"step": 3070
},
{
"epoch": 6.8190899001109875,
"grad_norm": 0.07131150385903263,
"learning_rate": 5.512008197995379e-06,
"loss": 0.4564,
"mean_token_accuracy": 0.8497975681079393,
"step": 3075
},
{
"epoch": 6.830188679245283,
"grad_norm": 0.06817277614907659,
"learning_rate": 5.477382414517625e-06,
"loss": 0.4316,
"mean_token_accuracy": 0.8580683047832622,
"step": 3080
},
{
"epoch": 6.841287458379578,
"grad_norm": 0.07057273044950617,
"learning_rate": 5.442824663947157e-06,
"loss": 0.4616,
"mean_token_accuracy": 0.8475735324796743,
"step": 3085
},
{
"epoch": 6.8523862375138735,
"grad_norm": 0.07036554647139312,
"learning_rate": 5.4083354661298816e-06,
"loss": 0.4498,
"mean_token_accuracy": 0.8521374369729602,
"step": 3090
},
{
"epoch": 6.863485016648169,
"grad_norm": 0.07222437693843485,
"learning_rate": 5.373915339880484e-06,
"loss": 0.4292,
"mean_token_accuracy": 0.8582999085532814,
"step": 3095
},
{
"epoch": 6.874583795782464,
"grad_norm": 0.072182918394675,
"learning_rate": 5.339564802974615e-06,
"loss": 0.4484,
"mean_token_accuracy": 0.8518725421130162,
"step": 3100
},
{
"epoch": 6.874583795782464,
"eval_loss": 0.8406283259391785,
"eval_mean_token_accuracy": 0.7652879746380005,
"eval_runtime": 2.5059,
"eval_samples_per_second": 51.479,
"eval_steps_per_second": 4.39,
"step": 3100
},
{
"epoch": 6.885682574916759,
"grad_norm": 0.07096548464795761,
"learning_rate": 5.305284372141095e-06,
"loss": 0.4347,
"mean_token_accuracy": 0.8562329599216175,
"step": 3105
},
{
"epoch": 6.896781354051054,
"grad_norm": 0.06975587960479132,
"learning_rate": 5.271074563054167e-06,
"loss": 0.4571,
"mean_token_accuracy": 0.8489463468135968,
"step": 3110
},
{
"epoch": 6.907880133185349,
"grad_norm": 0.06741935931261574,
"learning_rate": 5.236935890325717e-06,
"loss": 0.4323,
"mean_token_accuracy": 0.8573877517767727,
"step": 3115
},
{
"epoch": 6.918978912319645,
"grad_norm": 0.07380361466603937,
"learning_rate": 5.202868867497542e-06,
"loss": 0.4619,
"mean_token_accuracy": 0.8474778792524758,
"step": 3120
},
{
"epoch": 6.93007769145394,
"grad_norm": 0.07025381857611887,
"learning_rate": 5.168874007033615e-06,
"loss": 0.4528,
"mean_token_accuracy": 0.851322918006027,
"step": 3125
},
{
"epoch": 6.9411764705882355,
"grad_norm": 0.06752810608010477,
"learning_rate": 5.134951820312402e-06,
"loss": 0.4261,
"mean_token_accuracy": 0.8592220693486257,
"step": 3130
},
{
"epoch": 6.952275249722531,
"grad_norm": 0.07028173714769523,
"learning_rate": 5.101102817619132e-06,
"loss": 0.4289,
"mean_token_accuracy": 0.8581184918015656,
"step": 3135
},
{
"epoch": 6.963374028856826,
"grad_norm": 0.07055892240185084,
"learning_rate": 5.067327508138148e-06,
"loss": 0.4405,
"mean_token_accuracy": 0.8548007954650545,
"step": 3140
},
{
"epoch": 6.974472807991121,
"grad_norm": 0.06971017906950491,
"learning_rate": 5.033626399945241e-06,
"loss": 0.4558,
"mean_token_accuracy": 0.8490455064016939,
"step": 3145
},
{
"epoch": 6.985571587125416,
"grad_norm": 0.07286514586275572,
"learning_rate": 5.000000000000003e-06,
"loss": 0.4277,
"mean_token_accuracy": 0.8587458553947747,
"step": 3150
},
{
"epoch": 6.996670366259711,
"grad_norm": 0.07098175478184185,
"learning_rate": 4.9664488141382026e-06,
"loss": 0.4247,
"mean_token_accuracy": 0.8598055136686054,
"step": 3155
},
{
"epoch": 7.006659267480577,
"grad_norm": 0.07629907033137776,
"learning_rate": 4.932973347064177e-06,
"loss": 0.4125,
"mean_token_accuracy": 0.8638759362274172,
"step": 3160
},
{
"epoch": 7.017758046614873,
"grad_norm": 0.08557497683687372,
"learning_rate": 4.899574102343247e-06,
"loss": 0.4068,
"mean_token_accuracy": 0.8653847908917432,
"step": 3165
},
{
"epoch": 7.028856825749168,
"grad_norm": 0.07500777155917483,
"learning_rate": 4.8662515823941255e-06,
"loss": 0.3952,
"mean_token_accuracy": 0.8694597047997366,
"step": 3170
},
{
"epoch": 7.039955604883462,
"grad_norm": 0.07125380371706241,
"learning_rate": 4.8330062884813714e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.8746208020590464,
"step": 3175
},
{
"epoch": 7.051054384017758,
"grad_norm": 0.0725012432074861,
"learning_rate": 4.799838720707847e-06,
"loss": 0.3849,
"mean_token_accuracy": 0.8724296658830342,
"step": 3180
},
{
"epoch": 7.062153163152053,
"grad_norm": 0.07410286674955183,
"learning_rate": 4.766749378007193e-06,
"loss": 0.3871,
"mean_token_accuracy": 0.8719934430139469,
"step": 3185
},
{
"epoch": 7.0732519422863485,
"grad_norm": 0.0711137297349784,
"learning_rate": 4.733738758136327e-06,
"loss": 0.3976,
"mean_token_accuracy": 0.8683940340664813,
"step": 3190
},
{
"epoch": 7.084350721420644,
"grad_norm": 0.07262361207927566,
"learning_rate": 4.700807357667953e-06,
"loss": 0.3978,
"mean_token_accuracy": 0.8685349502275285,
"step": 3195
},
{
"epoch": 7.095449500554939,
"grad_norm": 0.07309131618569278,
"learning_rate": 4.66795567198309e-06,
"loss": 0.4045,
"mean_token_accuracy": 0.8659610376788004,
"step": 3200
},
{
"epoch": 7.095449500554939,
"eval_loss": 0.8952147960662842,
"eval_mean_token_accuracy": 0.7615468810098737,
"eval_runtime": 2.5036,
"eval_samples_per_second": 51.525,
"eval_steps_per_second": 4.394,
"step": 3200
},
{
"epoch": 7.1065482796892345,
"grad_norm": 0.0721920803563012,
"learning_rate": 4.635184195263624e-06,
"loss": 0.3966,
"mean_token_accuracy": 0.8681120501745102,
"step": 3205
},
{
"epoch": 7.117647058823529,
"grad_norm": 0.07285210208532443,
"learning_rate": 4.6024934204848745e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.8689753794301351,
"step": 3210
},
{
"epoch": 7.128745837957824,
"grad_norm": 0.07326422808489647,
"learning_rate": 4.56988383940817e-06,
"loss": 0.4076,
"mean_token_accuracy": 0.8650006041676468,
"step": 3215
},
{
"epoch": 7.13984461709212,
"grad_norm": 0.0725191006821199,
"learning_rate": 4.537355942573464e-06,
"loss": 0.4018,
"mean_token_accuracy": 0.8670280432137408,
"step": 3220
},
{
"epoch": 7.150943396226415,
"grad_norm": 0.07203104950103884,
"learning_rate": 4.504910219291941e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8690148347435971,
"step": 3225
},
{
"epoch": 7.16204217536071,
"grad_norm": 0.07639160145123591,
"learning_rate": 4.472547157638674e-06,
"loss": 0.389,
"mean_token_accuracy": 0.8710246712104658,
"step": 3230
},
{
"epoch": 7.173140954495006,
"grad_norm": 0.07324338020343289,
"learning_rate": 4.4402672444452664e-06,
"loss": 0.3827,
"mean_token_accuracy": 0.8731911923413589,
"step": 3235
},
{
"epoch": 7.184239733629301,
"grad_norm": 0.07087742268240406,
"learning_rate": 4.408070965292534e-06,
"loss": 0.4053,
"mean_token_accuracy": 0.8657608620764454,
"step": 3240
},
{
"epoch": 7.195338512763596,
"grad_norm": 0.07201590538776476,
"learning_rate": 4.375958804503201e-06,
"loss": 0.4157,
"mean_token_accuracy": 0.8624986168900755,
"step": 3245
},
{
"epoch": 7.206437291897891,
"grad_norm": 0.06999596265135467,
"learning_rate": 4.343931245134616e-06,
"loss": 0.402,
"mean_token_accuracy": 0.8665917993204305,
"step": 3250
},
{
"epoch": 7.217536071032186,
"grad_norm": 0.07182851184142412,
"learning_rate": 4.311988768971484e-06,
"loss": 0.4015,
"mean_token_accuracy": 0.8674501943469238,
"step": 3255
},
{
"epoch": 7.228634850166482,
"grad_norm": 0.07362845834328512,
"learning_rate": 4.2801318565186165e-06,
"loss": 0.394,
"mean_token_accuracy": 0.8695416479952799,
"step": 3260
},
{
"epoch": 7.239733629300777,
"grad_norm": 0.0717682552415512,
"learning_rate": 4.2483609869937115e-06,
"loss": 0.4103,
"mean_token_accuracy": 0.8642538533621753,
"step": 3265
},
{
"epoch": 7.250832408435072,
"grad_norm": 0.07272893232968056,
"learning_rate": 4.216676638320135e-06,
"loss": 0.4131,
"mean_token_accuracy": 0.8626869196808453,
"step": 3270
},
{
"epoch": 7.261931187569368,
"grad_norm": 0.07135546730180682,
"learning_rate": 4.185079287119733e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8674417568639401,
"step": 3275
},
{
"epoch": 7.273029966703662,
"grad_norm": 0.0762468607841131,
"learning_rate": 4.15356940870567e-06,
"loss": 0.4035,
"mean_token_accuracy": 0.8656978705707024,
"step": 3280
},
{
"epoch": 7.2841287458379576,
"grad_norm": 0.07186567403941488,
"learning_rate": 4.12214747707527e-06,
"loss": 0.41,
"mean_token_accuracy": 0.8647636654184666,
"step": 3285
},
{
"epoch": 7.295227524972253,
"grad_norm": 0.07355942774719212,
"learning_rate": 4.090813964902889e-06,
"loss": 0.3943,
"mean_token_accuracy": 0.8696648856429228,
"step": 3290
},
{
"epoch": 7.306326304106548,
"grad_norm": 0.07215730561355123,
"learning_rate": 4.059569343532809e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8685829093160791,
"step": 3295
},
{
"epoch": 7.317425083240844,
"grad_norm": 0.07430099594156658,
"learning_rate": 4.028414082972141e-06,
"loss": 0.391,
"mean_token_accuracy": 0.8703857625330743,
"step": 3300
},
{
"epoch": 7.317425083240844,
"eval_loss": 0.8958173394203186,
"eval_mean_token_accuracy": 0.7614542924776546,
"eval_runtime": 2.504,
"eval_samples_per_second": 51.518,
"eval_steps_per_second": 4.393,
"step": 3300
},
{
"epoch": 7.328523862375139,
"grad_norm": 0.0756589764512869,
"learning_rate": 3.997348651883757e-06,
"loss": 0.4024,
"mean_token_accuracy": 0.8669605659453884,
"step": 3305
},
{
"epoch": 7.339622641509434,
"grad_norm": 0.0732762921056893,
"learning_rate": 3.966373517579244e-06,
"loss": 0.4082,
"mean_token_accuracy": 0.8643103527207086,
"step": 3310
},
{
"epoch": 7.350721420643729,
"grad_norm": 0.07257949282418803,
"learning_rate": 3.9354891460118695e-06,
"loss": 0.4041,
"mean_token_accuracy": 0.8662132721963183,
"step": 3315
},
{
"epoch": 7.361820199778024,
"grad_norm": 0.07505384362015047,
"learning_rate": 3.904696001769571e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8687588550722725,
"step": 3320
},
{
"epoch": 7.3729189789123195,
"grad_norm": 0.07238130150060536,
"learning_rate": 3.873994548067972e-06,
"loss": 0.3903,
"mean_token_accuracy": 0.8709913260405348,
"step": 3325
},
{
"epoch": 7.384017758046615,
"grad_norm": 0.07413721682804225,
"learning_rate": 3.8433852467434175e-06,
"loss": 0.3998,
"mean_token_accuracy": 0.8674460460383848,
"step": 3330
},
{
"epoch": 7.39511653718091,
"grad_norm": 0.07312004025850016,
"learning_rate": 3.8128685582460144e-06,
"loss": 0.4236,
"mean_token_accuracy": 0.8600225688180017,
"step": 3335
},
{
"epoch": 7.406215316315206,
"grad_norm": 0.0695191654744446,
"learning_rate": 3.7824449416327123e-06,
"loss": 0.3883,
"mean_token_accuracy": 0.8713507670108156,
"step": 3340
},
{
"epoch": 7.417314095449501,
"grad_norm": 0.07317832388015687,
"learning_rate": 3.7521148545604003e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.8689663506413468,
"step": 3345
},
{
"epoch": 7.428412874583795,
"grad_norm": 0.0714931532503113,
"learning_rate": 3.7218787532790167e-06,
"loss": 0.4077,
"mean_token_accuracy": 0.865370666345699,
"step": 3350
},
{
"epoch": 7.439511653718091,
"grad_norm": 0.07325374247465251,
"learning_rate": 3.6917370926246877e-06,
"loss": 0.3972,
"mean_token_accuracy": 0.8682190031253774,
"step": 3355
},
{
"epoch": 7.450610432852386,
"grad_norm": 0.07512751267328739,
"learning_rate": 3.661690326012897e-06,
"loss": 0.409,
"mean_token_accuracy": 0.8650642432358119,
"step": 3360
},
{
"epoch": 7.4617092119866815,
"grad_norm": 0.07195371705801658,
"learning_rate": 3.631738905431641e-06,
"loss": 0.3962,
"mean_token_accuracy": 0.8694184936368143,
"step": 3365
},
{
"epoch": 7.472807991120977,
"grad_norm": 0.0729446774567736,
"learning_rate": 3.6018832814346516e-06,
"loss": 0.4125,
"mean_token_accuracy": 0.8633514638223495,
"step": 3370
},
{
"epoch": 7.483906770255272,
"grad_norm": 0.07561542233758643,
"learning_rate": 3.5721239031346067e-06,
"loss": 0.4159,
"mean_token_accuracy": 0.8619894212817206,
"step": 3375
},
{
"epoch": 7.4950055493895675,
"grad_norm": 0.07252151252018872,
"learning_rate": 3.542461218196379e-06,
"loss": 0.3939,
"mean_token_accuracy": 0.8697520947244627,
"step": 3380
},
{
"epoch": 7.506104328523862,
"grad_norm": 0.07293598599568907,
"learning_rate": 3.5128956728303e-06,
"loss": 0.4028,
"mean_token_accuracy": 0.8664042150786312,
"step": 3385
},
{
"epoch": 7.517203107658157,
"grad_norm": 0.07266488282445482,
"learning_rate": 3.483427711785449e-06,
"loss": 0.3944,
"mean_token_accuracy": 0.8695094069973557,
"step": 3390
},
{
"epoch": 7.528301886792453,
"grad_norm": 0.07308961587910096,
"learning_rate": 3.454057778342963e-06,
"loss": 0.3937,
"mean_token_accuracy": 0.8693873915221115,
"step": 3395
},
{
"epoch": 7.539400665926748,
"grad_norm": 0.07427508856731285,
"learning_rate": 3.424786314309365e-06,
"loss": 0.3976,
"mean_token_accuracy": 0.8685723175278763,
"step": 3400
},
{
"epoch": 7.539400665926748,
"eval_loss": 0.8988686203956604,
"eval_mean_token_accuracy": 0.7612407115662695,
"eval_runtime": 2.5058,
"eval_samples_per_second": 51.481,
"eval_steps_per_second": 4.39,
"step": 3400
},
{
"epoch": 7.550499445061043,
"grad_norm": 0.07215047426411422,
"learning_rate": 3.3956137600099248e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.8689609143492317,
"step": 3405
},
{
"epoch": 7.561598224195339,
"grad_norm": 0.07388260988647496,
"learning_rate": 3.3665405542820283e-06,
"loss": 0.3992,
"mean_token_accuracy": 0.8683887719751194,
"step": 3410
},
{
"epoch": 7.572697003329633,
"grad_norm": 0.07356747008095715,
"learning_rate": 3.337567134468579e-06,
"loss": 0.4172,
"mean_token_accuracy": 0.8613869632526061,
"step": 3415
},
{
"epoch": 7.583795782463929,
"grad_norm": 0.0720764629548677,
"learning_rate": 3.308693936411421e-06,
"loss": 0.397,
"mean_token_accuracy": 0.8684071063453782,
"step": 3420
},
{
"epoch": 7.594894561598224,
"grad_norm": 0.0708557636265316,
"learning_rate": 3.279921394444776e-06,
"loss": 0.3961,
"mean_token_accuracy": 0.8690219002022339,
"step": 3425
},
{
"epoch": 7.605993340732519,
"grad_norm": 0.07196094239780987,
"learning_rate": 3.2512499413887255e-06,
"loss": 0.4099,
"mean_token_accuracy": 0.8641867909633174,
"step": 3430
},
{
"epoch": 7.617092119866815,
"grad_norm": 0.07170030139792326,
"learning_rate": 3.222680008542678e-06,
"loss": 0.3897,
"mean_token_accuracy": 0.8706013772529104,
"step": 3435
},
{
"epoch": 7.62819089900111,
"grad_norm": 0.0725499574373253,
"learning_rate": 3.1942120256788966e-06,
"loss": 0.4096,
"mean_token_accuracy": 0.8641443370530075,
"step": 3440
},
{
"epoch": 7.639289678135405,
"grad_norm": 0.07550535184165293,
"learning_rate": 3.1658464210360285e-06,
"loss": 0.3934,
"mean_token_accuracy": 0.8700940005126734,
"step": 3445
},
{
"epoch": 7.650388457269701,
"grad_norm": 0.07331647230148707,
"learning_rate": 3.1375836213126653e-06,
"loss": 0.4182,
"mean_token_accuracy": 0.8615029840680271,
"step": 3450
},
{
"epoch": 7.661487236403995,
"grad_norm": 0.07266758714245847,
"learning_rate": 3.10942405166092e-06,
"loss": 0.3995,
"mean_token_accuracy": 0.8675580745382989,
"step": 3455
},
{
"epoch": 7.672586015538291,
"grad_norm": 0.07202168132939574,
"learning_rate": 3.081368135680041e-06,
"loss": 0.41,
"mean_token_accuracy": 0.8646123888948718,
"step": 3460
},
{
"epoch": 7.683684794672586,
"grad_norm": 0.07108514628989722,
"learning_rate": 3.0534162954100264e-06,
"loss": 0.4053,
"mean_token_accuracy": 0.8659444203679675,
"step": 3465
},
{
"epoch": 7.694783573806881,
"grad_norm": 0.0724262143342174,
"learning_rate": 3.0255689513252873e-06,
"loss": 0.4119,
"mean_token_accuracy": 0.8638014320646695,
"step": 3470
},
{
"epoch": 7.705882352941177,
"grad_norm": 0.07001654568154396,
"learning_rate": 2.9978265223283152e-06,
"loss": 0.4008,
"mean_token_accuracy": 0.8676831414391628,
"step": 3475
},
{
"epoch": 7.716981132075472,
"grad_norm": 0.07167793631617325,
"learning_rate": 2.970189425743383e-06,
"loss": 0.3911,
"mean_token_accuracy": 0.8705106921301781,
"step": 3480
},
{
"epoch": 7.7280799112097665,
"grad_norm": 0.07293712480603332,
"learning_rate": 2.94265807731027e-06,
"loss": 0.3986,
"mean_token_accuracy": 0.8681102444656519,
"step": 3485
},
{
"epoch": 7.739178690344062,
"grad_norm": 0.0733576527410462,
"learning_rate": 2.9152328911780027e-06,
"loss": 0.4144,
"mean_token_accuracy": 0.862807001709438,
"step": 3490
},
{
"epoch": 7.750277469478357,
"grad_norm": 0.07182334518627545,
"learning_rate": 2.8879142798986293e-06,
"loss": 0.4062,
"mean_token_accuracy": 0.8654581017683796,
"step": 3495
},
{
"epoch": 7.7613762486126525,
"grad_norm": 0.07149491856929646,
"learning_rate": 2.8607026544210115e-06,
"loss": 0.3852,
"mean_token_accuracy": 0.8726495300506489,
"step": 3500
},
{
"epoch": 7.7613762486126525,
"eval_loss": 0.8958276510238647,
"eval_mean_token_accuracy": 0.7615317905610097,
"eval_runtime": 2.5054,
"eval_samples_per_second": 51.489,
"eval_steps_per_second": 4.391,
"step": 3500
},
{
"epoch": 7.772475027746948,
"grad_norm": 0.07205023198542661,
"learning_rate": 2.8335984240846424e-06,
"loss": 0.4032,
"mean_token_accuracy": 0.8664543028489167,
"step": 3505
},
{
"epoch": 7.783573806881243,
"grad_norm": 0.07001518219013764,
"learning_rate": 2.8066019966134907e-06,
"loss": 0.3978,
"mean_token_accuracy": 0.8678734443119132,
"step": 3510
},
{
"epoch": 7.794672586015539,
"grad_norm": 0.07060004533338061,
"learning_rate": 2.779713778109867e-06,
"loss": 0.4032,
"mean_token_accuracy": 0.8667982857834049,
"step": 3515
},
{
"epoch": 7.805771365149834,
"grad_norm": 0.0692844958885861,
"learning_rate": 2.7529341730483115e-06,
"loss": 0.4127,
"mean_token_accuracy": 0.8629632311173948,
"step": 3520
},
{
"epoch": 7.816870144284128,
"grad_norm": 0.0729157537799244,
"learning_rate": 2.726263584269513e-06,
"loss": 0.4044,
"mean_token_accuracy": 0.86610341023193,
"step": 3525
},
{
"epoch": 7.827968923418424,
"grad_norm": 0.0723306129297517,
"learning_rate": 2.6997024129742544e-06,
"loss": 0.4025,
"mean_token_accuracy": 0.8666444767653877,
"step": 3530
},
{
"epoch": 7.839067702552719,
"grad_norm": 0.07329107603229254,
"learning_rate": 2.6732510587173645e-06,
"loss": 0.4033,
"mean_token_accuracy": 0.8662954720595669,
"step": 3535
},
{
"epoch": 7.8501664816870145,
"grad_norm": 0.07332189134275487,
"learning_rate": 2.6469099194017144e-06,
"loss": 0.3835,
"mean_token_accuracy": 0.8732789802263794,
"step": 3540
},
{
"epoch": 7.86126526082131,
"grad_norm": 0.07209594577795633,
"learning_rate": 2.620679391272236e-06,
"loss": 0.4079,
"mean_token_accuracy": 0.8645226582422317,
"step": 3545
},
{
"epoch": 7.872364039955605,
"grad_norm": 0.07114236658659995,
"learning_rate": 2.594559868909956e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.873175329490459,
"step": 3550
},
{
"epoch": 7.8834628190899,
"grad_norm": 0.07221931580949754,
"learning_rate": 2.5685517452260566e-06,
"loss": 0.3987,
"mean_token_accuracy": 0.8678625176266003,
"step": 3555
},
{
"epoch": 7.894561598224195,
"grad_norm": 0.0714379162963992,
"learning_rate": 2.542655411455982e-06,
"loss": 0.393,
"mean_token_accuracy": 0.8699922866936953,
"step": 3560
},
{
"epoch": 7.90566037735849,
"grad_norm": 0.07121360401963349,
"learning_rate": 2.5168712571535305e-06,
"loss": 0.4037,
"mean_token_accuracy": 0.8662862219762447,
"step": 3565
},
{
"epoch": 7.916759156492786,
"grad_norm": 0.07302661140425189,
"learning_rate": 2.4911996701850083e-06,
"loss": 0.3951,
"mean_token_accuracy": 0.8693212476026023,
"step": 3570
},
{
"epoch": 7.927857935627081,
"grad_norm": 0.07114846005078426,
"learning_rate": 2.4656410367233928e-06,
"loss": 0.4032,
"mean_token_accuracy": 0.8667585155126627,
"step": 3575
},
{
"epoch": 7.938956714761376,
"grad_norm": 0.07063055606195018,
"learning_rate": 2.4401957412425213e-06,
"loss": 0.3954,
"mean_token_accuracy": 0.8692018437360971,
"step": 3580
},
{
"epoch": 7.950055493895672,
"grad_norm": 0.07709872630246001,
"learning_rate": 2.4148641665113116e-06,
"loss": 0.405,
"mean_token_accuracy": 0.866025568545432,
"step": 3585
},
{
"epoch": 7.961154273029967,
"grad_norm": 0.07479654482901768,
"learning_rate": 2.3896466935879957e-06,
"loss": 0.3971,
"mean_token_accuracy": 0.868972963732916,
"step": 3590
},
{
"epoch": 7.972253052164262,
"grad_norm": 0.07459975485093782,
"learning_rate": 2.364543701814398e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8689697104778606,
"step": 3595
},
{
"epoch": 7.983351831298557,
"grad_norm": 0.07491883674276509,
"learning_rate": 2.339555568810221e-06,
"loss": 0.4176,
"mean_token_accuracy": 0.8613320490404834,
"step": 3600
},
{
"epoch": 7.983351831298557,
"eval_loss": 0.8959746956825256,
"eval_mean_token_accuracy": 0.7616840841270472,
"eval_runtime": 2.5049,
"eval_samples_per_second": 51.499,
"eval_steps_per_second": 4.391,
"step": 3600
},
{
"epoch": 7.994450610432852,
"grad_norm": 0.0705701747454165,
"learning_rate": 2.3146826704673696e-06,
"loss": 0.3948,
"mean_token_accuracy": 0.8686890434955494,
"step": 3605
},
{
"epoch": 8.004439511653718,
"grad_norm": 0.07856101400524802,
"learning_rate": 2.2899253809442944e-06,
"loss": 0.4023,
"mean_token_accuracy": 0.8675475603916234,
"step": 3610
},
{
"epoch": 8.015538290788013,
"grad_norm": 0.07096704231396721,
"learning_rate": 2.265284072660362e-06,
"loss": 0.3536,
"mean_token_accuracy": 0.8834662318113198,
"step": 3615
},
{
"epoch": 8.026637069922309,
"grad_norm": 0.07811200288429487,
"learning_rate": 2.2407591162902576e-06,
"loss": 0.3783,
"mean_token_accuracy": 0.8743016220721802,
"step": 3620
},
{
"epoch": 8.037735849056604,
"grad_norm": 0.06847870304456238,
"learning_rate": 2.2163508807584e-06,
"loss": 0.3567,
"mean_token_accuracy": 0.8822303443810826,
"step": 3625
},
{
"epoch": 8.0488346281909,
"grad_norm": 0.07237316029649163,
"learning_rate": 2.192059733233408e-06,
"loss": 0.3642,
"mean_token_accuracy": 0.8798146287904443,
"step": 3630
},
{
"epoch": 8.059933407325195,
"grad_norm": 0.07326537582767348,
"learning_rate": 2.1678860391225588e-06,
"loss": 0.3748,
"mean_token_accuracy": 0.8763103580641246,
"step": 3635
},
{
"epoch": 8.07103218645949,
"grad_norm": 0.0716311973227314,
"learning_rate": 2.1438301620662994e-06,
"loss": 0.381,
"mean_token_accuracy": 0.8740629045020978,
"step": 3640
},
{
"epoch": 8.082130965593784,
"grad_norm": 0.07156015129797201,
"learning_rate": 2.119892463932781e-06,
"loss": 0.3602,
"mean_token_accuracy": 0.8808393768354014,
"step": 3645
},
{
"epoch": 8.09322974472808,
"grad_norm": 0.0729212945492825,
"learning_rate": 2.0960733048124082e-06,
"loss": 0.392,
"mean_token_accuracy": 0.8698077920762731,
"step": 3650
},
{
"epoch": 8.104328523862375,
"grad_norm": 0.07233081181998524,
"learning_rate": 2.072373043012422e-06,
"loss": 0.3788,
"mean_token_accuracy": 0.874932114508718,
"step": 3655
},
{
"epoch": 8.11542730299667,
"grad_norm": 0.07255689066428407,
"learning_rate": 2.048792035051521e-06,
"loss": 0.3864,
"mean_token_accuracy": 0.8725265939389528,
"step": 3660
},
{
"epoch": 8.126526082130965,
"grad_norm": 0.06796721021247674,
"learning_rate": 2.0253306356544843e-06,
"loss": 0.3534,
"mean_token_accuracy": 0.8830378557607703,
"step": 3665
},
{
"epoch": 8.13762486126526,
"grad_norm": 0.07079763475731628,
"learning_rate": 2.001989197746841e-06,
"loss": 0.3621,
"mean_token_accuracy": 0.8800683843848924,
"step": 3670
},
{
"epoch": 8.148723640399556,
"grad_norm": 0.06934827260274774,
"learning_rate": 1.9787680724495617e-06,
"loss": 0.3633,
"mean_token_accuracy": 0.8795690658099342,
"step": 3675
},
{
"epoch": 8.159822419533851,
"grad_norm": 0.07149671546021895,
"learning_rate": 1.9556676090737803e-06,
"loss": 0.376,
"mean_token_accuracy": 0.8752081791023549,
"step": 3680
},
{
"epoch": 8.170921198668147,
"grad_norm": 0.07306552731180865,
"learning_rate": 1.9326881551155307e-06,
"loss": 0.375,
"mean_token_accuracy": 0.8762224073973204,
"step": 3685
},
{
"epoch": 8.182019977802442,
"grad_norm": 0.07245624271929346,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.3628,
"mean_token_accuracy": 0.8800849389151534,
"step": 3690
},
{
"epoch": 8.193118756936737,
"grad_norm": 0.0717221526393695,
"learning_rate": 1.8870936563289598e-06,
"loss": 0.3736,
"mean_token_accuracy": 0.8766231182349811,
"step": 3695
},
{
"epoch": 8.204217536071033,
"grad_norm": 0.07427725350287043,
"learning_rate": 1.8644792973703252e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.8728949638922477,
"step": 3700
},
{
"epoch": 8.204217536071033,
"eval_loss": 0.9425944089889526,
"eval_mean_token_accuracy": 0.7587572912638287,
"eval_runtime": 2.513,
"eval_samples_per_second": 51.333,
"eval_steps_per_second": 4.377,
"step": 3700
},
{
"epoch": 8.215316315205328,
"grad_norm": 0.07078445884735189,
"learning_rate": 1.8419873195582815e-06,
"loss": 0.3667,
"mean_token_accuracy": 0.8785569874719916,
"step": 3705
},
{
"epoch": 8.226415094339623,
"grad_norm": 0.0714911223340155,
"learning_rate": 1.8196180612355252e-06,
"loss": 0.3633,
"mean_token_accuracy": 0.8797422666007302,
"step": 3710
},
{
"epoch": 8.237513873473917,
"grad_norm": 0.07175228593929144,
"learning_rate": 1.79737185889871e-06,
"loss": 0.3659,
"mean_token_accuracy": 0.8794436477784764,
"step": 3715
},
{
"epoch": 8.248612652608212,
"grad_norm": 0.07124623704997705,
"learning_rate": 1.7752490471933769e-06,
"loss": 0.3801,
"mean_token_accuracy": 0.874297933476225,
"step": 3720
},
{
"epoch": 8.259711431742508,
"grad_norm": 0.07340107106058204,
"learning_rate": 1.7532499589089324e-06,
"loss": 0.364,
"mean_token_accuracy": 0.8794662910609281,
"step": 3725
},
{
"epoch": 8.270810210876803,
"grad_norm": 0.06990896856577071,
"learning_rate": 1.7313749249736266e-06,
"loss": 0.3792,
"mean_token_accuracy": 0.8745357270848668,
"step": 3730
},
{
"epoch": 8.281908990011098,
"grad_norm": 0.0713247064610054,
"learning_rate": 1.709624274449584e-06,
"loss": 0.3711,
"mean_token_accuracy": 0.8773109810868555,
"step": 3735
},
{
"epoch": 8.293007769145394,
"grad_norm": 0.07760277266498634,
"learning_rate": 1.6879983345278528e-06,
"loss": 0.3785,
"mean_token_accuracy": 0.8745675120863103,
"step": 3740
},
{
"epoch": 8.30410654827969,
"grad_norm": 0.07306991998908845,
"learning_rate": 1.6664974305234848e-06,
"loss": 0.3768,
"mean_token_accuracy": 0.8757597027758562,
"step": 3745
},
{
"epoch": 8.315205327413985,
"grad_norm": 0.07485329674104144,
"learning_rate": 1.6451218858706374e-06,
"loss": 0.3694,
"mean_token_accuracy": 0.8776273219353218,
"step": 3750
},
{
"epoch": 8.32630410654828,
"grad_norm": 0.07294537461380335,
"learning_rate": 1.6238720221177062e-06,
"loss": 0.3841,
"mean_token_accuracy": 0.873170659995959,
"step": 3755
},
{
"epoch": 8.337402885682575,
"grad_norm": 0.07530630160471269,
"learning_rate": 1.6027481589225024e-06,
"loss": 0.3685,
"mean_token_accuracy": 0.8781178863003885,
"step": 3760
},
{
"epoch": 8.34850166481687,
"grad_norm": 0.07717414395687935,
"learning_rate": 1.5817506140474248e-06,
"loss": 0.3869,
"mean_token_accuracy": 0.871778824694412,
"step": 3765
},
{
"epoch": 8.359600443951166,
"grad_norm": 0.072807969887607,
"learning_rate": 1.560879703354693e-06,
"loss": 0.3652,
"mean_token_accuracy": 0.8794815169500076,
"step": 3770
},
{
"epoch": 8.370699223085461,
"grad_norm": 0.07519402001309598,
"learning_rate": 1.5401357408015893e-06,
"loss": 0.3932,
"mean_token_accuracy": 0.8699543946559007,
"step": 3775
},
{
"epoch": 8.381798002219757,
"grad_norm": 0.07230015905299639,
"learning_rate": 1.5195190384357405e-06,
"loss": 0.3799,
"mean_token_accuracy": 0.874275331240719,
"step": 3780
},
{
"epoch": 8.39289678135405,
"grad_norm": 0.07234384636097141,
"learning_rate": 1.4990299063904202e-06,
"loss": 0.3608,
"mean_token_accuracy": 0.8807774420928469,
"step": 3785
},
{
"epoch": 8.403995560488346,
"grad_norm": 0.07523663390266445,
"learning_rate": 1.4786686528798878e-06,
"loss": 0.38,
"mean_token_accuracy": 0.8738187855023073,
"step": 3790
},
{
"epoch": 8.415094339622641,
"grad_norm": 0.07140911792883375,
"learning_rate": 1.4584355841947452e-06,
"loss": 0.3698,
"mean_token_accuracy": 0.8777678442461072,
"step": 3795
},
{
"epoch": 8.426193118756936,
"grad_norm": 0.07260987883106731,
"learning_rate": 1.4383310046973365e-06,
"loss": 0.3862,
"mean_token_accuracy": 0.8725054701888313,
"step": 3800
},
{
"epoch": 8.426193118756936,
"eval_loss": 0.9420565962791443,
"eval_mean_token_accuracy": 0.7586397433634224,
"eval_runtime": 2.5095,
"eval_samples_per_second": 51.405,
"eval_steps_per_second": 4.383,
"step": 3800
},
{
"epoch": 8.437291897891232,
"grad_norm": 0.07262092270909268,
"learning_rate": 1.4183552168171655e-06,
"loss": 0.3825,
"mean_token_accuracy": 0.8734111086359343,
"step": 3805
},
{
"epoch": 8.448390677025527,
"grad_norm": 0.07299203132420759,
"learning_rate": 1.3985085210463479e-06,
"loss": 0.3721,
"mean_token_accuracy": 0.8770900982908305,
"step": 3810
},
{
"epoch": 8.459489456159822,
"grad_norm": 0.06995744118261267,
"learning_rate": 1.3787912159350903e-06,
"loss": 0.3711,
"mean_token_accuracy": 0.8772603519720799,
"step": 3815
},
{
"epoch": 8.470588235294118,
"grad_norm": 0.07314470110332796,
"learning_rate": 1.3592035980871954e-06,
"loss": 0.3664,
"mean_token_accuracy": 0.8791009918268561,
"step": 3820
},
{
"epoch": 8.481687014428413,
"grad_norm": 0.07223228849997157,
"learning_rate": 1.339745962155613e-06,
"loss": 0.3509,
"mean_token_accuracy": 0.8839586558883811,
"step": 3825
},
{
"epoch": 8.492785793562708,
"grad_norm": 0.069403514773057,
"learning_rate": 1.3204186008379926e-06,
"loss": 0.3706,
"mean_token_accuracy": 0.8774691970570467,
"step": 3830
},
{
"epoch": 8.503884572697004,
"grad_norm": 0.07233134301819387,
"learning_rate": 1.3012218048722858e-06,
"loss": 0.3716,
"mean_token_accuracy": 0.8770607873907654,
"step": 3835
},
{
"epoch": 8.5149833518313,
"grad_norm": 0.07609229799981544,
"learning_rate": 1.282155863032377e-06,
"loss": 0.3806,
"mean_token_accuracy": 0.8743671333344023,
"step": 3840
},
{
"epoch": 8.526082130965595,
"grad_norm": 0.07475884278608393,
"learning_rate": 1.2632210621237329e-06,
"loss": 0.3785,
"mean_token_accuracy": 0.8742028469705245,
"step": 3845
},
{
"epoch": 8.537180910099888,
"grad_norm": 0.07033717924572155,
"learning_rate": 1.2444176869790925e-06,
"loss": 0.3695,
"mean_token_accuracy": 0.8776175277300983,
"step": 3850
},
{
"epoch": 8.548279689234183,
"grad_norm": 0.07239360928547944,
"learning_rate": 1.2257460204541793e-06,
"loss": 0.3874,
"mean_token_accuracy": 0.8717868736796097,
"step": 3855
},
{
"epoch": 8.559378468368479,
"grad_norm": 0.07174016953600017,
"learning_rate": 1.207206343423456e-06,
"loss": 0.384,
"mean_token_accuracy": 0.8728361979192087,
"step": 3860
},
{
"epoch": 8.570477247502774,
"grad_norm": 0.07129464090092877,
"learning_rate": 1.188798934775881e-06,
"loss": 0.3655,
"mean_token_accuracy": 0.879167498582583,
"step": 3865
},
{
"epoch": 8.58157602663707,
"grad_norm": 0.07073053510147226,
"learning_rate": 1.1705240714107301e-06,
"loss": 0.3825,
"mean_token_accuracy": 0.8730660416822736,
"step": 3870
},
{
"epoch": 8.592674805771365,
"grad_norm": 0.07250361995881191,
"learning_rate": 1.152382028233422e-06,
"loss": 0.3829,
"mean_token_accuracy": 0.8733264398074165,
"step": 3875
},
{
"epoch": 8.60377358490566,
"grad_norm": 0.071804460982185,
"learning_rate": 1.1343730781513896e-06,
"loss": 0.3887,
"mean_token_accuracy": 0.8710065827782969,
"step": 3880
},
{
"epoch": 8.614872364039956,
"grad_norm": 0.07472671484346735,
"learning_rate": 1.1164974920699611e-06,
"loss": 0.3782,
"mean_token_accuracy": 0.8746941554136797,
"step": 3885
},
{
"epoch": 8.625971143174251,
"grad_norm": 0.07311763408887324,
"learning_rate": 1.0987555388883042e-06,
"loss": 0.3773,
"mean_token_accuracy": 0.8753191156569005,
"step": 3890
},
{
"epoch": 8.637069922308546,
"grad_norm": 0.0718876150770474,
"learning_rate": 1.0811474854953708e-06,
"loss": 0.3744,
"mean_token_accuracy": 0.8761765009973879,
"step": 3895
},
{
"epoch": 8.648168701442842,
"grad_norm": 0.0723882381184508,
"learning_rate": 1.0636735967658785e-06,
"loss": 0.3602,
"mean_token_accuracy": 0.8808158833620467,
"step": 3900
},
{
"epoch": 8.648168701442842,
"eval_loss": 0.9418078064918518,
"eval_mean_token_accuracy": 0.7587906580527934,
"eval_runtime": 2.5083,
"eval_samples_per_second": 51.429,
"eval_steps_per_second": 4.385,
"step": 3900
},
{
"epoch": 8.659267480577137,
"grad_norm": 0.0704714071422713,
"learning_rate": 1.0463341355563318e-06,
"loss": 0.3748,
"mean_token_accuracy": 0.8759210630008264,
"step": 3905
},
{
"epoch": 8.670366259711432,
"grad_norm": 0.0723912490097062,
"learning_rate": 1.0291293627010678e-06,
"loss": 0.3748,
"mean_token_accuracy": 0.8756913392135388,
"step": 3910
},
{
"epoch": 8.681465038845728,
"grad_norm": 0.07363633481469684,
"learning_rate": 1.012059537008332e-06,
"loss": 0.3729,
"mean_token_accuracy": 0.8765285287087201,
"step": 3915
},
{
"epoch": 8.692563817980023,
"grad_norm": 0.07295013091365997,
"learning_rate": 9.95124915256378e-07,
"loss": 0.3735,
"mean_token_accuracy": 0.8767417582495198,
"step": 3920
},
{
"epoch": 8.703662597114317,
"grad_norm": 0.07168690933716926,
"learning_rate": 9.783257521896228e-07,
"loss": 0.3649,
"mean_token_accuracy": 0.8790620620466859,
"step": 3925
},
{
"epoch": 8.714761376248612,
"grad_norm": 0.07210660521074372,
"learning_rate": 9.616623005147952e-07,
"loss": 0.392,
"mean_token_accuracy": 0.8699732825725949,
"step": 3930
},
{
"epoch": 8.725860155382907,
"grad_norm": 0.07265661223655243,
"learning_rate": 9.451348108971425e-07,
"loss": 0.3923,
"mean_token_accuracy": 0.8697471915224153,
"step": 3935
},
{
"epoch": 8.736958934517203,
"grad_norm": 0.07360746515128512,
"learning_rate": 9.287435319566618e-07,
"loss": 0.3894,
"mean_token_accuracy": 0.871409389950205,
"step": 3940
},
{
"epoch": 8.748057713651498,
"grad_norm": 0.07031615395240165,
"learning_rate": 9.124887102643576e-07,
"loss": 0.3652,
"mean_token_accuracy": 0.8781901781375167,
"step": 3945
},
{
"epoch": 8.759156492785793,
"grad_norm": 0.07463303052325941,
"learning_rate": 8.963705903385344e-07,
"loss": 0.3797,
"mean_token_accuracy": 0.8744689688569356,
"step": 3950
},
{
"epoch": 8.770255271920089,
"grad_norm": 0.07076393949841339,
"learning_rate": 8.803894146411118e-07,
"loss": 0.3843,
"mean_token_accuracy": 0.8729947995264962,
"step": 3955
},
{
"epoch": 8.781354051054384,
"grad_norm": 0.0763471702517904,
"learning_rate": 8.645454235739903e-07,
"loss": 0.354,
"mean_token_accuracy": 0.8833278525225567,
"step": 3960
},
{
"epoch": 8.79245283018868,
"grad_norm": 0.07382769897849192,
"learning_rate": 8.488388554754223e-07,
"loss": 0.3972,
"mean_token_accuracy": 0.8683902843534799,
"step": 3965
},
{
"epoch": 8.803551609322975,
"grad_norm": 0.07341021239100626,
"learning_rate": 8.332699466164307e-07,
"loss": 0.3724,
"mean_token_accuracy": 0.8769335032073728,
"step": 3970
},
{
"epoch": 8.81465038845727,
"grad_norm": 0.07084973072946958,
"learning_rate": 8.178389311972612e-07,
"loss": 0.3739,
"mean_token_accuracy": 0.8766542744127456,
"step": 3975
},
{
"epoch": 8.825749167591566,
"grad_norm": 0.0748288898335603,
"learning_rate": 8.025460413438457e-07,
"loss": 0.3753,
"mean_token_accuracy": 0.876294181154862,
"step": 3980
},
{
"epoch": 8.836847946725861,
"grad_norm": 0.07189696727103217,
"learning_rate": 7.873915071043248e-07,
"loss": 0.3832,
"mean_token_accuracy": 0.8733729707285276,
"step": 3985
},
{
"epoch": 8.847946725860155,
"grad_norm": 0.07359083102051164,
"learning_rate": 7.723755564455771e-07,
"loss": 0.3698,
"mean_token_accuracy": 0.8773191235829898,
"step": 3990
},
{
"epoch": 8.85904550499445,
"grad_norm": 0.07127519773440091,
"learning_rate": 7.574984152497988e-07,
"loss": 0.3702,
"mean_token_accuracy": 0.8777848908337162,
"step": 3995
},
{
"epoch": 8.870144284128745,
"grad_norm": 0.07660045354772042,
"learning_rate": 7.427603073110967e-07,
"loss": 0.3687,
"mean_token_accuracy": 0.8779125901278991,
"step": 4000
},
{
"epoch": 8.870144284128745,
"eval_loss": 0.9427998661994934,
"eval_mean_token_accuracy": 0.7590286003467458,
"eval_runtime": 2.5139,
"eval_samples_per_second": 51.316,
"eval_steps_per_second": 4.376,
"step": 4000
},
{
"epoch": 8.88124306326304,
"grad_norm": 0.07081398095943368,
"learning_rate": 7.281614543321269e-07,
"loss": 0.3722,
"mean_token_accuracy": 0.8771854375405537,
"step": 4005
},
{
"epoch": 8.892341842397336,
"grad_norm": 0.07087618891479863,
"learning_rate": 7.13702075920758e-07,
"loss": 0.3682,
"mean_token_accuracy": 0.8779584769791494,
"step": 4010
},
{
"epoch": 8.903440621531631,
"grad_norm": 0.0717888807240299,
"learning_rate": 6.99382389586769e-07,
"loss": 0.3717,
"mean_token_accuracy": 0.8771199704414011,
"step": 4015
},
{
"epoch": 8.914539400665927,
"grad_norm": 0.07423804991496352,
"learning_rate": 6.852026107385756e-07,
"loss": 0.378,
"mean_token_accuracy": 0.8751318821537319,
"step": 4020
},
{
"epoch": 8.925638179800222,
"grad_norm": 0.07693603325948843,
"learning_rate": 6.711629526799946e-07,
"loss": 0.37,
"mean_token_accuracy": 0.877517280827164,
"step": 4025
},
{
"epoch": 8.936736958934517,
"grad_norm": 0.0711413133732257,
"learning_rate": 6.572636266070265e-07,
"loss": 0.3747,
"mean_token_accuracy": 0.8761638578721318,
"step": 4030
},
{
"epoch": 8.947835738068813,
"grad_norm": 0.07342329249702224,
"learning_rate": 6.435048416046863e-07,
"loss": 0.3743,
"mean_token_accuracy": 0.8757618608773102,
"step": 4035
},
{
"epoch": 8.958934517203108,
"grad_norm": 0.07445700415992246,
"learning_rate": 6.298868046438533e-07,
"loss": 0.3817,
"mean_token_accuracy": 0.8735115567669782,
"step": 4040
},
{
"epoch": 8.970033296337403,
"grad_norm": 0.07393330425473023,
"learning_rate": 6.164097205781616e-07,
"loss": 0.3776,
"mean_token_accuracy": 0.8752504462904687,
"step": 4045
},
{
"epoch": 8.981132075471699,
"grad_norm": 0.07192510715946791,
"learning_rate": 6.030737921409169e-07,
"loss": 0.3742,
"mean_token_accuracy": 0.8764858951863983,
"step": 4050
},
{
"epoch": 8.992230854605994,
"grad_norm": 0.07378669584037137,
"learning_rate": 5.898792199420445e-07,
"loss": 0.3814,
"mean_token_accuracy": 0.8738473459261946,
"step": 4055
},
{
"epoch": 9.002219755826859,
"grad_norm": 0.11321983827426473,
"learning_rate": 5.768262024650773e-07,
"loss": 0.3831,
"mean_token_accuracy": 0.8741038114540909,
"step": 4060
},
{
"epoch": 9.013318534961154,
"grad_norm": 0.06955479623267266,
"learning_rate": 5.63914936064165e-07,
"loss": 0.3751,
"mean_token_accuracy": 0.876432982412313,
"step": 4065
},
{
"epoch": 9.02441731409545,
"grad_norm": 0.07002975386890714,
"learning_rate": 5.511456149611194e-07,
"loss": 0.368,
"mean_token_accuracy": 0.8784786449055064,
"step": 4070
},
{
"epoch": 9.035516093229745,
"grad_norm": 0.07426559705057928,
"learning_rate": 5.385184312424973e-07,
"loss": 0.3493,
"mean_token_accuracy": 0.8844570839583301,
"step": 4075
},
{
"epoch": 9.04661487236404,
"grad_norm": 0.07328109369066403,
"learning_rate": 5.26033574856708e-07,
"loss": 0.3516,
"mean_token_accuracy": 0.8844177074732104,
"step": 4080
},
{
"epoch": 9.057713651498336,
"grad_norm": 0.07007036388790919,
"learning_rate": 5.136912336111599e-07,
"loss": 0.365,
"mean_token_accuracy": 0.8789461900681786,
"step": 4085
},
{
"epoch": 9.068812430632631,
"grad_norm": 0.06902457824440712,
"learning_rate": 5.014915931694253e-07,
"loss": 0.3557,
"mean_token_accuracy": 0.8822892955015667,
"step": 4090
},
{
"epoch": 9.079911209766925,
"grad_norm": 0.07023725613632394,
"learning_rate": 4.894348370484648e-07,
"loss": 0.3668,
"mean_token_accuracy": 0.8788434835781784,
"step": 4095
},
{
"epoch": 9.09100998890122,
"grad_norm": 0.06843363288085369,
"learning_rate": 4.775211466158469e-07,
"loss": 0.3572,
"mean_token_accuracy": 0.8819544770759362,
"step": 4100
},
{
"epoch": 9.09100998890122,
"eval_loss": 0.9615139961242676,
"eval_mean_token_accuracy": 0.7577368010204709,
"eval_runtime": 2.5049,
"eval_samples_per_second": 51.499,
"eval_steps_per_second": 4.391,
"step": 4100
},
{
"epoch": 9.102108768035515,
"grad_norm": 0.07099833327070085,
"learning_rate": 4.6575070108703433e-07,
"loss": 0.3569,
"mean_token_accuracy": 0.8820014538736366,
"step": 4105
},
{
"epoch": 9.11320754716981,
"grad_norm": 0.07199267346067889,
"learning_rate": 4.5412367752268094e-07,
"loss": 0.3605,
"mean_token_accuracy": 0.8809590109620006,
"step": 4110
},
{
"epoch": 9.124306326304106,
"grad_norm": 0.07090954996655408,
"learning_rate": 4.4264025082597084e-07,
"loss": 0.368,
"mean_token_accuracy": 0.8785202173573573,
"step": 4115
},
{
"epoch": 9.135405105438402,
"grad_norm": 0.07176215249132896,
"learning_rate": 4.313005937399861e-07,
"loss": 0.3532,
"mean_token_accuracy": 0.8833904371817389,
"step": 4120
},
{
"epoch": 9.146503884572697,
"grad_norm": 0.07295746294927272,
"learning_rate": 4.2010487684511105e-07,
"loss": 0.3608,
"mean_token_accuracy": 0.8812105612763963,
"step": 4125
},
{
"epoch": 9.157602663706992,
"grad_norm": 0.07080889187808312,
"learning_rate": 4.0905326855646186e-07,
"loss": 0.3479,
"mean_token_accuracy": 0.8851608707330438,
"step": 4130
},
{
"epoch": 9.168701442841288,
"grad_norm": 0.07276807335314621,
"learning_rate": 3.981459351213568e-07,
"loss": 0.3777,
"mean_token_accuracy": 0.875140887125394,
"step": 4135
},
{
"epoch": 9.179800221975583,
"grad_norm": 0.07224696890024185,
"learning_rate": 3.8738304061681107e-07,
"loss": 0.3708,
"mean_token_accuracy": 0.8769411028535119,
"step": 4140
},
{
"epoch": 9.190899001109878,
"grad_norm": 0.07110354177350459,
"learning_rate": 3.7676474694707697e-07,
"loss": 0.3623,
"mean_token_accuracy": 0.8798909253279312,
"step": 4145
},
{
"epoch": 9.201997780244174,
"grad_norm": 0.07248736421213088,
"learning_rate": 3.662912138411967e-07,
"loss": 0.3626,
"mean_token_accuracy": 0.8803352453391436,
"step": 4150
},
{
"epoch": 9.213096559378469,
"grad_norm": 0.07027410169677895,
"learning_rate": 3.55962598850611e-07,
"loss": 0.3562,
"mean_token_accuracy": 0.8819391769764504,
"step": 4155
},
{
"epoch": 9.224195338512764,
"grad_norm": 0.07317371460252099,
"learning_rate": 3.457790573467812e-07,
"loss": 0.3556,
"mean_token_accuracy": 0.8822487731331142,
"step": 4160
},
{
"epoch": 9.235294117647058,
"grad_norm": 0.07476687816670186,
"learning_rate": 3.357407425188541e-07,
"loss": 0.3668,
"mean_token_accuracy": 0.8782191688237292,
"step": 4165
},
{
"epoch": 9.246392896781353,
"grad_norm": 0.07118871381012193,
"learning_rate": 3.2584780537136206e-07,
"loss": 0.365,
"mean_token_accuracy": 0.8795225550635518,
"step": 4170
},
{
"epoch": 9.257491675915649,
"grad_norm": 0.07290781684699486,
"learning_rate": 3.161003947219421e-07,
"loss": 0.3565,
"mean_token_accuracy": 0.8823807741440511,
"step": 4175
},
{
"epoch": 9.268590455049944,
"grad_norm": 0.07402119674237229,
"learning_rate": 3.06498657199108e-07,
"loss": 0.3588,
"mean_token_accuracy": 0.881556779460829,
"step": 4180
},
{
"epoch": 9.27968923418424,
"grad_norm": 0.07281681233758444,
"learning_rate": 2.970427372400353e-07,
"loss": 0.3707,
"mean_token_accuracy": 0.8775587538856486,
"step": 4185
},
{
"epoch": 9.290788013318535,
"grad_norm": 0.07176258029954591,
"learning_rate": 2.877327770883964e-07,
"loss": 0.3556,
"mean_token_accuracy": 0.8826148656597541,
"step": 4190
},
{
"epoch": 9.30188679245283,
"grad_norm": 0.07116410644119966,
"learning_rate": 2.7856891679221565e-07,
"loss": 0.353,
"mean_token_accuracy": 0.8834335910667945,
"step": 4195
},
{
"epoch": 9.312985571587125,
"grad_norm": 0.07142192428482035,
"learning_rate": 2.6955129420176193e-07,
"loss": 0.353,
"mean_token_accuracy": 0.8831818252941022,
"step": 4200
},
{
"epoch": 9.312985571587125,
"eval_loss": 0.9637966752052307,
"eval_mean_token_accuracy": 0.7577037486164723,
"eval_runtime": 2.5041,
"eval_samples_per_second": 51.515,
"eval_steps_per_second": 4.393,
"step": 4200
},
{
"epoch": 9.32408435072142,
"grad_norm": 0.06866435467347672,
"learning_rate": 2.606800449674796e-07,
"loss": 0.355,
"mean_token_accuracy": 0.8828792506438695,
"step": 4205
},
{
"epoch": 9.335183129855716,
"grad_norm": 0.07266126009595562,
"learning_rate": 2.51955302537944e-07,
"loss": 0.3682,
"mean_token_accuracy": 0.8779147885127332,
"step": 4210
},
{
"epoch": 9.346281908990012,
"grad_norm": 0.07054897167424838,
"learning_rate": 2.433771981578581e-07,
"loss": 0.368,
"mean_token_accuracy": 0.8782042065862056,
"step": 4215
},
{
"epoch": 9.357380688124307,
"grad_norm": 0.07285199787679796,
"learning_rate": 2.349458608660704e-07,
"loss": 0.367,
"mean_token_accuracy": 0.878290004869226,
"step": 4220
},
{
"epoch": 9.368479467258602,
"grad_norm": 0.07059019804792459,
"learning_rate": 2.2666141749364434e-07,
"loss": 0.3639,
"mean_token_accuracy": 0.8798083702482842,
"step": 4225
},
{
"epoch": 9.379578246392898,
"grad_norm": 0.07188436340923393,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.3651,
"mean_token_accuracy": 0.8792633871506889,
"step": 4230
},
{
"epoch": 9.390677025527191,
"grad_norm": 0.07006294910861763,
"learning_rate": 2.1053370878075685e-07,
"loss": 0.3717,
"mean_token_accuracy": 0.8777365849907172,
"step": 4235
},
{
"epoch": 9.401775804661487,
"grad_norm": 0.07329004738303971,
"learning_rate": 2.0269068604646058e-07,
"loss": 0.3783,
"mean_token_accuracy": 0.8749171892288341,
"step": 4240
},
{
"epoch": 9.412874583795782,
"grad_norm": 0.06897691821001979,
"learning_rate": 1.9499504244020694e-07,
"loss": 0.3657,
"mean_token_accuracy": 0.8788341299355134,
"step": 4245
},
{
"epoch": 9.423973362930077,
"grad_norm": 0.07254613553732685,
"learning_rate": 1.874468937261531e-07,
"loss": 0.3595,
"mean_token_accuracy": 0.8813002625000752,
"step": 4250
},
{
"epoch": 9.435072142064373,
"grad_norm": 0.06889561787998928,
"learning_rate": 1.8004635344971656e-07,
"loss": 0.3661,
"mean_token_accuracy": 0.8790216325364879,
"step": 4255
},
{
"epoch": 9.446170921198668,
"grad_norm": 0.0730126004749205,
"learning_rate": 1.7279353293586765e-07,
"loss": 0.3679,
"mean_token_accuracy": 0.878868187083347,
"step": 4260
},
{
"epoch": 9.457269700332963,
"grad_norm": 0.072922212674908,
"learning_rate": 1.6568854128745537e-07,
"loss": 0.3733,
"mean_token_accuracy": 0.8763594368603579,
"step": 4265
},
{
"epoch": 9.468368479467259,
"grad_norm": 0.07046125285873713,
"learning_rate": 1.5873148538356752e-07,
"loss": 0.3657,
"mean_token_accuracy": 0.8794734514599737,
"step": 4270
},
{
"epoch": 9.479467258601554,
"grad_norm": 0.07257942568751115,
"learning_rate": 1.519224698779198e-07,
"loss": 0.3667,
"mean_token_accuracy": 0.8792038821818163,
"step": 4275
},
{
"epoch": 9.49056603773585,
"grad_norm": 0.07002558706899382,
"learning_rate": 1.4526159719728595e-07,
"loss": 0.3502,
"mean_token_accuracy": 0.8844099641821337,
"step": 4280
},
{
"epoch": 9.501664816870145,
"grad_norm": 0.07126622603987796,
"learning_rate": 1.3874896753995005e-07,
"loss": 0.3664,
"mean_token_accuracy": 0.8787627277974227,
"step": 4285
},
{
"epoch": 9.51276359600444,
"grad_norm": 0.0709690326726039,
"learning_rate": 1.323846788742078e-07,
"loss": 0.3685,
"mean_token_accuracy": 0.8786089736859786,
"step": 4290
},
{
"epoch": 9.523862375138735,
"grad_norm": 0.07128432924368934,
"learning_rate": 1.261688269368877e-07,
"loss": 0.3598,
"mean_token_accuracy": 0.8812220314344532,
"step": 4295
},
{
"epoch": 9.53496115427303,
"grad_norm": 0.06985707014893448,
"learning_rate": 1.201015052319099e-07,
"loss": 0.3584,
"mean_token_accuracy": 0.8816790571082956,
"step": 4300
},
{
"epoch": 9.53496115427303,
"eval_loss": 0.9642364978790283,
"eval_mean_token_accuracy": 0.757674836302265,
"eval_runtime": 2.5035,
"eval_samples_per_second": 51.528,
"eval_steps_per_second": 4.394,
"step": 4300
},
{
"epoch": 9.546059933407324,
"grad_norm": 0.07087098387168629,
"learning_rate": 1.1418280502888401e-07,
"loss": 0.3655,
"mean_token_accuracy": 0.8790343017131386,
"step": 4305
},
{
"epoch": 9.55715871254162,
"grad_norm": 0.07337993821243646,
"learning_rate": 1.084128153617292e-07,
"loss": 0.3737,
"mean_token_accuracy": 0.8764109415276697,
"step": 4310
},
{
"epoch": 9.568257491675915,
"grad_norm": 0.07046222496074872,
"learning_rate": 1.0279162302734624e-07,
"loss": 0.3512,
"mean_token_accuracy": 0.8840578313854823,
"step": 4315
},
{
"epoch": 9.57935627081021,
"grad_norm": 0.07091688704810993,
"learning_rate": 9.731931258429638e-08,
"loss": 0.3665,
"mean_token_accuracy": 0.878253043199692,
"step": 4320
},
{
"epoch": 9.590455049944506,
"grad_norm": 0.0711174537499222,
"learning_rate": 9.199596635154684e-08,
"loss": 0.3496,
"mean_token_accuracy": 0.8851259983663382,
"step": 4325
},
{
"epoch": 9.601553829078801,
"grad_norm": 0.07024701155741234,
"learning_rate": 8.682166440721729e-08,
"loss": 0.3553,
"mean_token_accuracy": 0.8824724160982246,
"step": 4330
},
{
"epoch": 9.612652608213097,
"grad_norm": 0.07121538410184143,
"learning_rate": 8.179648458738309e-08,
"loss": 0.3728,
"mean_token_accuracy": 0.8766866038661515,
"step": 4335
},
{
"epoch": 9.623751387347392,
"grad_norm": 0.07201171778030711,
"learning_rate": 7.692050248490291e-08,
"loss": 0.378,
"mean_token_accuracy": 0.8749676272479154,
"step": 4340
},
{
"epoch": 9.634850166481687,
"grad_norm": 0.07230796757518289,
"learning_rate": 7.219379144828287e-08,
"loss": 0.3513,
"mean_token_accuracy": 0.8840763261643472,
"step": 4345
},
{
"epoch": 9.645948945615983,
"grad_norm": 0.07273499475598692,
"learning_rate": 6.761642258056977e-08,
"loss": 0.362,
"mean_token_accuracy": 0.8804394627971558,
"step": 4350
},
{
"epoch": 9.657047724750278,
"grad_norm": 0.0719345384206557,
"learning_rate": 6.318846473828522e-08,
"loss": 0.3723,
"mean_token_accuracy": 0.8766675696523365,
"step": 4355
},
{
"epoch": 9.668146503884573,
"grad_norm": 0.07291104365357333,
"learning_rate": 5.890998453038643e-08,
"loss": 0.3535,
"mean_token_accuracy": 0.8831918369682384,
"step": 4360
},
{
"epoch": 9.679245283018869,
"grad_norm": 0.0710777472196142,
"learning_rate": 5.4781046317267103e-08,
"loss": 0.3594,
"mean_token_accuracy": 0.8811254334090007,
"step": 4365
},
{
"epoch": 9.690344062153162,
"grad_norm": 0.06910781062183935,
"learning_rate": 5.080171220978813e-08,
"loss": 0.3724,
"mean_token_accuracy": 0.877210756344294,
"step": 4370
},
{
"epoch": 9.701442841287458,
"grad_norm": 0.0703723425128348,
"learning_rate": 4.6972042068341714e-08,
"loss": 0.3597,
"mean_token_accuracy": 0.8807625164330958,
"step": 4375
},
{
"epoch": 9.712541620421753,
"grad_norm": 0.07312653321865854,
"learning_rate": 4.329209350195651e-08,
"loss": 0.3604,
"mean_token_accuracy": 0.881274793861834,
"step": 4380
},
{
"epoch": 9.723640399556048,
"grad_norm": 0.07219178723430214,
"learning_rate": 3.976192186742167e-08,
"loss": 0.3785,
"mean_token_accuracy": 0.8746446415354814,
"step": 4385
},
{
"epoch": 9.734739178690344,
"grad_norm": 0.06955222309447355,
"learning_rate": 3.6381580268463056e-08,
"loss": 0.3462,
"mean_token_accuracy": 0.8857586073047825,
"step": 4390
},
{
"epoch": 9.745837957824639,
"grad_norm": 0.07234203443294888,
"learning_rate": 3.315111955493944e-08,
"loss": 0.3575,
"mean_token_accuracy": 0.8819603747530635,
"step": 4395
},
{
"epoch": 9.756936736958934,
"grad_norm": 0.07240545514975298,
"learning_rate": 3.0070588322079765e-08,
"loss": 0.3679,
"mean_token_accuracy": 0.878275429419619,
"step": 4400
},
{
"epoch": 9.756936736958934,
"eval_loss": 0.9646347165107727,
"eval_mean_token_accuracy": 0.7575733471521328,
"eval_runtime": 2.5005,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 4.399,
"step": 4400
},
{
"epoch": 9.76803551609323,
"grad_norm": 0.07064125611324511,
"learning_rate": 2.7140032909749315e-08,
"loss": 0.3593,
"mean_token_accuracy": 0.8812070170201969,
"step": 4405
},
{
"epoch": 9.779134295227525,
"grad_norm": 0.07058249125627476,
"learning_rate": 2.4359497401758026e-08,
"loss": 0.3645,
"mean_token_accuracy": 0.8799202650266269,
"step": 4410
},
{
"epoch": 9.79023307436182,
"grad_norm": 0.07207301212981171,
"learning_rate": 2.1729023625189916e-08,
"loss": 0.3585,
"mean_token_accuracy": 0.8818855771506551,
"step": 4415
},
{
"epoch": 9.801331853496116,
"grad_norm": 0.07281322729220804,
"learning_rate": 1.924865114978025e-08,
"loss": 0.3645,
"mean_token_accuracy": 0.8795988855911571,
"step": 4420
},
{
"epoch": 9.812430632630411,
"grad_norm": 0.07133068454580137,
"learning_rate": 1.6918417287318245e-08,
"loss": 0.3659,
"mean_token_accuracy": 0.879352471475125,
"step": 4425
},
{
"epoch": 9.823529411764707,
"grad_norm": 0.07106856199855119,
"learning_rate": 1.4738357091084177e-08,
"loss": 0.358,
"mean_token_accuracy": 0.8820182276270513,
"step": 4430
},
{
"epoch": 9.834628190899002,
"grad_norm": 0.06905146093383048,
"learning_rate": 1.2708503355323143e-08,
"loss": 0.3545,
"mean_token_accuracy": 0.8827259452975497,
"step": 4435
},
{
"epoch": 9.845726970033297,
"grad_norm": 0.06976933586547189,
"learning_rate": 1.0828886614754342e-08,
"loss": 0.3538,
"mean_token_accuracy": 0.8825828210361928,
"step": 4440
},
{
"epoch": 9.85682574916759,
"grad_norm": 0.07034227993170239,
"learning_rate": 9.099535144108107e-09,
"loss": 0.3553,
"mean_token_accuracy": 0.8825321351350404,
"step": 4445
},
{
"epoch": 9.867924528301886,
"grad_norm": 0.07172489561097325,
"learning_rate": 7.520474957699586e-09,
"loss": 0.3619,
"mean_token_accuracy": 0.8805824854864788,
"step": 4450
},
{
"epoch": 9.879023307436182,
"grad_norm": 0.07089188813728774,
"learning_rate": 6.091729809042379e-09,
"loss": 0.3645,
"mean_token_accuracy": 0.8790294765903954,
"step": 4455
},
{
"epoch": 9.890122086570477,
"grad_norm": 0.07130667742928198,
"learning_rate": 4.813321190488829e-09,
"loss": 0.3806,
"mean_token_accuracy": 0.8740528261639089,
"step": 4460
},
{
"epoch": 9.901220865704772,
"grad_norm": 0.06980589275633439,
"learning_rate": 3.6852683329058336e-09,
"loss": 0.3561,
"mean_token_accuracy": 0.8826797263283062,
"step": 4465
},
{
"epoch": 9.912319644839068,
"grad_norm": 0.0725782887050822,
"learning_rate": 2.7075882053828605e-09,
"loss": 0.3606,
"mean_token_accuracy": 0.8807255274320044,
"step": 4470
},
{
"epoch": 9.923418423973363,
"grad_norm": 0.06911521073255959,
"learning_rate": 1.8802955149865854e-09,
"loss": 0.3554,
"mean_token_accuracy": 0.8826477537420596,
"step": 4475
},
{
"epoch": 9.934517203107658,
"grad_norm": 0.07297882233653312,
"learning_rate": 1.203402706525525e-09,
"loss": 0.3746,
"mean_token_accuracy": 0.8760996963828843,
"step": 4480
},
{
"epoch": 9.945615982241954,
"grad_norm": 0.07059910387480436,
"learning_rate": 6.769199623779532e-10,
"loss": 0.3555,
"mean_token_accuracy": 0.882608104296593,
"step": 4485
},
{
"epoch": 9.956714761376249,
"grad_norm": 0.07197783726350207,
"learning_rate": 3.008552023242572e-10,
"loss": 0.3649,
"mean_token_accuracy": 0.8792620243758646,
"step": 4490
},
{
"epoch": 9.967813540510544,
"grad_norm": 0.07151420803500014,
"learning_rate": 7.521408343924564e-11,
"loss": 0.3752,
"mean_token_accuracy": 0.8761361486307025,
"step": 4495
},
{
"epoch": 9.97891231964484,
"grad_norm": 0.07835980656994077,
"learning_rate": 0.0,
"loss": 0.3581,
"mean_token_accuracy": 0.8817789333478083,
"step": 4500
},
{
"epoch": 9.97891231964484,
"eval_loss": 0.9643924832344055,
"eval_mean_token_accuracy": 0.7574227854751178,
"eval_runtime": 2.5228,
"eval_samples_per_second": 51.135,
"eval_steps_per_second": 4.36,
"step": 4500
},
{
"epoch": 9.97891231964484,
"step": 4500,
"total_flos": 6.0804602683856e+18,
"train_loss": 0.5375538207954831,
"train_runtime": 15535.6447,
"train_samples_per_second": 13.918,
"train_steps_per_second": 0.29
}
],
"logging_steps": 5,
"max_steps": 4500,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.0804602683856e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}