health-analysis-biobert / trainer_state.json
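Note: the JSON below is the Hugging Face Trainer state saved for this run (best checkpoint, global step, and the per-step log_history). As a convenience, here is a minimal, hedged sketch of how one might read it back for inspection; it assumes the file has been downloaded locally as trainer_state.json, and only the field names taken from the file itself (best_metric, best_model_checkpoint, log_history, loss, eval_loss, step) are guaranteed, everything else is illustrative.

import json

# Minimal sketch: load the Trainer state and separate training-loss entries
# (which carry a "loss" key) from evaluation entries (which carry "eval_loss").
with open("trainer_state.json") as f:  # hypothetical local copy of this file
    state = json.load(f)

train_entries = [e for e in state["log_history"] if "loss" in e]
eval_entries = [e for e in state["log_history"] if "eval_loss" in e]

print("best eval loss:", state["best_metric"], "at", state["best_model_checkpoint"])
print("last logged training loss:", train_entries[-1]["loss"], "at step", train_entries[-1]["step"])
print("eval losses per epoch:", [e["eval_loss"] for e in eval_entries])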
{
"best_metric": 1.6915712356567383,
"best_model_checkpoint": "./health_analysis_results/checkpoint-8752",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 10940,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004570383912248629,
"grad_norm": 86932.7734375,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.6599,
"step": 10
},
{
"epoch": 0.009140767824497258,
"grad_norm": 41141.640625,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.6196,
"step": 20
},
{
"epoch": 0.013711151736745886,
"grad_norm": 64399.953125,
"learning_rate": 3e-06,
"loss": 2.6801,
"step": 30
},
{
"epoch": 0.018281535648994516,
"grad_norm": 47369.9140625,
"learning_rate": 4.000000000000001e-06,
"loss": 2.5896,
"step": 40
},
{
"epoch": 0.022851919561243144,
"grad_norm": 42862.3359375,
"learning_rate": 5e-06,
"loss": 2.6417,
"step": 50
},
{
"epoch": 0.027422303473491772,
"grad_norm": 43046.7734375,
"learning_rate": 6e-06,
"loss": 2.5624,
"step": 60
},
{
"epoch": 0.031992687385740404,
"grad_norm": 57193.46875,
"learning_rate": 7.000000000000001e-06,
"loss": 2.7588,
"step": 70
},
{
"epoch": 0.03656307129798903,
"grad_norm": 47019.44921875,
"learning_rate": 8.000000000000001e-06,
"loss": 2.7033,
"step": 80
},
{
"epoch": 0.04113345521023766,
"grad_norm": 96947.703125,
"learning_rate": 9e-06,
"loss": 2.9225,
"step": 90
},
{
"epoch": 0.04570383912248629,
"grad_norm": 53870.5234375,
"learning_rate": 1e-05,
"loss": 2.65,
"step": 100
},
{
"epoch": 0.050274223034734916,
"grad_norm": 57196.71875,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.726,
"step": 110
},
{
"epoch": 0.054844606946983544,
"grad_norm": 102628.6015625,
"learning_rate": 1.2e-05,
"loss": 2.5591,
"step": 120
},
{
"epoch": 0.05941499085923217,
"grad_norm": 59579.734375,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.7329,
"step": 130
},
{
"epoch": 0.06398537477148081,
"grad_norm": 63045.7890625,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.7527,
"step": 140
},
{
"epoch": 0.06855575868372943,
"grad_norm": 55244.3828125,
"learning_rate": 1.5e-05,
"loss": 2.8104,
"step": 150
},
{
"epoch": 0.07312614259597806,
"grad_norm": 101346.0078125,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.6889,
"step": 160
},
{
"epoch": 0.07769652650822668,
"grad_norm": 47467.60546875,
"learning_rate": 1.7000000000000003e-05,
"loss": 2.646,
"step": 170
},
{
"epoch": 0.08226691042047532,
"grad_norm": 92061.4765625,
"learning_rate": 1.8e-05,
"loss": 2.6756,
"step": 180
},
{
"epoch": 0.08683729433272395,
"grad_norm": 52667.0078125,
"learning_rate": 1.9e-05,
"loss": 2.5845,
"step": 190
},
{
"epoch": 0.09140767824497258,
"grad_norm": 74090.5859375,
"learning_rate": 2e-05,
"loss": 2.6605,
"step": 200
},
{
"epoch": 0.09597806215722121,
"grad_norm": 92731.8984375,
"learning_rate": 2.1e-05,
"loss": 2.7177,
"step": 210
},
{
"epoch": 0.10054844606946983,
"grad_norm": 132096.203125,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.6394,
"step": 220
},
{
"epoch": 0.10511882998171847,
"grad_norm": 64807.15625,
"learning_rate": 2.3000000000000003e-05,
"loss": 2.5417,
"step": 230
},
{
"epoch": 0.10968921389396709,
"grad_norm": 124301.5390625,
"learning_rate": 2.4e-05,
"loss": 2.5855,
"step": 240
},
{
"epoch": 0.11425959780621572,
"grad_norm": 106672.2109375,
"learning_rate": 2.5e-05,
"loss": 2.5796,
"step": 250
},
{
"epoch": 0.11882998171846434,
"grad_norm": 103185.78125,
"learning_rate": 2.6000000000000002e-05,
"loss": 2.558,
"step": 260
},
{
"epoch": 0.12340036563071298,
"grad_norm": 539943.625,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.533,
"step": 270
},
{
"epoch": 0.12797074954296161,
"grad_norm": 136090.453125,
"learning_rate": 2.8000000000000003e-05,
"loss": 2.3531,
"step": 280
},
{
"epoch": 0.13254113345521024,
"grad_norm": 190959.8125,
"learning_rate": 2.9e-05,
"loss": 2.4295,
"step": 290
},
{
"epoch": 0.13711151736745886,
"grad_norm": 321799.75,
"learning_rate": 3e-05,
"loss": 2.3202,
"step": 300
},
{
"epoch": 0.1416819012797075,
"grad_norm": 341939.46875,
"learning_rate": 3.1e-05,
"loss": 2.3956,
"step": 310
},
{
"epoch": 0.14625228519195613,
"grad_norm": 118726.578125,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.2453,
"step": 320
},
{
"epoch": 0.15082266910420475,
"grad_norm": 199988.390625,
"learning_rate": 3.3e-05,
"loss": 2.2402,
"step": 330
},
{
"epoch": 0.15539305301645337,
"grad_norm": 200156.25,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.1299,
"step": 340
},
{
"epoch": 0.15996343692870202,
"grad_norm": 235010.1875,
"learning_rate": 3.5e-05,
"loss": 2.0552,
"step": 350
},
{
"epoch": 0.16453382084095064,
"grad_norm": 168717.734375,
"learning_rate": 3.6e-05,
"loss": 2.0332,
"step": 360
},
{
"epoch": 0.16910420475319926,
"grad_norm": 164222.140625,
"learning_rate": 3.7e-05,
"loss": 2.1069,
"step": 370
},
{
"epoch": 0.1736745886654479,
"grad_norm": 216883.390625,
"learning_rate": 3.8e-05,
"loss": 2.1525,
"step": 380
},
{
"epoch": 0.17824497257769653,
"grad_norm": 238809.9375,
"learning_rate": 3.9000000000000006e-05,
"loss": 2.0176,
"step": 390
},
{
"epoch": 0.18281535648994515,
"grad_norm": 148073.28125,
"learning_rate": 4e-05,
"loss": 1.9141,
"step": 400
},
{
"epoch": 0.18738574040219377,
"grad_norm": 211357.671875,
"learning_rate": 4.1e-05,
"loss": 1.9354,
"step": 410
},
{
"epoch": 0.19195612431444242,
"grad_norm": 289800.125,
"learning_rate": 4.2e-05,
"loss": 2.0263,
"step": 420
},
{
"epoch": 0.19652650822669104,
"grad_norm": 354819.71875,
"learning_rate": 4.3e-05,
"loss": 1.9236,
"step": 430
},
{
"epoch": 0.20109689213893966,
"grad_norm": 187694.3125,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.8974,
"step": 440
},
{
"epoch": 0.2056672760511883,
"grad_norm": 235554.203125,
"learning_rate": 4.5e-05,
"loss": 1.8884,
"step": 450
},
{
"epoch": 0.21023765996343693,
"grad_norm": 194993.0625,
"learning_rate": 4.600000000000001e-05,
"loss": 1.8713,
"step": 460
},
{
"epoch": 0.21480804387568556,
"grad_norm": 190949.453125,
"learning_rate": 4.7e-05,
"loss": 1.9149,
"step": 470
},
{
"epoch": 0.21937842778793418,
"grad_norm": 209000.484375,
"learning_rate": 4.8e-05,
"loss": 1.9436,
"step": 480
},
{
"epoch": 0.22394881170018283,
"grad_norm": 151603.03125,
"learning_rate": 4.9e-05,
"loss": 1.868,
"step": 490
},
{
"epoch": 0.22851919561243145,
"grad_norm": 297051.9375,
"learning_rate": 5e-05,
"loss": 1.8631,
"step": 500
},
{
"epoch": 0.23308957952468007,
"grad_norm": 193912.53125,
"learning_rate": 4.995210727969349e-05,
"loss": 1.866,
"step": 510
},
{
"epoch": 0.2376599634369287,
"grad_norm": 185368.859375,
"learning_rate": 4.9904214559386976e-05,
"loss": 1.8017,
"step": 520
},
{
"epoch": 0.24223034734917734,
"grad_norm": 482963.65625,
"learning_rate": 4.985632183908046e-05,
"loss": 1.958,
"step": 530
},
{
"epoch": 0.24680073126142596,
"grad_norm": 216741.5625,
"learning_rate": 4.980842911877395e-05,
"loss": 1.8688,
"step": 540
},
{
"epoch": 0.2513711151736746,
"grad_norm": 248310.640625,
"learning_rate": 4.976053639846743e-05,
"loss": 1.8206,
"step": 550
},
{
"epoch": 0.25594149908592323,
"grad_norm": 174778.59375,
"learning_rate": 4.971264367816092e-05,
"loss": 1.7981,
"step": 560
},
{
"epoch": 0.26051188299817185,
"grad_norm": 168291.53125,
"learning_rate": 4.966475095785441e-05,
"loss": 1.8444,
"step": 570
},
{
"epoch": 0.26508226691042047,
"grad_norm": 119908.9609375,
"learning_rate": 4.96168582375479e-05,
"loss": 1.8102,
"step": 580
},
{
"epoch": 0.2696526508226691,
"grad_norm": 130359.1640625,
"learning_rate": 4.9568965517241384e-05,
"loss": 1.819,
"step": 590
},
{
"epoch": 0.2742230347349177,
"grad_norm": 122891.2734375,
"learning_rate": 4.952107279693487e-05,
"loss": 1.8189,
"step": 600
},
{
"epoch": 0.27879341864716634,
"grad_norm": 133346.09375,
"learning_rate": 4.947318007662836e-05,
"loss": 1.8422,
"step": 610
},
{
"epoch": 0.283363802559415,
"grad_norm": 179827.6875,
"learning_rate": 4.9425287356321845e-05,
"loss": 1.8674,
"step": 620
},
{
"epoch": 0.28793418647166363,
"grad_norm": 158770.609375,
"learning_rate": 4.9377394636015325e-05,
"loss": 1.8341,
"step": 630
},
{
"epoch": 0.29250457038391225,
"grad_norm": 199898.640625,
"learning_rate": 4.932950191570881e-05,
"loss": 1.8177,
"step": 640
},
{
"epoch": 0.2970749542961609,
"grad_norm": 136966.828125,
"learning_rate": 4.92816091954023e-05,
"loss": 1.6618,
"step": 650
},
{
"epoch": 0.3016453382084095,
"grad_norm": 131470.1875,
"learning_rate": 4.9233716475095786e-05,
"loss": 1.9034,
"step": 660
},
{
"epoch": 0.3062157221206581,
"grad_norm": 137772.28125,
"learning_rate": 4.918582375478927e-05,
"loss": 1.7766,
"step": 670
},
{
"epoch": 0.31078610603290674,
"grad_norm": 225044.28125,
"learning_rate": 4.913793103448276e-05,
"loss": 1.8404,
"step": 680
},
{
"epoch": 0.3153564899451554,
"grad_norm": 117430.6015625,
"learning_rate": 4.9090038314176246e-05,
"loss": 1.7514,
"step": 690
},
{
"epoch": 0.31992687385740404,
"grad_norm": 143649.03125,
"learning_rate": 4.904214559386973e-05,
"loss": 1.7891,
"step": 700
},
{
"epoch": 0.32449725776965266,
"grad_norm": 132381.140625,
"learning_rate": 4.899425287356322e-05,
"loss": 1.728,
"step": 710
},
{
"epoch": 0.3290676416819013,
"grad_norm": 180736.796875,
"learning_rate": 4.894636015325671e-05,
"loss": 1.7465,
"step": 720
},
{
"epoch": 0.3336380255941499,
"grad_norm": 102364.2734375,
"learning_rate": 4.8898467432950194e-05,
"loss": 1.7522,
"step": 730
},
{
"epoch": 0.3382084095063985,
"grad_norm": 124118.0234375,
"learning_rate": 4.885057471264368e-05,
"loss": 1.7294,
"step": 740
},
{
"epoch": 0.34277879341864714,
"grad_norm": 102199.5234375,
"learning_rate": 4.880268199233717e-05,
"loss": 1.866,
"step": 750
},
{
"epoch": 0.3473491773308958,
"grad_norm": 128679.890625,
"learning_rate": 4.8754789272030654e-05,
"loss": 1.8846,
"step": 760
},
{
"epoch": 0.35191956124314444,
"grad_norm": 100680.3671875,
"learning_rate": 4.870689655172414e-05,
"loss": 1.7317,
"step": 770
},
{
"epoch": 0.35648994515539306,
"grad_norm": 125967.859375,
"learning_rate": 4.865900383141763e-05,
"loss": 1.8308,
"step": 780
},
{
"epoch": 0.3610603290676417,
"grad_norm": 124871.5859375,
"learning_rate": 4.8611111111111115e-05,
"loss": 1.7843,
"step": 790
},
{
"epoch": 0.3656307129798903,
"grad_norm": 79497.8515625,
"learning_rate": 4.85632183908046e-05,
"loss": 1.7379,
"step": 800
},
{
"epoch": 0.3702010968921389,
"grad_norm": 130445.0546875,
"learning_rate": 4.851532567049808e-05,
"loss": 1.8268,
"step": 810
},
{
"epoch": 0.37477148080438755,
"grad_norm": 104012.1484375,
"learning_rate": 4.846743295019157e-05,
"loss": 1.7416,
"step": 820
},
{
"epoch": 0.3793418647166362,
"grad_norm": 146501.59375,
"learning_rate": 4.8419540229885056e-05,
"loss": 1.7859,
"step": 830
},
{
"epoch": 0.38391224862888484,
"grad_norm": 142532.4375,
"learning_rate": 4.837164750957854e-05,
"loss": 1.7631,
"step": 840
},
{
"epoch": 0.38848263254113347,
"grad_norm": 130007.6640625,
"learning_rate": 4.8323754789272036e-05,
"loss": 1.7579,
"step": 850
},
{
"epoch": 0.3930530164533821,
"grad_norm": 130011.5078125,
"learning_rate": 4.827586206896552e-05,
"loss": 1.7896,
"step": 860
},
{
"epoch": 0.3976234003656307,
"grad_norm": 111523.03125,
"learning_rate": 4.822796934865901e-05,
"loss": 1.7826,
"step": 870
},
{
"epoch": 0.40219378427787933,
"grad_norm": 155793.453125,
"learning_rate": 4.81800766283525e-05,
"loss": 1.8351,
"step": 880
},
{
"epoch": 0.40676416819012795,
"grad_norm": 129142.6875,
"learning_rate": 4.813218390804598e-05,
"loss": 1.8079,
"step": 890
},
{
"epoch": 0.4113345521023766,
"grad_norm": 132226.46875,
"learning_rate": 4.8084291187739464e-05,
"loss": 1.7534,
"step": 900
},
{
"epoch": 0.41590493601462525,
"grad_norm": 126845.109375,
"learning_rate": 4.803639846743295e-05,
"loss": 1.9015,
"step": 910
},
{
"epoch": 0.42047531992687387,
"grad_norm": 136638.390625,
"learning_rate": 4.798850574712644e-05,
"loss": 1.7273,
"step": 920
},
{
"epoch": 0.4250457038391225,
"grad_norm": 142308.25,
"learning_rate": 4.7940613026819925e-05,
"loss": 1.7398,
"step": 930
},
{
"epoch": 0.4296160877513711,
"grad_norm": 115603.34375,
"learning_rate": 4.789272030651341e-05,
"loss": 1.8353,
"step": 940
},
{
"epoch": 0.43418647166361973,
"grad_norm": 131319.734375,
"learning_rate": 4.78448275862069e-05,
"loss": 1.6825,
"step": 950
},
{
"epoch": 0.43875685557586835,
"grad_norm": 112166.5703125,
"learning_rate": 4.7796934865900385e-05,
"loss": 1.7169,
"step": 960
},
{
"epoch": 0.443327239488117,
"grad_norm": 131488.796875,
"learning_rate": 4.774904214559387e-05,
"loss": 1.7215,
"step": 970
},
{
"epoch": 0.44789762340036565,
"grad_norm": 149545.140625,
"learning_rate": 4.770114942528736e-05,
"loss": 1.6931,
"step": 980
},
{
"epoch": 0.4524680073126143,
"grad_norm": 163557.375,
"learning_rate": 4.7653256704980846e-05,
"loss": 1.7066,
"step": 990
},
{
"epoch": 0.4570383912248629,
"grad_norm": 145251.359375,
"learning_rate": 4.760536398467433e-05,
"loss": 1.797,
"step": 1000
},
{
"epoch": 0.4616087751371115,
"grad_norm": 100632.7734375,
"learning_rate": 4.755747126436782e-05,
"loss": 1.6975,
"step": 1010
},
{
"epoch": 0.46617915904936014,
"grad_norm": 153170.90625,
"learning_rate": 4.7509578544061307e-05,
"loss": 1.6741,
"step": 1020
},
{
"epoch": 0.47074954296160876,
"grad_norm": 78909.109375,
"learning_rate": 4.7461685823754793e-05,
"loss": 1.6999,
"step": 1030
},
{
"epoch": 0.4753199268738574,
"grad_norm": 110082.7421875,
"learning_rate": 4.741379310344828e-05,
"loss": 1.749,
"step": 1040
},
{
"epoch": 0.47989031078610606,
"grad_norm": 100923.5390625,
"learning_rate": 4.736590038314177e-05,
"loss": 1.7683,
"step": 1050
},
{
"epoch": 0.4844606946983547,
"grad_norm": 98683.3203125,
"learning_rate": 4.7318007662835254e-05,
"loss": 1.7535,
"step": 1060
},
{
"epoch": 0.4890310786106033,
"grad_norm": 116789.328125,
"learning_rate": 4.7270114942528734e-05,
"loss": 1.8248,
"step": 1070
},
{
"epoch": 0.4936014625228519,
"grad_norm": 114926.984375,
"learning_rate": 4.722222222222222e-05,
"loss": 1.7018,
"step": 1080
},
{
"epoch": 0.49817184643510054,
"grad_norm": 91417.0859375,
"learning_rate": 4.717432950191571e-05,
"loss": 1.8116,
"step": 1090
},
{
"epoch": 0.5027422303473492,
"grad_norm": 128865.3125,
"learning_rate": 4.7126436781609195e-05,
"loss": 1.6932,
"step": 1100
},
{
"epoch": 0.5073126142595978,
"grad_norm": 122375.984375,
"learning_rate": 4.707854406130268e-05,
"loss": 1.7383,
"step": 1110
},
{
"epoch": 0.5118829981718465,
"grad_norm": 96160.1328125,
"learning_rate": 4.7030651340996175e-05,
"loss": 1.6952,
"step": 1120
},
{
"epoch": 0.5164533820840951,
"grad_norm": 124809.203125,
"learning_rate": 4.698275862068966e-05,
"loss": 1.7973,
"step": 1130
},
{
"epoch": 0.5210237659963437,
"grad_norm": 112799.5625,
"learning_rate": 4.693486590038315e-05,
"loss": 1.7527,
"step": 1140
},
{
"epoch": 0.5255941499085923,
"grad_norm": 82923.640625,
"learning_rate": 4.688697318007663e-05,
"loss": 1.7041,
"step": 1150
},
{
"epoch": 0.5301645338208409,
"grad_norm": 130412.3671875,
"learning_rate": 4.6839080459770116e-05,
"loss": 1.7021,
"step": 1160
},
{
"epoch": 0.5347349177330896,
"grad_norm": 104821.015625,
"learning_rate": 4.67911877394636e-05,
"loss": 1.7333,
"step": 1170
},
{
"epoch": 0.5393053016453382,
"grad_norm": 100490.953125,
"learning_rate": 4.674329501915709e-05,
"loss": 1.6688,
"step": 1180
},
{
"epoch": 0.5438756855575868,
"grad_norm": 86425.4453125,
"learning_rate": 4.669540229885058e-05,
"loss": 1.7381,
"step": 1190
},
{
"epoch": 0.5484460694698354,
"grad_norm": 83740.2734375,
"learning_rate": 4.6647509578544064e-05,
"loss": 1.8439,
"step": 1200
},
{
"epoch": 0.553016453382084,
"grad_norm": 140177.421875,
"learning_rate": 4.659961685823755e-05,
"loss": 1.7227,
"step": 1210
},
{
"epoch": 0.5575868372943327,
"grad_norm": 144323.71875,
"learning_rate": 4.655172413793104e-05,
"loss": 1.7395,
"step": 1220
},
{
"epoch": 0.5621572212065814,
"grad_norm": 97354.59375,
"learning_rate": 4.6503831417624524e-05,
"loss": 1.7719,
"step": 1230
},
{
"epoch": 0.56672760511883,
"grad_norm": 72904.7578125,
"learning_rate": 4.6455938697318004e-05,
"loss": 1.6712,
"step": 1240
},
{
"epoch": 0.5712979890310786,
"grad_norm": 162248.328125,
"learning_rate": 4.640804597701149e-05,
"loss": 1.7491,
"step": 1250
},
{
"epoch": 0.5758683729433273,
"grad_norm": 105222.6328125,
"learning_rate": 4.6360153256704985e-05,
"loss": 1.7503,
"step": 1260
},
{
"epoch": 0.5804387568555759,
"grad_norm": 113333.96875,
"learning_rate": 4.631226053639847e-05,
"loss": 1.751,
"step": 1270
},
{
"epoch": 0.5850091407678245,
"grad_norm": 126183.7734375,
"learning_rate": 4.626436781609196e-05,
"loss": 1.8747,
"step": 1280
},
{
"epoch": 0.5895795246800731,
"grad_norm": 118274.03125,
"learning_rate": 4.6216475095785446e-05,
"loss": 1.6831,
"step": 1290
},
{
"epoch": 0.5941499085923218,
"grad_norm": 108177.03125,
"learning_rate": 4.616858237547893e-05,
"loss": 1.6359,
"step": 1300
},
{
"epoch": 0.5987202925045704,
"grad_norm": 81988.9140625,
"learning_rate": 4.612068965517242e-05,
"loss": 1.7058,
"step": 1310
},
{
"epoch": 0.603290676416819,
"grad_norm": 79780.96875,
"learning_rate": 4.60727969348659e-05,
"loss": 1.7388,
"step": 1320
},
{
"epoch": 0.6078610603290676,
"grad_norm": 166808.515625,
"learning_rate": 4.6024904214559386e-05,
"loss": 1.7623,
"step": 1330
},
{
"epoch": 0.6124314442413162,
"grad_norm": 111601.921875,
"learning_rate": 4.597701149425287e-05,
"loss": 1.761,
"step": 1340
},
{
"epoch": 0.6170018281535649,
"grad_norm": 106101.40625,
"learning_rate": 4.592911877394636e-05,
"loss": 1.685,
"step": 1350
},
{
"epoch": 0.6215722120658135,
"grad_norm": 125174.578125,
"learning_rate": 4.588122605363985e-05,
"loss": 1.7784,
"step": 1360
},
{
"epoch": 0.6261425959780622,
"grad_norm": 107639.9375,
"learning_rate": 4.5833333333333334e-05,
"loss": 1.7636,
"step": 1370
},
{
"epoch": 0.6307129798903108,
"grad_norm": 144034.265625,
"learning_rate": 4.578544061302682e-05,
"loss": 1.7237,
"step": 1380
},
{
"epoch": 0.6352833638025595,
"grad_norm": 135349.234375,
"learning_rate": 4.573754789272031e-05,
"loss": 1.6869,
"step": 1390
},
{
"epoch": 0.6398537477148081,
"grad_norm": 92048.3359375,
"learning_rate": 4.5689655172413794e-05,
"loss": 1.6916,
"step": 1400
},
{
"epoch": 0.6444241316270567,
"grad_norm": 105181.109375,
"learning_rate": 4.564176245210728e-05,
"loss": 1.7276,
"step": 1410
},
{
"epoch": 0.6489945155393053,
"grad_norm": 111967.078125,
"learning_rate": 4.559386973180077e-05,
"loss": 1.7594,
"step": 1420
},
{
"epoch": 0.6535648994515539,
"grad_norm": 95790.1171875,
"learning_rate": 4.5545977011494255e-05,
"loss": 1.6922,
"step": 1430
},
{
"epoch": 0.6581352833638026,
"grad_norm": 133646.671875,
"learning_rate": 4.549808429118774e-05,
"loss": 1.7427,
"step": 1440
},
{
"epoch": 0.6627056672760512,
"grad_norm": 85705.421875,
"learning_rate": 4.545019157088123e-05,
"loss": 1.6791,
"step": 1450
},
{
"epoch": 0.6672760511882998,
"grad_norm": 131938.828125,
"learning_rate": 4.5402298850574716e-05,
"loss": 1.7781,
"step": 1460
},
{
"epoch": 0.6718464351005484,
"grad_norm": 138080.515625,
"learning_rate": 4.53544061302682e-05,
"loss": 1.7237,
"step": 1470
},
{
"epoch": 0.676416819012797,
"grad_norm": 81319.125,
"learning_rate": 4.530651340996169e-05,
"loss": 1.7527,
"step": 1480
},
{
"epoch": 0.6809872029250457,
"grad_norm": 114187.8203125,
"learning_rate": 4.5258620689655176e-05,
"loss": 1.7414,
"step": 1490
},
{
"epoch": 0.6855575868372943,
"grad_norm": 88061.1640625,
"learning_rate": 4.5210727969348656e-05,
"loss": 1.7703,
"step": 1500
},
{
"epoch": 0.6901279707495429,
"grad_norm": 70474.1015625,
"learning_rate": 4.516283524904214e-05,
"loss": 1.6569,
"step": 1510
},
{
"epoch": 0.6946983546617916,
"grad_norm": 93419.6171875,
"learning_rate": 4.511494252873563e-05,
"loss": 1.7559,
"step": 1520
},
{
"epoch": 0.6992687385740403,
"grad_norm": 109502.859375,
"learning_rate": 4.506704980842912e-05,
"loss": 1.6971,
"step": 1530
},
{
"epoch": 0.7038391224862889,
"grad_norm": 172617.59375,
"learning_rate": 4.501915708812261e-05,
"loss": 1.8061,
"step": 1540
},
{
"epoch": 0.7084095063985375,
"grad_norm": 103427.5078125,
"learning_rate": 4.49712643678161e-05,
"loss": 1.7566,
"step": 1550
},
{
"epoch": 0.7129798903107861,
"grad_norm": 77810.8125,
"learning_rate": 4.4923371647509585e-05,
"loss": 1.6869,
"step": 1560
},
{
"epoch": 0.7175502742230347,
"grad_norm": 112475.9453125,
"learning_rate": 4.487547892720307e-05,
"loss": 1.6338,
"step": 1570
},
{
"epoch": 0.7221206581352834,
"grad_norm": 99816.34375,
"learning_rate": 4.482758620689655e-05,
"loss": 1.7478,
"step": 1580
},
{
"epoch": 0.726691042047532,
"grad_norm": 73693.90625,
"learning_rate": 4.477969348659004e-05,
"loss": 1.7733,
"step": 1590
},
{
"epoch": 0.7312614259597806,
"grad_norm": 125517.2734375,
"learning_rate": 4.4731800766283525e-05,
"loss": 1.7643,
"step": 1600
},
{
"epoch": 0.7358318098720292,
"grad_norm": 125624.765625,
"learning_rate": 4.468390804597701e-05,
"loss": 1.789,
"step": 1610
},
{
"epoch": 0.7404021937842779,
"grad_norm": 96340.703125,
"learning_rate": 4.46360153256705e-05,
"loss": 1.8048,
"step": 1620
},
{
"epoch": 0.7449725776965265,
"grad_norm": 77806.5,
"learning_rate": 4.4588122605363986e-05,
"loss": 1.6585,
"step": 1630
},
{
"epoch": 0.7495429616087751,
"grad_norm": 115383.6171875,
"learning_rate": 4.454022988505747e-05,
"loss": 1.7022,
"step": 1640
},
{
"epoch": 0.7541133455210237,
"grad_norm": 128746.96875,
"learning_rate": 4.449233716475096e-05,
"loss": 1.7007,
"step": 1650
},
{
"epoch": 0.7586837294332724,
"grad_norm": 92168.8515625,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.78,
"step": 1660
},
{
"epoch": 0.7632541133455211,
"grad_norm": 79167.046875,
"learning_rate": 4.4396551724137933e-05,
"loss": 1.8273,
"step": 1670
},
{
"epoch": 0.7678244972577697,
"grad_norm": 113320.921875,
"learning_rate": 4.434865900383142e-05,
"loss": 1.6285,
"step": 1680
},
{
"epoch": 0.7723948811700183,
"grad_norm": 80329.4140625,
"learning_rate": 4.430076628352491e-05,
"loss": 1.6705,
"step": 1690
},
{
"epoch": 0.7769652650822669,
"grad_norm": 112864.1875,
"learning_rate": 4.4252873563218394e-05,
"loss": 1.6774,
"step": 1700
},
{
"epoch": 0.7815356489945156,
"grad_norm": 83703.5703125,
"learning_rate": 4.420498084291188e-05,
"loss": 1.7612,
"step": 1710
},
{
"epoch": 0.7861060329067642,
"grad_norm": 89239.75,
"learning_rate": 4.415708812260537e-05,
"loss": 1.6861,
"step": 1720
},
{
"epoch": 0.7906764168190128,
"grad_norm": 89113.1484375,
"learning_rate": 4.4109195402298855e-05,
"loss": 1.751,
"step": 1730
},
{
"epoch": 0.7952468007312614,
"grad_norm": 74121.84375,
"learning_rate": 4.406130268199234e-05,
"loss": 1.7383,
"step": 1740
},
{
"epoch": 0.79981718464351,
"grad_norm": 98291.3125,
"learning_rate": 4.401340996168583e-05,
"loss": 1.6507,
"step": 1750
},
{
"epoch": 0.8043875685557587,
"grad_norm": 101062.6484375,
"learning_rate": 4.396551724137931e-05,
"loss": 1.6809,
"step": 1760
},
{
"epoch": 0.8089579524680073,
"grad_norm": 82451.6015625,
"learning_rate": 4.3917624521072795e-05,
"loss": 1.6369,
"step": 1770
},
{
"epoch": 0.8135283363802559,
"grad_norm": 93937.7109375,
"learning_rate": 4.386973180076628e-05,
"loss": 1.8125,
"step": 1780
},
{
"epoch": 0.8180987202925045,
"grad_norm": 86336.0546875,
"learning_rate": 4.382183908045977e-05,
"loss": 1.7483,
"step": 1790
},
{
"epoch": 0.8226691042047533,
"grad_norm": 94180.6953125,
"learning_rate": 4.3773946360153256e-05,
"loss": 1.7549,
"step": 1800
},
{
"epoch": 0.8272394881170019,
"grad_norm": 89638.5703125,
"learning_rate": 4.372605363984675e-05,
"loss": 1.6895,
"step": 1810
},
{
"epoch": 0.8318098720292505,
"grad_norm": 129005.0,
"learning_rate": 4.367816091954024e-05,
"loss": 1.7419,
"step": 1820
},
{
"epoch": 0.8363802559414991,
"grad_norm": 115378.0703125,
"learning_rate": 4.3630268199233724e-05,
"loss": 1.6716,
"step": 1830
},
{
"epoch": 0.8409506398537477,
"grad_norm": 84872.3046875,
"learning_rate": 4.3582375478927204e-05,
"loss": 1.6927,
"step": 1840
},
{
"epoch": 0.8455210237659964,
"grad_norm": 82049.890625,
"learning_rate": 4.353448275862069e-05,
"loss": 1.7318,
"step": 1850
},
{
"epoch": 0.850091407678245,
"grad_norm": 81004.65625,
"learning_rate": 4.348659003831418e-05,
"loss": 1.6633,
"step": 1860
},
{
"epoch": 0.8546617915904936,
"grad_norm": 186330.078125,
"learning_rate": 4.3438697318007664e-05,
"loss": 1.8362,
"step": 1870
},
{
"epoch": 0.8592321755027422,
"grad_norm": 95703.015625,
"learning_rate": 4.339080459770115e-05,
"loss": 1.8003,
"step": 1880
},
{
"epoch": 0.8638025594149908,
"grad_norm": 75972.78125,
"learning_rate": 4.334291187739464e-05,
"loss": 1.7282,
"step": 1890
},
{
"epoch": 0.8683729433272395,
"grad_norm": 105656.9765625,
"learning_rate": 4.3295019157088125e-05,
"loss": 1.7041,
"step": 1900
},
{
"epoch": 0.8729433272394881,
"grad_norm": 98981.046875,
"learning_rate": 4.324712643678161e-05,
"loss": 1.6535,
"step": 1910
},
{
"epoch": 0.8775137111517367,
"grad_norm": 131719.453125,
"learning_rate": 4.31992337164751e-05,
"loss": 1.7521,
"step": 1920
},
{
"epoch": 0.8820840950639853,
"grad_norm": 62522.44140625,
"learning_rate": 4.3151340996168586e-05,
"loss": 1.6794,
"step": 1930
},
{
"epoch": 0.886654478976234,
"grad_norm": 104086.2265625,
"learning_rate": 4.3103448275862066e-05,
"loss": 1.7029,
"step": 1940
},
{
"epoch": 0.8912248628884827,
"grad_norm": 120783.3359375,
"learning_rate": 4.305555555555556e-05,
"loss": 1.6313,
"step": 1950
},
{
"epoch": 0.8957952468007313,
"grad_norm": 99394.8203125,
"learning_rate": 4.3007662835249046e-05,
"loss": 1.7735,
"step": 1960
},
{
"epoch": 0.9003656307129799,
"grad_norm": 152363.28125,
"learning_rate": 4.295977011494253e-05,
"loss": 1.6732,
"step": 1970
},
{
"epoch": 0.9049360146252285,
"grad_norm": 156369.4375,
"learning_rate": 4.291187739463602e-05,
"loss": 1.765,
"step": 1980
},
{
"epoch": 0.9095063985374772,
"grad_norm": 63156.875,
"learning_rate": 4.286398467432951e-05,
"loss": 1.6294,
"step": 1990
},
{
"epoch": 0.9140767824497258,
"grad_norm": 106310.6015625,
"learning_rate": 4.2816091954022994e-05,
"loss": 1.7492,
"step": 2000
},
{
"epoch": 0.9186471663619744,
"grad_norm": 95234.3515625,
"learning_rate": 4.2768199233716474e-05,
"loss": 1.6726,
"step": 2010
},
{
"epoch": 0.923217550274223,
"grad_norm": 105046.6015625,
"learning_rate": 4.272030651340996e-05,
"loss": 1.6848,
"step": 2020
},
{
"epoch": 0.9277879341864717,
"grad_norm": 91991.890625,
"learning_rate": 4.267241379310345e-05,
"loss": 1.7447,
"step": 2030
},
{
"epoch": 0.9323583180987203,
"grad_norm": 95458.09375,
"learning_rate": 4.2624521072796934e-05,
"loss": 1.7068,
"step": 2040
},
{
"epoch": 0.9369287020109689,
"grad_norm": 108366.8828125,
"learning_rate": 4.257662835249042e-05,
"loss": 1.6272,
"step": 2050
},
{
"epoch": 0.9414990859232175,
"grad_norm": 79585.796875,
"learning_rate": 4.252873563218391e-05,
"loss": 1.7444,
"step": 2060
},
{
"epoch": 0.9460694698354661,
"grad_norm": 108535.515625,
"learning_rate": 4.2480842911877395e-05,
"loss": 1.6754,
"step": 2070
},
{
"epoch": 0.9506398537477148,
"grad_norm": 101708.1640625,
"learning_rate": 4.243295019157089e-05,
"loss": 1.8225,
"step": 2080
},
{
"epoch": 0.9552102376599635,
"grad_norm": 149960.90625,
"learning_rate": 4.238505747126437e-05,
"loss": 1.7228,
"step": 2090
},
{
"epoch": 0.9597806215722121,
"grad_norm": 116507.1328125,
"learning_rate": 4.2337164750957856e-05,
"loss": 1.7649,
"step": 2100
},
{
"epoch": 0.9643510054844607,
"grad_norm": 95376.0859375,
"learning_rate": 4.228927203065134e-05,
"loss": 1.7423,
"step": 2110
},
{
"epoch": 0.9689213893967094,
"grad_norm": 77924.8828125,
"learning_rate": 4.224137931034483e-05,
"loss": 1.7371,
"step": 2120
},
{
"epoch": 0.973491773308958,
"grad_norm": 108620.1484375,
"learning_rate": 4.2193486590038316e-05,
"loss": 1.6786,
"step": 2130
},
{
"epoch": 0.9780621572212066,
"grad_norm": 96284.8984375,
"learning_rate": 4.21455938697318e-05,
"loss": 1.641,
"step": 2140
},
{
"epoch": 0.9826325411334552,
"grad_norm": 78326.7890625,
"learning_rate": 4.209770114942529e-05,
"loss": 1.6984,
"step": 2150
},
{
"epoch": 0.9872029250457038,
"grad_norm": 86470.40625,
"learning_rate": 4.204980842911878e-05,
"loss": 1.7534,
"step": 2160
},
{
"epoch": 0.9917733089579525,
"grad_norm": 104938.75,
"learning_rate": 4.2001915708812264e-05,
"loss": 1.7039,
"step": 2170
},
{
"epoch": 0.9963436928702011,
"grad_norm": 100711.6875,
"learning_rate": 4.195402298850575e-05,
"loss": 1.7534,
"step": 2180
},
{
"epoch": 1.0,
"eval_loss": 1.7005630731582642,
"eval_runtime": 345.6818,
"eval_samples_per_second": 43.393,
"eval_steps_per_second": 1.357,
"step": 2188
},
{
"epoch": 1.0009140767824498,
"grad_norm": 91781.6796875,
"learning_rate": 4.190613026819923e-05,
"loss": 1.6595,
"step": 2190
},
{
"epoch": 1.0054844606946984,
"grad_norm": 90080.9375,
"learning_rate": 4.185823754789272e-05,
"loss": 1.6538,
"step": 2200
},
{
"epoch": 1.010054844606947,
"grad_norm": 88122.7890625,
"learning_rate": 4.1810344827586205e-05,
"loss": 1.6596,
"step": 2210
},
{
"epoch": 1.0146252285191957,
"grad_norm": 111770.046875,
"learning_rate": 4.17624521072797e-05,
"loss": 1.6971,
"step": 2220
},
{
"epoch": 1.0191956124314443,
"grad_norm": 104428.828125,
"learning_rate": 4.1714559386973185e-05,
"loss": 1.7929,
"step": 2230
},
{
"epoch": 1.023765996343693,
"grad_norm": 90344.828125,
"learning_rate": 4.166666666666667e-05,
"loss": 1.8338,
"step": 2240
},
{
"epoch": 1.0283363802559415,
"grad_norm": 87549.5546875,
"learning_rate": 4.161877394636016e-05,
"loss": 1.6911,
"step": 2250
},
{
"epoch": 1.0329067641681902,
"grad_norm": 82813.453125,
"learning_rate": 4.1570881226053646e-05,
"loss": 1.7573,
"step": 2260
},
{
"epoch": 1.0374771480804388,
"grad_norm": 87447.1328125,
"learning_rate": 4.1522988505747126e-05,
"loss": 1.7275,
"step": 2270
},
{
"epoch": 1.0420475319926874,
"grad_norm": 63916.98046875,
"learning_rate": 4.147509578544061e-05,
"loss": 1.7326,
"step": 2280
},
{
"epoch": 1.046617915904936,
"grad_norm": 89433.3828125,
"learning_rate": 4.14272030651341e-05,
"loss": 1.7586,
"step": 2290
},
{
"epoch": 1.0511882998171846,
"grad_norm": 171660.09375,
"learning_rate": 4.1379310344827587e-05,
"loss": 1.789,
"step": 2300
},
{
"epoch": 1.0557586837294333,
"grad_norm": 69103.7734375,
"learning_rate": 4.1331417624521073e-05,
"loss": 1.7837,
"step": 2310
},
{
"epoch": 1.0603290676416819,
"grad_norm": 81962.609375,
"learning_rate": 4.128352490421456e-05,
"loss": 1.7424,
"step": 2320
},
{
"epoch": 1.0648994515539305,
"grad_norm": 83097.5703125,
"learning_rate": 4.123563218390805e-05,
"loss": 1.7879,
"step": 2330
},
{
"epoch": 1.0694698354661791,
"grad_norm": 112565.484375,
"learning_rate": 4.1187739463601534e-05,
"loss": 1.7266,
"step": 2340
},
{
"epoch": 1.0740402193784278,
"grad_norm": 120119.7265625,
"learning_rate": 4.113984674329502e-05,
"loss": 1.8418,
"step": 2350
},
{
"epoch": 1.0786106032906764,
"grad_norm": 105760.8359375,
"learning_rate": 4.109195402298851e-05,
"loss": 1.7203,
"step": 2360
},
{
"epoch": 1.083180987202925,
"grad_norm": 88435.3125,
"learning_rate": 4.1044061302681995e-05,
"loss": 1.6665,
"step": 2370
},
{
"epoch": 1.0877513711151736,
"grad_norm": 74858.359375,
"learning_rate": 4.099616858237548e-05,
"loss": 1.6907,
"step": 2380
},
{
"epoch": 1.0923217550274222,
"grad_norm": 80752.5546875,
"learning_rate": 4.094827586206897e-05,
"loss": 1.7489,
"step": 2390
},
{
"epoch": 1.0968921389396709,
"grad_norm": 85903.7578125,
"learning_rate": 4.0900383141762455e-05,
"loss": 1.6526,
"step": 2400
},
{
"epoch": 1.1014625228519195,
"grad_norm": 87469.15625,
"learning_rate": 4.085249042145594e-05,
"loss": 1.7612,
"step": 2410
},
{
"epoch": 1.106032906764168,
"grad_norm": 115305.1484375,
"learning_rate": 4.080459770114943e-05,
"loss": 1.7148,
"step": 2420
},
{
"epoch": 1.1106032906764167,
"grad_norm": 76171.40625,
"learning_rate": 4.0756704980842916e-05,
"loss": 1.7383,
"step": 2430
},
{
"epoch": 1.1151736745886653,
"grad_norm": 110853.4140625,
"learning_rate": 4.07088122605364e-05,
"loss": 1.7353,
"step": 2440
},
{
"epoch": 1.1197440585009142,
"grad_norm": 83511.421875,
"learning_rate": 4.066091954022988e-05,
"loss": 1.7261,
"step": 2450
},
{
"epoch": 1.1243144424131628,
"grad_norm": 69653.203125,
"learning_rate": 4.061302681992337e-05,
"loss": 1.7403,
"step": 2460
},
{
"epoch": 1.1288848263254114,
"grad_norm": 141315.3125,
"learning_rate": 4.056513409961686e-05,
"loss": 1.7125,
"step": 2470
},
{
"epoch": 1.13345521023766,
"grad_norm": 139185.4375,
"learning_rate": 4.0517241379310344e-05,
"loss": 1.66,
"step": 2480
},
{
"epoch": 1.1380255941499087,
"grad_norm": 107879.578125,
"learning_rate": 4.046934865900383e-05,
"loss": 1.8147,
"step": 2490
},
{
"epoch": 1.1425959780621573,
"grad_norm": 93484.1953125,
"learning_rate": 4.0421455938697324e-05,
"loss": 1.6882,
"step": 2500
},
{
"epoch": 1.147166361974406,
"grad_norm": 96417.0390625,
"learning_rate": 4.037356321839081e-05,
"loss": 1.7444,
"step": 2510
},
{
"epoch": 1.1517367458866545,
"grad_norm": 105896.125,
"learning_rate": 4.03256704980843e-05,
"loss": 1.7218,
"step": 2520
},
{
"epoch": 1.1563071297989032,
"grad_norm": 70536.5078125,
"learning_rate": 4.027777777777778e-05,
"loss": 1.643,
"step": 2530
},
{
"epoch": 1.1608775137111518,
"grad_norm": 188483.515625,
"learning_rate": 4.0229885057471265e-05,
"loss": 1.7145,
"step": 2540
},
{
"epoch": 1.1654478976234004,
"grad_norm": 91915.40625,
"learning_rate": 4.018199233716475e-05,
"loss": 1.6617,
"step": 2550
},
{
"epoch": 1.170018281535649,
"grad_norm": 112711.40625,
"learning_rate": 4.013409961685824e-05,
"loss": 1.7117,
"step": 2560
},
{
"epoch": 1.1745886654478976,
"grad_norm": 81478.6953125,
"learning_rate": 4.0086206896551726e-05,
"loss": 1.8141,
"step": 2570
},
{
"epoch": 1.1791590493601463,
"grad_norm": 90161.96875,
"learning_rate": 4.003831417624521e-05,
"loss": 1.6604,
"step": 2580
},
{
"epoch": 1.1837294332723949,
"grad_norm": 67411.1953125,
"learning_rate": 3.99904214559387e-05,
"loss": 1.7204,
"step": 2590
},
{
"epoch": 1.1882998171846435,
"grad_norm": 104490.5234375,
"learning_rate": 3.9942528735632186e-05,
"loss": 1.7942,
"step": 2600
},
{
"epoch": 1.1928702010968921,
"grad_norm": 81397.3515625,
"learning_rate": 3.989463601532567e-05,
"loss": 1.6425,
"step": 2610
},
{
"epoch": 1.1974405850091407,
"grad_norm": 117060.71875,
"learning_rate": 3.984674329501916e-05,
"loss": 1.7771,
"step": 2620
},
{
"epoch": 1.2020109689213894,
"grad_norm": 80522.625,
"learning_rate": 3.979885057471265e-05,
"loss": 1.7167,
"step": 2630
},
{
"epoch": 1.206581352833638,
"grad_norm": 92349.40625,
"learning_rate": 3.9750957854406134e-05,
"loss": 1.6713,
"step": 2640
},
{
"epoch": 1.2111517367458866,
"grad_norm": 100128.2578125,
"learning_rate": 3.970306513409962e-05,
"loss": 1.7673,
"step": 2650
},
{
"epoch": 1.2157221206581352,
"grad_norm": 89113.0625,
"learning_rate": 3.965517241379311e-05,
"loss": 1.6187,
"step": 2660
},
{
"epoch": 1.2202925045703839,
"grad_norm": 124528.21875,
"learning_rate": 3.9607279693486594e-05,
"loss": 1.7173,
"step": 2670
},
{
"epoch": 1.2248628884826325,
"grad_norm": 105671.9765625,
"learning_rate": 3.955938697318008e-05,
"loss": 1.7232,
"step": 2680
},
{
"epoch": 1.229433272394881,
"grad_norm": 72920.734375,
"learning_rate": 3.951149425287357e-05,
"loss": 1.7218,
"step": 2690
},
{
"epoch": 1.2340036563071297,
"grad_norm": 89138.640625,
"learning_rate": 3.9463601532567055e-05,
"loss": 1.681,
"step": 2700
},
{
"epoch": 1.2385740402193783,
"grad_norm": 76915.9765625,
"learning_rate": 3.9415708812260535e-05,
"loss": 1.7417,
"step": 2710
},
{
"epoch": 1.2431444241316272,
"grad_norm": 76970.3359375,
"learning_rate": 3.936781609195402e-05,
"loss": 1.7513,
"step": 2720
},
{
"epoch": 1.2477148080438756,
"grad_norm": 102189.65625,
"learning_rate": 3.931992337164751e-05,
"loss": 1.6338,
"step": 2730
},
{
"epoch": 1.2522851919561244,
"grad_norm": 102939.9375,
"learning_rate": 3.9272030651340996e-05,
"loss": 1.7095,
"step": 2740
},
{
"epoch": 1.2568555758683728,
"grad_norm": 75103.9375,
"learning_rate": 3.922413793103448e-05,
"loss": 1.7029,
"step": 2750
},
{
"epoch": 1.2614259597806217,
"grad_norm": 118085.3125,
"learning_rate": 3.917624521072797e-05,
"loss": 1.741,
"step": 2760
},
{
"epoch": 1.26599634369287,
"grad_norm": 78790.3984375,
"learning_rate": 3.912835249042146e-05,
"loss": 1.739,
"step": 2770
},
{
"epoch": 1.270566727605119,
"grad_norm": 105775.8125,
"learning_rate": 3.908045977011495e-05,
"loss": 1.6434,
"step": 2780
},
{
"epoch": 1.2751371115173675,
"grad_norm": 78574.390625,
"learning_rate": 3.903256704980843e-05,
"loss": 1.7371,
"step": 2790
},
{
"epoch": 1.2797074954296161,
"grad_norm": 141283.359375,
"learning_rate": 3.898467432950192e-05,
"loss": 1.7781,
"step": 2800
},
{
"epoch": 1.2842778793418648,
"grad_norm": 111235.7265625,
"learning_rate": 3.8936781609195404e-05,
"loss": 1.7383,
"step": 2810
},
{
"epoch": 1.2888482632541134,
"grad_norm": 76845.9296875,
"learning_rate": 3.888888888888889e-05,
"loss": 1.6674,
"step": 2820
},
{
"epoch": 1.293418647166362,
"grad_norm": 95679.5,
"learning_rate": 3.884099616858238e-05,
"loss": 1.7516,
"step": 2830
},
{
"epoch": 1.2979890310786106,
"grad_norm": 69231.671875,
"learning_rate": 3.8793103448275865e-05,
"loss": 1.7267,
"step": 2840
},
{
"epoch": 1.3025594149908593,
"grad_norm": 90358.46875,
"learning_rate": 3.874521072796935e-05,
"loss": 1.7008,
"step": 2850
},
{
"epoch": 1.3071297989031079,
"grad_norm": 130772.9921875,
"learning_rate": 3.869731800766284e-05,
"loss": 1.7041,
"step": 2860
},
{
"epoch": 1.3117001828153565,
"grad_norm": 86227.6015625,
"learning_rate": 3.8649425287356325e-05,
"loss": 1.6709,
"step": 2870
},
{
"epoch": 1.3162705667276051,
"grad_norm": 108528.421875,
"learning_rate": 3.8601532567049805e-05,
"loss": 1.6854,
"step": 2880
},
{
"epoch": 1.3208409506398537,
"grad_norm": 96323.234375,
"learning_rate": 3.855363984674329e-05,
"loss": 1.7317,
"step": 2890
},
{
"epoch": 1.3254113345521024,
"grad_norm": 160365.8125,
"learning_rate": 3.850574712643678e-05,
"loss": 1.7585,
"step": 2900
},
{
"epoch": 1.329981718464351,
"grad_norm": 80294.25,
"learning_rate": 3.845785440613027e-05,
"loss": 1.7612,
"step": 2910
},
{
"epoch": 1.3345521023765996,
"grad_norm": 106545.8515625,
"learning_rate": 3.840996168582376e-05,
"loss": 1.7292,
"step": 2920
},
{
"epoch": 1.3391224862888482,
"grad_norm": 95112.4375,
"learning_rate": 3.8362068965517246e-05,
"loss": 1.7349,
"step": 2930
},
{
"epoch": 1.3436928702010968,
"grad_norm": 65303.0390625,
"learning_rate": 3.831417624521073e-05,
"loss": 1.7047,
"step": 2940
},
{
"epoch": 1.3482632541133455,
"grad_norm": 66884.5078125,
"learning_rate": 3.826628352490422e-05,
"loss": 1.7516,
"step": 2950
},
{
"epoch": 1.352833638025594,
"grad_norm": 87596.6796875,
"learning_rate": 3.82183908045977e-05,
"loss": 1.7047,
"step": 2960
},
{
"epoch": 1.3574040219378427,
"grad_norm": 87859.3671875,
"learning_rate": 3.817049808429119e-05,
"loss": 1.848,
"step": 2970
},
{
"epoch": 1.3619744058500913,
"grad_norm": 78479.203125,
"learning_rate": 3.8122605363984674e-05,
"loss": 1.7104,
"step": 2980
},
{
"epoch": 1.3665447897623402,
"grad_norm": 67868.75,
"learning_rate": 3.807471264367816e-05,
"loss": 1.7818,
"step": 2990
},
{
"epoch": 1.3711151736745886,
"grad_norm": 108708.4921875,
"learning_rate": 3.802681992337165e-05,
"loss": 1.6592,
"step": 3000
},
{
"epoch": 1.3756855575868374,
"grad_norm": 49919.5,
"learning_rate": 3.7978927203065135e-05,
"loss": 1.8281,
"step": 3010
},
{
"epoch": 1.3802559414990858,
"grad_norm": 92234.84375,
"learning_rate": 3.793103448275862e-05,
"loss": 1.6741,
"step": 3020
},
{
"epoch": 1.3848263254113347,
"grad_norm": 110063.28125,
"learning_rate": 3.788314176245211e-05,
"loss": 1.6882,
"step": 3030
},
{
"epoch": 1.389396709323583,
"grad_norm": 99851.5,
"learning_rate": 3.7835249042145595e-05,
"loss": 1.7069,
"step": 3040
},
{
"epoch": 1.393967093235832,
"grad_norm": 91330.5,
"learning_rate": 3.778735632183908e-05,
"loss": 1.7738,
"step": 3050
},
{
"epoch": 1.3985374771480805,
"grad_norm": 97720.6953125,
"learning_rate": 3.773946360153257e-05,
"loss": 1.699,
"step": 3060
},
{
"epoch": 1.4031078610603291,
"grad_norm": 60284.0859375,
"learning_rate": 3.7691570881226056e-05,
"loss": 1.6353,
"step": 3070
},
{
"epoch": 1.4076782449725778,
"grad_norm": 110682.34375,
"learning_rate": 3.764367816091954e-05,
"loss": 1.6963,
"step": 3080
},
{
"epoch": 1.4122486288848264,
"grad_norm": 144162.453125,
"learning_rate": 3.759578544061303e-05,
"loss": 1.7995,
"step": 3090
},
{
"epoch": 1.416819012797075,
"grad_norm": 205120.046875,
"learning_rate": 3.7547892720306517e-05,
"loss": 1.796,
"step": 3100
},
{
"epoch": 1.4213893967093236,
"grad_norm": 144576.359375,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.663,
"step": 3110
},
{
"epoch": 1.4259597806215722,
"grad_norm": 103083.203125,
"learning_rate": 3.745210727969349e-05,
"loss": 1.6979,
"step": 3120
},
{
"epoch": 1.4305301645338209,
"grad_norm": 100059.3046875,
"learning_rate": 3.740421455938698e-05,
"loss": 1.7823,
"step": 3130
},
{
"epoch": 1.4351005484460695,
"grad_norm": 128589.3046875,
"learning_rate": 3.735632183908046e-05,
"loss": 1.7182,
"step": 3140
},
{
"epoch": 1.4396709323583181,
"grad_norm": 70812.890625,
"learning_rate": 3.7308429118773944e-05,
"loss": 1.7785,
"step": 3150
},
{
"epoch": 1.4442413162705667,
"grad_norm": 63189.20703125,
"learning_rate": 3.726053639846743e-05,
"loss": 1.7257,
"step": 3160
},
{
"epoch": 1.4488117001828154,
"grad_norm": 108848.34375,
"learning_rate": 3.721264367816092e-05,
"loss": 1.7409,
"step": 3170
},
{
"epoch": 1.453382084095064,
"grad_norm": 97378.515625,
"learning_rate": 3.716475095785441e-05,
"loss": 1.7078,
"step": 3180
},
{
"epoch": 1.4579524680073126,
"grad_norm": 101953.046875,
"learning_rate": 3.71168582375479e-05,
"loss": 1.8016,
"step": 3190
},
{
"epoch": 1.4625228519195612,
"grad_norm": 118171.328125,
"learning_rate": 3.7068965517241385e-05,
"loss": 1.719,
"step": 3200
},
{
"epoch": 1.4670932358318098,
"grad_norm": 108193.546875,
"learning_rate": 3.702107279693487e-05,
"loss": 1.7423,
"step": 3210
},
{
"epoch": 1.4716636197440585,
"grad_norm": 83211.53125,
"learning_rate": 3.697318007662835e-05,
"loss": 1.6915,
"step": 3220
},
{
"epoch": 1.476234003656307,
"grad_norm": 118025.546875,
"learning_rate": 3.692528735632184e-05,
"loss": 1.7542,
"step": 3230
},
{
"epoch": 1.4808043875685557,
"grad_norm": 79268.8828125,
"learning_rate": 3.6877394636015326e-05,
"loss": 1.7558,
"step": 3240
},
{
"epoch": 1.4853747714808043,
"grad_norm": 91756.1015625,
"learning_rate": 3.682950191570881e-05,
"loss": 1.6653,
"step": 3250
},
{
"epoch": 1.489945155393053,
"grad_norm": 114188.828125,
"learning_rate": 3.67816091954023e-05,
"loss": 1.7899,
"step": 3260
},
{
"epoch": 1.4945155393053016,
"grad_norm": 122504.4921875,
"learning_rate": 3.673371647509579e-05,
"loss": 1.8254,
"step": 3270
},
{
"epoch": 1.4990859232175504,
"grad_norm": 68376.1640625,
"learning_rate": 3.6685823754789274e-05,
"loss": 1.7274,
"step": 3280
},
{
"epoch": 1.5036563071297988,
"grad_norm": 118541.71875,
"learning_rate": 3.663793103448276e-05,
"loss": 1.7456,
"step": 3290
},
{
"epoch": 1.5082266910420477,
"grad_norm": 62813.5078125,
"learning_rate": 3.659003831417625e-05,
"loss": 1.7049,
"step": 3300
},
{
"epoch": 1.512797074954296,
"grad_norm": 107603.8515625,
"learning_rate": 3.6542145593869734e-05,
"loss": 1.6411,
"step": 3310
},
{
"epoch": 1.517367458866545,
"grad_norm": 66405.75,
"learning_rate": 3.649425287356322e-05,
"loss": 1.6928,
"step": 3320
},
{
"epoch": 1.5219378427787933,
"grad_norm": 77770.8125,
"learning_rate": 3.644636015325671e-05,
"loss": 1.6668,
"step": 3330
},
{
"epoch": 1.5265082266910421,
"grad_norm": 96171.3359375,
"learning_rate": 3.6398467432950195e-05,
"loss": 1.7354,
"step": 3340
},
{
"epoch": 1.5310786106032905,
"grad_norm": 114372.53125,
"learning_rate": 3.635057471264368e-05,
"loss": 1.6588,
"step": 3350
},
{
"epoch": 1.5356489945155394,
"grad_norm": 126977.671875,
"learning_rate": 3.630268199233717e-05,
"loss": 1.7437,
"step": 3360
},
{
"epoch": 1.5402193784277878,
"grad_norm": 124899.46875,
"learning_rate": 3.6254789272030656e-05,
"loss": 1.7381,
"step": 3370
},
{
"epoch": 1.5447897623400366,
"grad_norm": 87250.2109375,
"learning_rate": 3.620689655172414e-05,
"loss": 1.7489,
"step": 3380
},
{
"epoch": 1.5493601462522852,
"grad_norm": 99225.671875,
"learning_rate": 3.615900383141763e-05,
"loss": 1.7005,
"step": 3390
},
{
"epoch": 1.5539305301645339,
"grad_norm": 110436.3515625,
"learning_rate": 3.611111111111111e-05,
"loss": 1.7526,
"step": 3400
},
{
"epoch": 1.5585009140767825,
"grad_norm": 73272.2421875,
"learning_rate": 3.6063218390804596e-05,
"loss": 1.6412,
"step": 3410
},
{
"epoch": 1.563071297989031,
"grad_norm": 83162.109375,
"learning_rate": 3.601532567049808e-05,
"loss": 1.7777,
"step": 3420
},
{
"epoch": 1.5676416819012797,
"grad_norm": 65596.4609375,
"learning_rate": 3.596743295019157e-05,
"loss": 1.7165,
"step": 3430
},
{
"epoch": 1.5722120658135283,
"grad_norm": 87716.9375,
"learning_rate": 3.591954022988506e-05,
"loss": 1.7015,
"step": 3440
},
{
"epoch": 1.576782449725777,
"grad_norm": 108148.75,
"learning_rate": 3.5871647509578544e-05,
"loss": 1.6684,
"step": 3450
},
{
"epoch": 1.5813528336380256,
"grad_norm": 79430.3203125,
"learning_rate": 3.582375478927204e-05,
"loss": 1.6243,
"step": 3460
},
{
"epoch": 1.5859232175502742,
"grad_norm": 102633.3046875,
"learning_rate": 3.5775862068965524e-05,
"loss": 1.7009,
"step": 3470
},
{
"epoch": 1.5904936014625228,
"grad_norm": 116884.0546875,
"learning_rate": 3.5727969348659004e-05,
"loss": 1.6794,
"step": 3480
},
{
"epoch": 1.5950639853747715,
"grad_norm": 123101.0078125,
"learning_rate": 3.568007662835249e-05,
"loss": 1.7216,
"step": 3490
},
{
"epoch": 1.59963436928702,
"grad_norm": 81439.2734375,
"learning_rate": 3.563218390804598e-05,
"loss": 1.6748,
"step": 3500
},
{
"epoch": 1.6042047531992687,
"grad_norm": 101444.484375,
"learning_rate": 3.5584291187739465e-05,
"loss": 1.8176,
"step": 3510
},
{
"epoch": 1.6087751371115173,
"grad_norm": 76834.7578125,
"learning_rate": 3.553639846743295e-05,
"loss": 1.6932,
"step": 3520
},
{
"epoch": 1.6133455210237662,
"grad_norm": 102102.7265625,
"learning_rate": 3.548850574712644e-05,
"loss": 1.7848,
"step": 3530
},
{
"epoch": 1.6179159049360146,
"grad_norm": 65518.3984375,
"learning_rate": 3.5440613026819926e-05,
"loss": 1.6966,
"step": 3540
},
{
"epoch": 1.6224862888482634,
"grad_norm": 72790.3359375,
"learning_rate": 3.539272030651341e-05,
"loss": 1.6584,
"step": 3550
},
{
"epoch": 1.6270566727605118,
"grad_norm": 73727.9140625,
"learning_rate": 3.53448275862069e-05,
"loss": 1.7437,
"step": 3560
},
{
"epoch": 1.6316270566727606,
"grad_norm": 91062.0859375,
"learning_rate": 3.529693486590038e-05,
"loss": 1.6678,
"step": 3570
},
{
"epoch": 1.636197440585009,
"grad_norm": 172251.484375,
"learning_rate": 3.5249042145593867e-05,
"loss": 1.7394,
"step": 3580
},
{
"epoch": 1.6407678244972579,
"grad_norm": 85787.4140625,
"learning_rate": 3.5201149425287353e-05,
"loss": 1.6567,
"step": 3590
},
{
"epoch": 1.6453382084095063,
"grad_norm": 96389.1171875,
"learning_rate": 3.515325670498085e-05,
"loss": 1.7427,
"step": 3600
},
{
"epoch": 1.6499085923217551,
"grad_norm": 69201.625,
"learning_rate": 3.5105363984674334e-05,
"loss": 1.7775,
"step": 3610
},
{
"epoch": 1.6544789762340035,
"grad_norm": 100617.40625,
"learning_rate": 3.505747126436782e-05,
"loss": 1.7371,
"step": 3620
},
{
"epoch": 1.6590493601462524,
"grad_norm": 96952.21875,
"learning_rate": 3.500957854406131e-05,
"loss": 1.7269,
"step": 3630
},
{
"epoch": 1.6636197440585008,
"grad_norm": 74618.84375,
"learning_rate": 3.4961685823754795e-05,
"loss": 1.6758,
"step": 3640
},
{
"epoch": 1.6681901279707496,
"grad_norm": 57622.64453125,
"learning_rate": 3.4913793103448275e-05,
"loss": 1.6987,
"step": 3650
},
{
"epoch": 1.672760511882998,
"grad_norm": 102405.7421875,
"learning_rate": 3.486590038314176e-05,
"loss": 1.755,
"step": 3660
},
{
"epoch": 1.6773308957952469,
"grad_norm": 99707.1953125,
"learning_rate": 3.481800766283525e-05,
"loss": 1.6318,
"step": 3670
},
{
"epoch": 1.6819012797074955,
"grad_norm": 84351.671875,
"learning_rate": 3.4770114942528735e-05,
"loss": 1.7005,
"step": 3680
},
{
"epoch": 1.686471663619744,
"grad_norm": 92737.875,
"learning_rate": 3.472222222222222e-05,
"loss": 1.6547,
"step": 3690
},
{
"epoch": 1.6910420475319927,
"grad_norm": 117513.71875,
"learning_rate": 3.467432950191571e-05,
"loss": 1.6817,
"step": 3700
},
{
"epoch": 1.6956124314442413,
"grad_norm": 78983.3515625,
"learning_rate": 3.4626436781609196e-05,
"loss": 1.7011,
"step": 3710
},
{
"epoch": 1.70018281535649,
"grad_norm": 160552.734375,
"learning_rate": 3.457854406130268e-05,
"loss": 1.7277,
"step": 3720
},
{
"epoch": 1.7047531992687386,
"grad_norm": 69111.1328125,
"learning_rate": 3.453065134099617e-05,
"loss": 1.7326,
"step": 3730
},
{
"epoch": 1.7093235831809872,
"grad_norm": 106880.96875,
"learning_rate": 3.4482758620689657e-05,
"loss": 1.7453,
"step": 3740
},
{
"epoch": 1.7138939670932358,
"grad_norm": 117654.46875,
"learning_rate": 3.4434865900383143e-05,
"loss": 1.8011,
"step": 3750
},
{
"epoch": 1.7184643510054844,
"grad_norm": 85585.7265625,
"learning_rate": 3.438697318007663e-05,
"loss": 1.7664,
"step": 3760
},
{
"epoch": 1.723034734917733,
"grad_norm": 77661.5078125,
"learning_rate": 3.433908045977012e-05,
"loss": 1.6859,
"step": 3770
},
{
"epoch": 1.7276051188299817,
"grad_norm": 106292.359375,
"learning_rate": 3.4291187739463604e-05,
"loss": 1.7482,
"step": 3780
},
{
"epoch": 1.7321755027422303,
"grad_norm": 98747.6171875,
"learning_rate": 3.424329501915709e-05,
"loss": 1.7234,
"step": 3790
},
{
"epoch": 1.736745886654479,
"grad_norm": 155160.953125,
"learning_rate": 3.419540229885058e-05,
"loss": 1.8456,
"step": 3800
},
{
"epoch": 1.7413162705667276,
"grad_norm": 94582.5546875,
"learning_rate": 3.4147509578544065e-05,
"loss": 1.8332,
"step": 3810
},
{
"epoch": 1.7458866544789764,
"grad_norm": 96502.8359375,
"learning_rate": 3.409961685823755e-05,
"loss": 1.8145,
"step": 3820
},
{
"epoch": 1.7504570383912248,
"grad_norm": 87164.46875,
"learning_rate": 3.405172413793103e-05,
"loss": 1.6959,
"step": 3830
},
{
"epoch": 1.7550274223034736,
"grad_norm": 94143.84375,
"learning_rate": 3.400383141762452e-05,
"loss": 1.7662,
"step": 3840
},
{
"epoch": 1.759597806215722,
"grad_norm": 69526.3671875,
"learning_rate": 3.3955938697318005e-05,
"loss": 1.6818,
"step": 3850
},
{
"epoch": 1.7641681901279709,
"grad_norm": 91595.3359375,
"learning_rate": 3.390804597701149e-05,
"loss": 1.6884,
"step": 3860
},
{
"epoch": 1.7687385740402193,
"grad_norm": 97468.0234375,
"learning_rate": 3.3860153256704986e-05,
"loss": 1.7608,
"step": 3870
},
{
"epoch": 1.7733089579524681,
"grad_norm": 138455.578125,
"learning_rate": 3.381226053639847e-05,
"loss": 1.6528,
"step": 3880
},
{
"epoch": 1.7778793418647165,
"grad_norm": 107855.234375,
"learning_rate": 3.376436781609196e-05,
"loss": 1.7178,
"step": 3890
},
{
"epoch": 1.7824497257769654,
"grad_norm": 105021.2421875,
"learning_rate": 3.371647509578545e-05,
"loss": 1.6807,
"step": 3900
},
{
"epoch": 1.7870201096892138,
"grad_norm": 252806.71875,
"learning_rate": 3.366858237547893e-05,
"loss": 1.6723,
"step": 3910
},
{
"epoch": 1.7915904936014626,
"grad_norm": 116308.34375,
"learning_rate": 3.3620689655172414e-05,
"loss": 1.7991,
"step": 3920
},
{
"epoch": 1.796160877513711,
"grad_norm": 86454.4375,
"learning_rate": 3.35727969348659e-05,
"loss": 1.7825,
"step": 3930
},
{
"epoch": 1.8007312614259599,
"grad_norm": 71364.28125,
"learning_rate": 3.352490421455939e-05,
"loss": 1.769,
"step": 3940
},
{
"epoch": 1.8053016453382082,
"grad_norm": 70699.7265625,
"learning_rate": 3.3477011494252874e-05,
"loss": 1.7765,
"step": 3950
},
{
"epoch": 1.809872029250457,
"grad_norm": 58119.38671875,
"learning_rate": 3.342911877394636e-05,
"loss": 1.6735,
"step": 3960
},
{
"epoch": 1.8144424131627057,
"grad_norm": 89724.2578125,
"learning_rate": 3.338122605363985e-05,
"loss": 1.7261,
"step": 3970
},
{
"epoch": 1.8190127970749543,
"grad_norm": 103765.2421875,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.7276,
"step": 3980
},
{
"epoch": 1.823583180987203,
"grad_norm": 108591.3046875,
"learning_rate": 3.328544061302682e-05,
"loss": 1.6897,
"step": 3990
},
{
"epoch": 1.8281535648994516,
"grad_norm": 70272.8046875,
"learning_rate": 3.323754789272031e-05,
"loss": 1.6812,
"step": 4000
},
{
"epoch": 1.8327239488117002,
"grad_norm": 71892.390625,
"learning_rate": 3.3189655172413796e-05,
"loss": 1.7387,
"step": 4010
},
{
"epoch": 1.8372943327239488,
"grad_norm": 73947.421875,
"learning_rate": 3.314176245210728e-05,
"loss": 1.6488,
"step": 4020
},
{
"epoch": 1.8418647166361974,
"grad_norm": 90722.765625,
"learning_rate": 3.309386973180077e-05,
"loss": 1.7028,
"step": 4030
},
{
"epoch": 1.846435100548446,
"grad_norm": 106649.1484375,
"learning_rate": 3.3045977011494256e-05,
"loss": 1.7118,
"step": 4040
},
{
"epoch": 1.8510054844606947,
"grad_norm": 79884.34375,
"learning_rate": 3.299808429118774e-05,
"loss": 1.6838,
"step": 4050
},
{
"epoch": 1.8555758683729433,
"grad_norm": 133383.34375,
"learning_rate": 3.295019157088123e-05,
"loss": 1.7038,
"step": 4060
},
{
"epoch": 1.860146252285192,
"grad_norm": 93182.28125,
"learning_rate": 3.290229885057472e-05,
"loss": 1.6335,
"step": 4070
},
{
"epoch": 1.8647166361974405,
"grad_norm": 57547.94921875,
"learning_rate": 3.2854406130268204e-05,
"loss": 1.7422,
"step": 4080
},
{
"epoch": 1.8692870201096892,
"grad_norm": 95853.7734375,
"learning_rate": 3.2806513409961684e-05,
"loss": 1.775,
"step": 4090
},
{
"epoch": 1.8738574040219378,
"grad_norm": 144561.4375,
"learning_rate": 3.275862068965517e-05,
"loss": 1.8058,
"step": 4100
},
{
"epoch": 1.8784277879341866,
"grad_norm": 108504.7734375,
"learning_rate": 3.271072796934866e-05,
"loss": 1.7476,
"step": 4110
},
{
"epoch": 1.882998171846435,
"grad_norm": 94651.2109375,
"learning_rate": 3.2662835249042144e-05,
"loss": 1.7199,
"step": 4120
},
{
"epoch": 1.8875685557586839,
"grad_norm": 70529.734375,
"learning_rate": 3.261494252873563e-05,
"loss": 1.7404,
"step": 4130
},
{
"epoch": 1.8921389396709323,
"grad_norm": 61164.6796875,
"learning_rate": 3.256704980842912e-05,
"loss": 1.6635,
"step": 4140
},
{
"epoch": 1.8967093235831811,
"grad_norm": 114792.84375,
"learning_rate": 3.251915708812261e-05,
"loss": 1.7986,
"step": 4150
},
{
"epoch": 1.9012797074954295,
"grad_norm": 72610.09375,
"learning_rate": 3.24712643678161e-05,
"loss": 1.6614,
"step": 4160
},
{
"epoch": 1.9058500914076784,
"grad_norm": 85551.8828125,
"learning_rate": 3.242337164750958e-05,
"loss": 1.6255,
"step": 4170
},
{
"epoch": 1.9104204753199268,
"grad_norm": 109005.5546875,
"learning_rate": 3.2375478927203066e-05,
"loss": 1.6897,
"step": 4180
},
{
"epoch": 1.9149908592321756,
"grad_norm": 107019.8046875,
"learning_rate": 3.232758620689655e-05,
"loss": 1.732,
"step": 4190
},
{
"epoch": 1.919561243144424,
"grad_norm": 78904.2109375,
"learning_rate": 3.227969348659004e-05,
"loss": 1.7201,
"step": 4200
},
{
"epoch": 1.9241316270566728,
"grad_norm": 64701.79296875,
"learning_rate": 3.2231800766283526e-05,
"loss": 1.6952,
"step": 4210
},
{
"epoch": 1.9287020109689212,
"grad_norm": 64434.64453125,
"learning_rate": 3.218390804597701e-05,
"loss": 1.6946,
"step": 4220
},
{
"epoch": 1.93327239488117,
"grad_norm": 115172.6171875,
"learning_rate": 3.21360153256705e-05,
"loss": 1.6573,
"step": 4230
},
{
"epoch": 1.9378427787934185,
"grad_norm": 82583.25,
"learning_rate": 3.208812260536399e-05,
"loss": 1.7499,
"step": 4240
},
{
"epoch": 1.9424131627056673,
"grad_norm": 93962.40625,
"learning_rate": 3.2040229885057474e-05,
"loss": 1.7036,
"step": 4250
},
{
"epoch": 1.946983546617916,
"grad_norm": 81828.8203125,
"learning_rate": 3.1992337164750954e-05,
"loss": 1.6486,
"step": 4260
},
{
"epoch": 1.9515539305301646,
"grad_norm": 109928.0546875,
"learning_rate": 3.194444444444444e-05,
"loss": 1.7805,
"step": 4270
},
{
"epoch": 1.9561243144424132,
"grad_norm": 100452.4765625,
"learning_rate": 3.1896551724137935e-05,
"loss": 1.6897,
"step": 4280
},
{
"epoch": 1.9606946983546618,
"grad_norm": 53400.49609375,
"learning_rate": 3.184865900383142e-05,
"loss": 1.681,
"step": 4290
},
{
"epoch": 1.9652650822669104,
"grad_norm": 104313.296875,
"learning_rate": 3.180076628352491e-05,
"loss": 1.737,
"step": 4300
},
{
"epoch": 1.969835466179159,
"grad_norm": 92321.3515625,
"learning_rate": 3.1752873563218395e-05,
"loss": 1.6844,
"step": 4310
},
{
"epoch": 1.9744058500914077,
"grad_norm": 87737.4140625,
"learning_rate": 3.170498084291188e-05,
"loss": 1.7712,
"step": 4320
},
{
"epoch": 1.9789762340036563,
"grad_norm": 108424.9453125,
"learning_rate": 3.165708812260537e-05,
"loss": 1.7173,
"step": 4330
},
{
"epoch": 1.983546617915905,
"grad_norm": 79022.0546875,
"learning_rate": 3.160919540229885e-05,
"loss": 1.668,
"step": 4340
},
{
"epoch": 1.9881170018281535,
"grad_norm": 67820.46875,
"learning_rate": 3.1561302681992336e-05,
"loss": 1.7499,
"step": 4350
},
{
"epoch": 1.9926873857404022,
"grad_norm": 56711.33984375,
"learning_rate": 3.151340996168582e-05,
"loss": 1.6331,
"step": 4360
},
{
"epoch": 1.9972577696526508,
"grad_norm": 137476.015625,
"learning_rate": 3.146551724137931e-05,
"loss": 1.6912,
"step": 4370
},
{
"epoch": 2.0,
"eval_loss": 1.693359375,
"eval_runtime": 345.1475,
"eval_samples_per_second": 43.46,
"eval_steps_per_second": 1.359,
"step": 4376
},
{
"epoch": 2.0018281535648996,
"grad_norm": 95937.5703125,
"learning_rate": 3.1417624521072797e-05,
"loss": 1.7171,
"step": 4380
},
{
"epoch": 2.006398537477148,
"grad_norm": 69800.3515625,
"learning_rate": 3.1369731800766283e-05,
"loss": 1.698,
"step": 4390
},
{
"epoch": 2.010968921389397,
"grad_norm": 74739.84375,
"learning_rate": 3.132183908045977e-05,
"loss": 1.6507,
"step": 4400
},
{
"epoch": 2.0155393053016453,
"grad_norm": 82305.1484375,
"learning_rate": 3.127394636015326e-05,
"loss": 1.7155,
"step": 4410
},
{
"epoch": 2.020109689213894,
"grad_norm": 62019.15625,
"learning_rate": 3.1226053639846744e-05,
"loss": 1.7122,
"step": 4420
},
{
"epoch": 2.0246800731261425,
"grad_norm": 63863.31640625,
"learning_rate": 3.117816091954023e-05,
"loss": 1.7085,
"step": 4430
},
{
"epoch": 2.0292504570383914,
"grad_norm": 51309.7109375,
"learning_rate": 3.113026819923372e-05,
"loss": 1.7576,
"step": 4440
},
{
"epoch": 2.0338208409506398,
"grad_norm": 55412.6796875,
"learning_rate": 3.1082375478927205e-05,
"loss": 1.709,
"step": 4450
},
{
"epoch": 2.0383912248628886,
"grad_norm": 107907.9609375,
"learning_rate": 3.103448275862069e-05,
"loss": 1.771,
"step": 4460
},
{
"epoch": 2.042961608775137,
"grad_norm": 73671.1796875,
"learning_rate": 3.098659003831418e-05,
"loss": 1.7703,
"step": 4470
},
{
"epoch": 2.047531992687386,
"grad_norm": 59534.9765625,
"learning_rate": 3.0938697318007665e-05,
"loss": 1.7293,
"step": 4480
},
{
"epoch": 2.0521023765996342,
"grad_norm": 133561.09375,
"learning_rate": 3.089080459770115e-05,
"loss": 1.7752,
"step": 4490
},
{
"epoch": 2.056672760511883,
"grad_norm": 133462.234375,
"learning_rate": 3.084291187739464e-05,
"loss": 1.7729,
"step": 4500
},
{
"epoch": 2.0612431444241315,
"grad_norm": 70087.765625,
"learning_rate": 3.0795019157088126e-05,
"loss": 1.7831,
"step": 4510
},
{
"epoch": 2.0658135283363803,
"grad_norm": 141446.296875,
"learning_rate": 3.0747126436781606e-05,
"loss": 1.7614,
"step": 4520
},
{
"epoch": 2.0703839122486287,
"grad_norm": 83022.75,
"learning_rate": 3.069923371647509e-05,
"loss": 1.5773,
"step": 4530
},
{
"epoch": 2.0749542961608776,
"grad_norm": 81699.2578125,
"learning_rate": 3.065134099616858e-05,
"loss": 1.7204,
"step": 4540
},
{
"epoch": 2.079524680073126,
"grad_norm": 73930.953125,
"learning_rate": 3.060344827586207e-05,
"loss": 1.781,
"step": 4550
},
{
"epoch": 2.084095063985375,
"grad_norm": 99254.2890625,
"learning_rate": 3.055555555555556e-05,
"loss": 1.6642,
"step": 4560
},
{
"epoch": 2.088665447897623,
"grad_norm": 84721.40625,
"learning_rate": 3.0507662835249047e-05,
"loss": 1.6594,
"step": 4570
},
{
"epoch": 2.093235831809872,
"grad_norm": 78780.6015625,
"learning_rate": 3.045977011494253e-05,
"loss": 1.7436,
"step": 4580
},
{
"epoch": 2.0978062157221204,
"grad_norm": 74259.4921875,
"learning_rate": 3.0411877394636018e-05,
"loss": 1.7986,
"step": 4590
},
{
"epoch": 2.1023765996343693,
"grad_norm": 116283.8671875,
"learning_rate": 3.0363984674329505e-05,
"loss": 1.7248,
"step": 4600
},
{
"epoch": 2.106946983546618,
"grad_norm": 96815.84375,
"learning_rate": 3.031609195402299e-05,
"loss": 1.763,
"step": 4610
},
{
"epoch": 2.1115173674588665,
"grad_norm": 61044.22265625,
"learning_rate": 3.0268199233716475e-05,
"loss": 1.7655,
"step": 4620
},
{
"epoch": 2.1160877513711154,
"grad_norm": 75176.796875,
"learning_rate": 3.0220306513409962e-05,
"loss": 1.7197,
"step": 4630
},
{
"epoch": 2.1206581352833638,
"grad_norm": 100533.109375,
"learning_rate": 3.017241379310345e-05,
"loss": 1.6475,
"step": 4640
},
{
"epoch": 2.1252285191956126,
"grad_norm": 79551.5234375,
"learning_rate": 3.0124521072796936e-05,
"loss": 1.6468,
"step": 4650
},
{
"epoch": 2.129798903107861,
"grad_norm": 74119.8515625,
"learning_rate": 3.0076628352490422e-05,
"loss": 1.7115,
"step": 4660
},
{
"epoch": 2.13436928702011,
"grad_norm": 62293.6953125,
"learning_rate": 3.0028735632183906e-05,
"loss": 1.7507,
"step": 4670
},
{
"epoch": 2.1389396709323583,
"grad_norm": 66430.1953125,
"learning_rate": 2.9980842911877393e-05,
"loss": 1.6552,
"step": 4680
},
{
"epoch": 2.143510054844607,
"grad_norm": 70040.5390625,
"learning_rate": 2.9932950191570886e-05,
"loss": 1.7662,
"step": 4690
},
{
"epoch": 2.1480804387568555,
"grad_norm": 94635.6796875,
"learning_rate": 2.988505747126437e-05,
"loss": 1.7736,
"step": 4700
},
{
"epoch": 2.1526508226691043,
"grad_norm": 79885.9453125,
"learning_rate": 2.9837164750957857e-05,
"loss": 1.7164,
"step": 4710
},
{
"epoch": 2.1572212065813527,
"grad_norm": 81728.3046875,
"learning_rate": 2.9789272030651344e-05,
"loss": 1.7839,
"step": 4720
},
{
"epoch": 2.1617915904936016,
"grad_norm": 95488.8984375,
"learning_rate": 2.974137931034483e-05,
"loss": 1.7674,
"step": 4730
},
{
"epoch": 2.16636197440585,
"grad_norm": 109054.171875,
"learning_rate": 2.9693486590038317e-05,
"loss": 1.6966,
"step": 4740
},
{
"epoch": 2.170932358318099,
"grad_norm": 79046.40625,
"learning_rate": 2.96455938697318e-05,
"loss": 1.6621,
"step": 4750
},
{
"epoch": 2.1755027422303472,
"grad_norm": 102101.4765625,
"learning_rate": 2.9597701149425288e-05,
"loss": 1.7391,
"step": 4760
},
{
"epoch": 2.180073126142596,
"grad_norm": 83587.875,
"learning_rate": 2.9549808429118775e-05,
"loss": 1.8232,
"step": 4770
},
{
"epoch": 2.1846435100548445,
"grad_norm": 104632.4921875,
"learning_rate": 2.950191570881226e-05,
"loss": 1.6575,
"step": 4780
},
{
"epoch": 2.1892138939670933,
"grad_norm": 72859.53125,
"learning_rate": 2.945402298850575e-05,
"loss": 1.7205,
"step": 4790
},
{
"epoch": 2.1937842778793417,
"grad_norm": 92640.796875,
"learning_rate": 2.9406130268199232e-05,
"loss": 1.725,
"step": 4800
},
{
"epoch": 2.1983546617915906,
"grad_norm": 246298.65625,
"learning_rate": 2.935823754789272e-05,
"loss": 1.7163,
"step": 4810
},
{
"epoch": 2.202925045703839,
"grad_norm": 97907.3828125,
"learning_rate": 2.9310344827586206e-05,
"loss": 1.6475,
"step": 4820
},
{
"epoch": 2.207495429616088,
"grad_norm": 85840.625,
"learning_rate": 2.9262452107279696e-05,
"loss": 1.7121,
"step": 4830
},
{
"epoch": 2.212065813528336,
"grad_norm": 72967.8984375,
"learning_rate": 2.9214559386973183e-05,
"loss": 1.7438,
"step": 4840
},
{
"epoch": 2.216636197440585,
"grad_norm": 75571.765625,
"learning_rate": 2.916666666666667e-05,
"loss": 1.6586,
"step": 4850
},
{
"epoch": 2.2212065813528334,
"grad_norm": 65606.75,
"learning_rate": 2.9118773946360157e-05,
"loss": 1.7405,
"step": 4860
},
{
"epoch": 2.2257769652650823,
"grad_norm": 81884.3828125,
"learning_rate": 2.9070881226053644e-05,
"loss": 1.7318,
"step": 4870
},
{
"epoch": 2.2303473491773307,
"grad_norm": 66072.0703125,
"learning_rate": 2.9022988505747127e-05,
"loss": 1.7428,
"step": 4880
},
{
"epoch": 2.2349177330895795,
"grad_norm": 154896.03125,
"learning_rate": 2.8975095785440614e-05,
"loss": 1.7548,
"step": 4890
},
{
"epoch": 2.2394881170018284,
"grad_norm": 75736.953125,
"learning_rate": 2.89272030651341e-05,
"loss": 1.7215,
"step": 4900
},
{
"epoch": 2.2440585009140768,
"grad_norm": 67909.71875,
"learning_rate": 2.8879310344827588e-05,
"loss": 1.6336,
"step": 4910
},
{
"epoch": 2.2486288848263256,
"grad_norm": 89797.1953125,
"learning_rate": 2.8831417624521075e-05,
"loss": 1.7273,
"step": 4920
},
{
"epoch": 2.253199268738574,
"grad_norm": 85214.125,
"learning_rate": 2.8783524904214558e-05,
"loss": 1.7144,
"step": 4930
},
{
"epoch": 2.257769652650823,
"grad_norm": 76317.734375,
"learning_rate": 2.8735632183908045e-05,
"loss": 1.7784,
"step": 4940
},
{
"epoch": 2.2623400365630713,
"grad_norm": 49163.828125,
"learning_rate": 2.8687739463601532e-05,
"loss": 1.7026,
"step": 4950
},
{
"epoch": 2.26691042047532,
"grad_norm": 107379.171875,
"learning_rate": 2.863984674329502e-05,
"loss": 1.7803,
"step": 4960
},
{
"epoch": 2.2714808043875685,
"grad_norm": 69726.2734375,
"learning_rate": 2.859195402298851e-05,
"loss": 1.7006,
"step": 4970
},
{
"epoch": 2.2760511882998173,
"grad_norm": 68964.7265625,
"learning_rate": 2.8544061302681996e-05,
"loss": 1.7053,
"step": 4980
},
{
"epoch": 2.2806215722120657,
"grad_norm": 67173.03125,
"learning_rate": 2.8496168582375483e-05,
"loss": 1.6936,
"step": 4990
},
{
"epoch": 2.2851919561243146,
"grad_norm": 78151.328125,
"learning_rate": 2.844827586206897e-05,
"loss": 1.7376,
"step": 5000
},
{
"epoch": 2.289762340036563,
"grad_norm": 147513.4375,
"learning_rate": 2.8400383141762453e-05,
"loss": 1.7627,
"step": 5010
},
{
"epoch": 2.294332723948812,
"grad_norm": 104835.921875,
"learning_rate": 2.835249042145594e-05,
"loss": 1.6425,
"step": 5020
},
{
"epoch": 2.2989031078610602,
"grad_norm": 120597.7109375,
"learning_rate": 2.8304597701149427e-05,
"loss": 1.7738,
"step": 5030
},
{
"epoch": 2.303473491773309,
"grad_norm": 65482.203125,
"learning_rate": 2.8256704980842914e-05,
"loss": 1.6205,
"step": 5040
},
{
"epoch": 2.3080438756855575,
"grad_norm": 71476.0078125,
"learning_rate": 2.82088122605364e-05,
"loss": 1.6699,
"step": 5050
},
{
"epoch": 2.3126142595978063,
"grad_norm": 85771.3515625,
"learning_rate": 2.8160919540229884e-05,
"loss": 1.697,
"step": 5060
},
{
"epoch": 2.3171846435100547,
"grad_norm": 63680.30859375,
"learning_rate": 2.811302681992337e-05,
"loss": 1.6754,
"step": 5070
},
{
"epoch": 2.3217550274223036,
"grad_norm": 152300.34375,
"learning_rate": 2.8065134099616858e-05,
"loss": 1.6722,
"step": 5080
},
{
"epoch": 2.326325411334552,
"grad_norm": 88300.703125,
"learning_rate": 2.8017241379310345e-05,
"loss": 1.7725,
"step": 5090
},
{
"epoch": 2.330895795246801,
"grad_norm": 106965.4375,
"learning_rate": 2.796934865900383e-05,
"loss": 1.7734,
"step": 5100
},
{
"epoch": 2.335466179159049,
"grad_norm": 118704.640625,
"learning_rate": 2.7921455938697322e-05,
"loss": 1.6477,
"step": 5110
},
{
"epoch": 2.340036563071298,
"grad_norm": 78607.4453125,
"learning_rate": 2.787356321839081e-05,
"loss": 1.7386,
"step": 5120
},
{
"epoch": 2.3446069469835464,
"grad_norm": 83952.1171875,
"learning_rate": 2.7825670498084296e-05,
"loss": 1.64,
"step": 5130
},
{
"epoch": 2.3491773308957953,
"grad_norm": 107545.0859375,
"learning_rate": 2.777777777777778e-05,
"loss": 1.6633,
"step": 5140
},
{
"epoch": 2.353747714808044,
"grad_norm": 72284.5,
"learning_rate": 2.7729885057471266e-05,
"loss": 1.7249,
"step": 5150
},
{
"epoch": 2.3583180987202925,
"grad_norm": 89877.7109375,
"learning_rate": 2.7681992337164753e-05,
"loss": 1.6901,
"step": 5160
},
{
"epoch": 2.362888482632541,
"grad_norm": 138945.6875,
"learning_rate": 2.763409961685824e-05,
"loss": 1.6468,
"step": 5170
},
{
"epoch": 2.3674588665447898,
"grad_norm": 58679.375,
"learning_rate": 2.7586206896551727e-05,
"loss": 1.6838,
"step": 5180
},
{
"epoch": 2.3720292504570386,
"grad_norm": 95501.8671875,
"learning_rate": 2.753831417624521e-05,
"loss": 1.682,
"step": 5190
},
{
"epoch": 2.376599634369287,
"grad_norm": 76119.296875,
"learning_rate": 2.7490421455938697e-05,
"loss": 1.8395,
"step": 5200
},
{
"epoch": 2.3811700182815354,
"grad_norm": 108761.65625,
"learning_rate": 2.7442528735632184e-05,
"loss": 1.7638,
"step": 5210
},
{
"epoch": 2.3857404021937842,
"grad_norm": 99530.703125,
"learning_rate": 2.739463601532567e-05,
"loss": 1.6688,
"step": 5220
},
{
"epoch": 2.390310786106033,
"grad_norm": 73215.6171875,
"learning_rate": 2.7346743295019158e-05,
"loss": 1.6266,
"step": 5230
},
{
"epoch": 2.3948811700182815,
"grad_norm": 94147.75,
"learning_rate": 2.7298850574712648e-05,
"loss": 1.7344,
"step": 5240
},
{
"epoch": 2.3994515539305303,
"grad_norm": 80156.09375,
"learning_rate": 2.7250957854406135e-05,
"loss": 1.7074,
"step": 5250
},
{
"epoch": 2.4040219378427787,
"grad_norm": 54975.70703125,
"learning_rate": 2.720306513409962e-05,
"loss": 1.6535,
"step": 5260
},
{
"epoch": 2.4085923217550276,
"grad_norm": 64294.23828125,
"learning_rate": 2.7155172413793105e-05,
"loss": 1.6997,
"step": 5270
},
{
"epoch": 2.413162705667276,
"grad_norm": 83260.0,
"learning_rate": 2.7107279693486592e-05,
"loss": 1.7617,
"step": 5280
},
{
"epoch": 2.417733089579525,
"grad_norm": 79186.6484375,
"learning_rate": 2.705938697318008e-05,
"loss": 1.6661,
"step": 5290
},
{
"epoch": 2.422303473491773,
"grad_norm": 98957.21875,
"learning_rate": 2.7011494252873566e-05,
"loss": 1.6771,
"step": 5300
},
{
"epoch": 2.426873857404022,
"grad_norm": 71378.125,
"learning_rate": 2.6963601532567053e-05,
"loss": 1.6975,
"step": 5310
},
{
"epoch": 2.4314442413162705,
"grad_norm": 81879.71875,
"learning_rate": 2.6915708812260536e-05,
"loss": 1.774,
"step": 5320
},
{
"epoch": 2.4360146252285193,
"grad_norm": 57695.0078125,
"learning_rate": 2.6867816091954023e-05,
"loss": 1.6981,
"step": 5330
},
{
"epoch": 2.4405850091407677,
"grad_norm": 84480.6328125,
"learning_rate": 2.681992337164751e-05,
"loss": 1.7748,
"step": 5340
},
{
"epoch": 2.4451553930530165,
"grad_norm": 71991.0859375,
"learning_rate": 2.6772030651340997e-05,
"loss": 1.6582,
"step": 5350
},
{
"epoch": 2.449725776965265,
"grad_norm": 84038.8984375,
"learning_rate": 2.672413793103448e-05,
"loss": 1.7313,
"step": 5360
},
{
"epoch": 2.454296160877514,
"grad_norm": 61137.41015625,
"learning_rate": 2.6676245210727967e-05,
"loss": 1.686,
"step": 5370
},
{
"epoch": 2.458866544789762,
"grad_norm": 82829.7734375,
"learning_rate": 2.662835249042146e-05,
"loss": 1.6642,
"step": 5380
},
{
"epoch": 2.463436928702011,
"grad_norm": 77371.140625,
"learning_rate": 2.6580459770114948e-05,
"loss": 1.7747,
"step": 5390
},
{
"epoch": 2.4680073126142594,
"grad_norm": 109782.984375,
"learning_rate": 2.653256704980843e-05,
"loss": 1.7706,
"step": 5400
},
{
"epoch": 2.4725776965265083,
"grad_norm": 99484.78125,
"learning_rate": 2.6484674329501918e-05,
"loss": 1.7255,
"step": 5410
},
{
"epoch": 2.4771480804387567,
"grad_norm": 97897.8671875,
"learning_rate": 2.6436781609195405e-05,
"loss": 1.8301,
"step": 5420
},
{
"epoch": 2.4817184643510055,
"grad_norm": 69454.625,
"learning_rate": 2.6388888888888892e-05,
"loss": 1.833,
"step": 5430
},
{
"epoch": 2.4862888482632544,
"grad_norm": 51202.9765625,
"learning_rate": 2.6340996168582375e-05,
"loss": 1.8042,
"step": 5440
},
{
"epoch": 2.4908592321755028,
"grad_norm": 62593.1015625,
"learning_rate": 2.6293103448275862e-05,
"loss": 1.7295,
"step": 5450
},
{
"epoch": 2.495429616087751,
"grad_norm": 62147.26953125,
"learning_rate": 2.624521072796935e-05,
"loss": 1.7026,
"step": 5460
},
{
"epoch": 2.5,
"grad_norm": 105682.3515625,
"learning_rate": 2.6197318007662836e-05,
"loss": 1.6866,
"step": 5470
},
{
"epoch": 2.504570383912249,
"grad_norm": 133388.953125,
"learning_rate": 2.6149425287356323e-05,
"loss": 1.6257,
"step": 5480
},
{
"epoch": 2.5091407678244972,
"grad_norm": 78931.4609375,
"learning_rate": 2.6101532567049806e-05,
"loss": 1.7367,
"step": 5490
},
{
"epoch": 2.5137111517367456,
"grad_norm": 58830.1484375,
"learning_rate": 2.6053639846743293e-05,
"loss": 1.6935,
"step": 5500
},
{
"epoch": 2.5182815356489945,
"grad_norm": 154903.03125,
"learning_rate": 2.600574712643678e-05,
"loss": 1.6548,
"step": 5510
},
{
"epoch": 2.5228519195612433,
"grad_norm": 71029.5859375,
"learning_rate": 2.595785440613027e-05,
"loss": 1.7459,
"step": 5520
},
{
"epoch": 2.5274223034734917,
"grad_norm": 76854.0,
"learning_rate": 2.5909961685823757e-05,
"loss": 1.7737,
"step": 5530
},
{
"epoch": 2.53199268738574,
"grad_norm": 78467.3984375,
"learning_rate": 2.5862068965517244e-05,
"loss": 1.728,
"step": 5540
},
{
"epoch": 2.536563071297989,
"grad_norm": 81775.109375,
"learning_rate": 2.581417624521073e-05,
"loss": 1.6481,
"step": 5550
},
{
"epoch": 2.541133455210238,
"grad_norm": 59482.62890625,
"learning_rate": 2.5766283524904218e-05,
"loss": 1.7565,
"step": 5560
},
{
"epoch": 2.545703839122486,
"grad_norm": 114247.8671875,
"learning_rate": 2.57183908045977e-05,
"loss": 1.6725,
"step": 5570
},
{
"epoch": 2.550274223034735,
"grad_norm": 69261.0390625,
"learning_rate": 2.5670498084291188e-05,
"loss": 1.7034,
"step": 5580
},
{
"epoch": 2.5548446069469835,
"grad_norm": 88227.703125,
"learning_rate": 2.5622605363984675e-05,
"loss": 1.6888,
"step": 5590
},
{
"epoch": 2.5594149908592323,
"grad_norm": 123599.8828125,
"learning_rate": 2.5574712643678162e-05,
"loss": 1.6814,
"step": 5600
},
{
"epoch": 2.5639853747714807,
"grad_norm": 61095.8984375,
"learning_rate": 2.552681992337165e-05,
"loss": 1.6875,
"step": 5610
},
{
"epoch": 2.5685557586837295,
"grad_norm": 71797.8671875,
"learning_rate": 2.5478927203065132e-05,
"loss": 1.6945,
"step": 5620
},
{
"epoch": 2.573126142595978,
"grad_norm": 70069.7890625,
"learning_rate": 2.543103448275862e-05,
"loss": 1.6055,
"step": 5630
},
{
"epoch": 2.577696526508227,
"grad_norm": 90758.25,
"learning_rate": 2.5383141762452106e-05,
"loss": 1.7105,
"step": 5640
},
{
"epoch": 2.582266910420475,
"grad_norm": 73838.0390625,
"learning_rate": 2.5335249042145593e-05,
"loss": 1.7238,
"step": 5650
},
{
"epoch": 2.586837294332724,
"grad_norm": 169500.65625,
"learning_rate": 2.5287356321839083e-05,
"loss": 1.6286,
"step": 5660
},
{
"epoch": 2.5914076782449724,
"grad_norm": 86502.6484375,
"learning_rate": 2.523946360153257e-05,
"loss": 1.7394,
"step": 5670
},
{
"epoch": 2.5959780621572213,
"grad_norm": 77630.203125,
"learning_rate": 2.5191570881226057e-05,
"loss": 1.6857,
"step": 5680
},
{
"epoch": 2.60054844606947,
"grad_norm": 78792.4921875,
"learning_rate": 2.5143678160919544e-05,
"loss": 1.7818,
"step": 5690
},
{
"epoch": 2.6051188299817185,
"grad_norm": 61021.87890625,
"learning_rate": 2.5095785440613027e-05,
"loss": 1.6825,
"step": 5700
},
{
"epoch": 2.609689213893967,
"grad_norm": 93845.828125,
"learning_rate": 2.5047892720306514e-05,
"loss": 1.7818,
"step": 5710
},
{
"epoch": 2.6142595978062158,
"grad_norm": 77104.4296875,
"learning_rate": 2.5e-05,
"loss": 1.6891,
"step": 5720
},
{
"epoch": 2.6188299817184646,
"grad_norm": 67157.0625,
"learning_rate": 2.4952107279693488e-05,
"loss": 1.7418,
"step": 5730
},
{
"epoch": 2.623400365630713,
"grad_norm": 117816.0078125,
"learning_rate": 2.4904214559386975e-05,
"loss": 1.6851,
"step": 5740
},
{
"epoch": 2.6279707495429614,
"grad_norm": 64429.0546875,
"learning_rate": 2.485632183908046e-05,
"loss": 1.7577,
"step": 5750
},
{
"epoch": 2.6325411334552102,
"grad_norm": 99433.84375,
"learning_rate": 2.480842911877395e-05,
"loss": 1.6719,
"step": 5760
},
{
"epoch": 2.637111517367459,
"grad_norm": 129014.9609375,
"learning_rate": 2.4760536398467436e-05,
"loss": 1.8557,
"step": 5770
},
{
"epoch": 2.6416819012797075,
"grad_norm": 51642.76171875,
"learning_rate": 2.4712643678160922e-05,
"loss": 1.71,
"step": 5780
},
{
"epoch": 2.646252285191956,
"grad_norm": 92177.1875,
"learning_rate": 2.4664750957854406e-05,
"loss": 1.6415,
"step": 5790
},
{
"epoch": 2.6508226691042047,
"grad_norm": 83833.5546875,
"learning_rate": 2.4616858237547893e-05,
"loss": 1.6863,
"step": 5800
},
{
"epoch": 2.6553930530164536,
"grad_norm": 83966.53125,
"learning_rate": 2.456896551724138e-05,
"loss": 1.7626,
"step": 5810
},
{
"epoch": 2.659963436928702,
"grad_norm": 94047.453125,
"learning_rate": 2.4521072796934867e-05,
"loss": 1.7408,
"step": 5820
},
{
"epoch": 2.664533820840951,
"grad_norm": 107394.71875,
"learning_rate": 2.4473180076628353e-05,
"loss": 1.7078,
"step": 5830
},
{
"epoch": 2.669104204753199,
"grad_norm": 96418.6328125,
"learning_rate": 2.442528735632184e-05,
"loss": 1.7023,
"step": 5840
},
{
"epoch": 2.673674588665448,
"grad_norm": 115618.6015625,
"learning_rate": 2.4377394636015327e-05,
"loss": 1.7811,
"step": 5850
},
{
"epoch": 2.6782449725776964,
"grad_norm": 102043.5078125,
"learning_rate": 2.4329501915708814e-05,
"loss": 1.761,
"step": 5860
},
{
"epoch": 2.6828153564899453,
"grad_norm": 85863.6953125,
"learning_rate": 2.42816091954023e-05,
"loss": 1.786,
"step": 5870
},
{
"epoch": 2.6873857404021937,
"grad_norm": 105787.890625,
"learning_rate": 2.4233716475095784e-05,
"loss": 1.6736,
"step": 5880
},
{
"epoch": 2.6919561243144425,
"grad_norm": 87654.65625,
"learning_rate": 2.418582375478927e-05,
"loss": 1.7352,
"step": 5890
},
{
"epoch": 2.696526508226691,
"grad_norm": 65512.890625,
"learning_rate": 2.413793103448276e-05,
"loss": 1.6165,
"step": 5900
},
{
"epoch": 2.7010968921389398,
"grad_norm": 96425.09375,
"learning_rate": 2.409003831417625e-05,
"loss": 1.6336,
"step": 5910
},
{
"epoch": 2.705667276051188,
"grad_norm": 64857.203125,
"learning_rate": 2.4042145593869732e-05,
"loss": 1.7171,
"step": 5920
},
{
"epoch": 2.710237659963437,
"grad_norm": 59102.8828125,
"learning_rate": 2.399425287356322e-05,
"loss": 1.7046,
"step": 5930
},
{
"epoch": 2.7148080438756854,
"grad_norm": 89212.4140625,
"learning_rate": 2.3946360153256706e-05,
"loss": 1.6989,
"step": 5940
},
{
"epoch": 2.7193784277879343,
"grad_norm": 75463.0859375,
"learning_rate": 2.3898467432950193e-05,
"loss": 1.7416,
"step": 5950
},
{
"epoch": 2.7239488117001827,
"grad_norm": 107514.09375,
"learning_rate": 2.385057471264368e-05,
"loss": 1.7762,
"step": 5960
},
{
"epoch": 2.7285191956124315,
"grad_norm": 97020.46875,
"learning_rate": 2.3802681992337166e-05,
"loss": 1.6877,
"step": 5970
},
{
"epoch": 2.7330895795246803,
"grad_norm": 100322.8515625,
"learning_rate": 2.3754789272030653e-05,
"loss": 1.7148,
"step": 5980
},
{
"epoch": 2.7376599634369287,
"grad_norm": 66746.328125,
"learning_rate": 2.370689655172414e-05,
"loss": 1.7645,
"step": 5990
},
{
"epoch": 2.742230347349177,
"grad_norm": 50676.11328125,
"learning_rate": 2.3659003831417627e-05,
"loss": 1.7107,
"step": 6000
},
{
"epoch": 2.746800731261426,
"grad_norm": 53923.9921875,
"learning_rate": 2.361111111111111e-05,
"loss": 1.7802,
"step": 6010
},
{
"epoch": 2.751371115173675,
"grad_norm": 61331.36328125,
"learning_rate": 2.3563218390804597e-05,
"loss": 1.682,
"step": 6020
},
{
"epoch": 2.7559414990859232,
"grad_norm": 92563.5078125,
"learning_rate": 2.3515325670498088e-05,
"loss": 1.8056,
"step": 6030
},
{
"epoch": 2.7605118829981716,
"grad_norm": 103113.2578125,
"learning_rate": 2.3467432950191575e-05,
"loss": 1.6654,
"step": 6040
},
{
"epoch": 2.7650822669104205,
"grad_norm": 83672.0859375,
"learning_rate": 2.3419540229885058e-05,
"loss": 1.6853,
"step": 6050
},
{
"epoch": 2.7696526508226693,
"grad_norm": 66880.90625,
"learning_rate": 2.3371647509578545e-05,
"loss": 1.7165,
"step": 6060
},
{
"epoch": 2.7742230347349177,
"grad_norm": 159394.40625,
"learning_rate": 2.3323754789272032e-05,
"loss": 1.6777,
"step": 6070
},
{
"epoch": 2.778793418647166,
"grad_norm": 103960.3203125,
"learning_rate": 2.327586206896552e-05,
"loss": 1.7345,
"step": 6080
},
{
"epoch": 2.783363802559415,
"grad_norm": 89361.9609375,
"learning_rate": 2.3227969348659002e-05,
"loss": 1.8153,
"step": 6090
},
{
"epoch": 2.787934186471664,
"grad_norm": 48453.51953125,
"learning_rate": 2.3180076628352492e-05,
"loss": 1.7499,
"step": 6100
},
{
"epoch": 2.792504570383912,
"grad_norm": 60024.06640625,
"learning_rate": 2.313218390804598e-05,
"loss": 1.6522,
"step": 6110
},
{
"epoch": 2.797074954296161,
"grad_norm": 76540.078125,
"learning_rate": 2.3084291187739466e-05,
"loss": 1.7614,
"step": 6120
},
{
"epoch": 2.8016453382084094,
"grad_norm": 127741.5390625,
"learning_rate": 2.303639846743295e-05,
"loss": 1.7328,
"step": 6130
},
{
"epoch": 2.8062157221206583,
"grad_norm": 92570.3828125,
"learning_rate": 2.2988505747126437e-05,
"loss": 1.6602,
"step": 6140
},
{
"epoch": 2.8107861060329067,
"grad_norm": 74026.5,
"learning_rate": 2.2940613026819923e-05,
"loss": 1.6413,
"step": 6150
},
{
"epoch": 2.8153564899451555,
"grad_norm": 142319.03125,
"learning_rate": 2.289272030651341e-05,
"loss": 1.6652,
"step": 6160
},
{
"epoch": 2.819926873857404,
"grad_norm": 66725.7578125,
"learning_rate": 2.2844827586206897e-05,
"loss": 1.7805,
"step": 6170
},
{
"epoch": 2.8244972577696528,
"grad_norm": 204509.578125,
"learning_rate": 2.2796934865900384e-05,
"loss": 1.7006,
"step": 6180
},
{
"epoch": 2.829067641681901,
"grad_norm": 81922.28125,
"learning_rate": 2.274904214559387e-05,
"loss": 1.7177,
"step": 6190
},
{
"epoch": 2.83363802559415,
"grad_norm": 79576.234375,
"learning_rate": 2.2701149425287358e-05,
"loss": 1.7878,
"step": 6200
},
{
"epoch": 2.8382084095063984,
"grad_norm": 94908.828125,
"learning_rate": 2.2653256704980845e-05,
"loss": 1.6381,
"step": 6210
},
{
"epoch": 2.8427787934186473,
"grad_norm": 54743.4921875,
"learning_rate": 2.2605363984674328e-05,
"loss": 1.6953,
"step": 6220
},
{
"epoch": 2.8473491773308957,
"grad_norm": 81304.8203125,
"learning_rate": 2.2557471264367815e-05,
"loss": 1.7417,
"step": 6230
},
{
"epoch": 2.8519195612431445,
"grad_norm": 61965.26171875,
"learning_rate": 2.2509578544061305e-05,
"loss": 1.7189,
"step": 6240
},
{
"epoch": 2.856489945155393,
"grad_norm": 74507.875,
"learning_rate": 2.2461685823754792e-05,
"loss": 1.6891,
"step": 6250
},
{
"epoch": 2.8610603290676417,
"grad_norm": 83252.703125,
"learning_rate": 2.2413793103448276e-05,
"loss": 1.6622,
"step": 6260
},
{
"epoch": 2.8656307129798906,
"grad_norm": 74911.6484375,
"learning_rate": 2.2365900383141763e-05,
"loss": 1.684,
"step": 6270
},
{
"epoch": 2.870201096892139,
"grad_norm": 67471.203125,
"learning_rate": 2.231800766283525e-05,
"loss": 1.7436,
"step": 6280
},
{
"epoch": 2.8747714808043874,
"grad_norm": 54812.26953125,
"learning_rate": 2.2270114942528736e-05,
"loss": 1.7817,
"step": 6290
},
{
"epoch": 2.8793418647166362,
"grad_norm": 93858.140625,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.721,
"step": 6300
},
{
"epoch": 2.883912248628885,
"grad_norm": 64988.40234375,
"learning_rate": 2.217432950191571e-05,
"loss": 1.7022,
"step": 6310
},
{
"epoch": 2.8884826325411335,
"grad_norm": 70428.0546875,
"learning_rate": 2.2126436781609197e-05,
"loss": 1.7059,
"step": 6320
},
{
"epoch": 2.893053016453382,
"grad_norm": 82495.4453125,
"learning_rate": 2.2078544061302684e-05,
"loss": 1.8106,
"step": 6330
},
{
"epoch": 2.8976234003656307,
"grad_norm": 81380.859375,
"learning_rate": 2.203065134099617e-05,
"loss": 1.6888,
"step": 6340
},
{
"epoch": 2.9021937842778796,
"grad_norm": 79387.8125,
"learning_rate": 2.1982758620689654e-05,
"loss": 1.6739,
"step": 6350
},
{
"epoch": 2.906764168190128,
"grad_norm": 139392.453125,
"learning_rate": 2.193486590038314e-05,
"loss": 1.6741,
"step": 6360
},
{
"epoch": 2.9113345521023763,
"grad_norm": 87657.875,
"learning_rate": 2.1886973180076628e-05,
"loss": 1.7429,
"step": 6370
},
{
"epoch": 2.915904936014625,
"grad_norm": 62421.78125,
"learning_rate": 2.183908045977012e-05,
"loss": 1.7148,
"step": 6380
},
{
"epoch": 2.920475319926874,
"grad_norm": 126881.4375,
"learning_rate": 2.1791187739463602e-05,
"loss": 1.6232,
"step": 6390
},
{
"epoch": 2.9250457038391224,
"grad_norm": 102633.1015625,
"learning_rate": 2.174329501915709e-05,
"loss": 1.7049,
"step": 6400
},
{
"epoch": 2.9296160877513713,
"grad_norm": 102789.2265625,
"learning_rate": 2.1695402298850576e-05,
"loss": 1.7001,
"step": 6410
},
{
"epoch": 2.9341864716636197,
"grad_norm": 79177.21875,
"learning_rate": 2.1647509578544062e-05,
"loss": 1.7525,
"step": 6420
},
{
"epoch": 2.9387568555758685,
"grad_norm": 80986.46875,
"learning_rate": 2.159961685823755e-05,
"loss": 1.6458,
"step": 6430
},
{
"epoch": 2.943327239488117,
"grad_norm": 97659.375,
"learning_rate": 2.1551724137931033e-05,
"loss": 1.7678,
"step": 6440
},
{
"epoch": 2.9478976234003658,
"grad_norm": 97846.609375,
"learning_rate": 2.1503831417624523e-05,
"loss": 1.7128,
"step": 6450
},
{
"epoch": 2.952468007312614,
"grad_norm": 74223.375,
"learning_rate": 2.145593869731801e-05,
"loss": 1.8434,
"step": 6460
},
{
"epoch": 2.957038391224863,
"grad_norm": 59553.359375,
"learning_rate": 2.1408045977011497e-05,
"loss": 1.7717,
"step": 6470
},
{
"epoch": 2.9616087751371114,
"grad_norm": 57012.078125,
"learning_rate": 2.136015325670498e-05,
"loss": 1.6865,
"step": 6480
},
{
"epoch": 2.9661791590493602,
"grad_norm": 57963.9609375,
"learning_rate": 2.1312260536398467e-05,
"loss": 1.6966,
"step": 6490
},
{
"epoch": 2.9707495429616086,
"grad_norm": 82150.6015625,
"learning_rate": 2.1264367816091954e-05,
"loss": 1.7219,
"step": 6500
},
{
"epoch": 2.9753199268738575,
"grad_norm": 49790.92578125,
"learning_rate": 2.1216475095785444e-05,
"loss": 1.7301,
"step": 6510
},
{
"epoch": 2.979890310786106,
"grad_norm": 77082.4140625,
"learning_rate": 2.1168582375478928e-05,
"loss": 1.6742,
"step": 6520
},
{
"epoch": 2.9844606946983547,
"grad_norm": 123639.671875,
"learning_rate": 2.1120689655172415e-05,
"loss": 1.6256,
"step": 6530
},
{
"epoch": 2.989031078610603,
"grad_norm": 67240.890625,
"learning_rate": 2.10727969348659e-05,
"loss": 1.7623,
"step": 6540
},
{
"epoch": 2.993601462522852,
"grad_norm": 71455.7890625,
"learning_rate": 2.102490421455939e-05,
"loss": 1.6758,
"step": 6550
},
{
"epoch": 2.998171846435101,
"grad_norm": 155724.125,
"learning_rate": 2.0977011494252875e-05,
"loss": 1.6401,
"step": 6560
},
{
"epoch": 3.0,
"eval_loss": 1.695529818534851,
"eval_runtime": 345.831,
"eval_samples_per_second": 43.374,
"eval_steps_per_second": 1.356,
"step": 6564
},
{
"epoch": 3.002742230347349,
"grad_norm": 71896.8515625,
"learning_rate": 2.092911877394636e-05,
"loss": 1.7537,
"step": 6570
},
{
"epoch": 3.0073126142595976,
"grad_norm": 61401.01171875,
"learning_rate": 2.088122605363985e-05,
"loss": 1.6979,
"step": 6580
},
{
"epoch": 3.0118829981718465,
"grad_norm": 108287.78125,
"learning_rate": 2.0833333333333336e-05,
"loss": 1.7115,
"step": 6590
},
{
"epoch": 3.016453382084095,
"grad_norm": 108027.125,
"learning_rate": 2.0785440613026823e-05,
"loss": 1.6887,
"step": 6600
},
{
"epoch": 3.0210237659963437,
"grad_norm": 89622.265625,
"learning_rate": 2.0737547892720306e-05,
"loss": 1.6484,
"step": 6610
},
{
"epoch": 3.025594149908592,
"grad_norm": 116170.4921875,
"learning_rate": 2.0689655172413793e-05,
"loss": 1.723,
"step": 6620
},
{
"epoch": 3.030164533820841,
"grad_norm": 76070.9765625,
"learning_rate": 2.064176245210728e-05,
"loss": 1.6649,
"step": 6630
},
{
"epoch": 3.03473491773309,
"grad_norm": 86966.0859375,
"learning_rate": 2.0593869731800767e-05,
"loss": 1.6613,
"step": 6640
},
{
"epoch": 3.039305301645338,
"grad_norm": 101902.5,
"learning_rate": 2.0545977011494254e-05,
"loss": 1.6718,
"step": 6650
},
{
"epoch": 3.043875685557587,
"grad_norm": 45214.640625,
"learning_rate": 2.049808429118774e-05,
"loss": 1.7025,
"step": 6660
},
{
"epoch": 3.0484460694698354,
"grad_norm": 61494.140625,
"learning_rate": 2.0450191570881228e-05,
"loss": 1.7289,
"step": 6670
},
{
"epoch": 3.0530164533820843,
"grad_norm": 136512.90625,
"learning_rate": 2.0402298850574715e-05,
"loss": 1.7768,
"step": 6680
},
{
"epoch": 3.0575868372943327,
"grad_norm": 106828.390625,
"learning_rate": 2.03544061302682e-05,
"loss": 1.7891,
"step": 6690
},
{
"epoch": 3.0621572212065815,
"grad_norm": 54863.59765625,
"learning_rate": 2.0306513409961685e-05,
"loss": 1.6614,
"step": 6700
},
{
"epoch": 3.06672760511883,
"grad_norm": 95806.8984375,
"learning_rate": 2.0258620689655172e-05,
"loss": 1.6815,
"step": 6710
},
{
"epoch": 3.0712979890310788,
"grad_norm": 70664.6875,
"learning_rate": 2.0210727969348662e-05,
"loss": 1.6989,
"step": 6720
},
{
"epoch": 3.075868372943327,
"grad_norm": 118271.9921875,
"learning_rate": 2.016283524904215e-05,
"loss": 1.8092,
"step": 6730
},
{
"epoch": 3.080438756855576,
"grad_norm": 82462.28125,
"learning_rate": 2.0114942528735632e-05,
"loss": 1.8018,
"step": 6740
},
{
"epoch": 3.0850091407678244,
"grad_norm": 69368.5546875,
"learning_rate": 2.006704980842912e-05,
"loss": 1.6884,
"step": 6750
},
{
"epoch": 3.0895795246800732,
"grad_norm": 84815.4609375,
"learning_rate": 2.0019157088122606e-05,
"loss": 1.695,
"step": 6760
},
{
"epoch": 3.0941499085923216,
"grad_norm": 117484.3125,
"learning_rate": 1.9971264367816093e-05,
"loss": 1.799,
"step": 6770
},
{
"epoch": 3.0987202925045705,
"grad_norm": 74344.625,
"learning_rate": 1.992337164750958e-05,
"loss": 1.7173,
"step": 6780
},
{
"epoch": 3.103290676416819,
"grad_norm": 61023.3359375,
"learning_rate": 1.9875478927203067e-05,
"loss": 1.7335,
"step": 6790
},
{
"epoch": 3.1078610603290677,
"grad_norm": 80261.34375,
"learning_rate": 1.9827586206896554e-05,
"loss": 1.6618,
"step": 6800
},
{
"epoch": 3.112431444241316,
"grad_norm": 44226.015625,
"learning_rate": 1.977969348659004e-05,
"loss": 1.6994,
"step": 6810
},
{
"epoch": 3.117001828153565,
"grad_norm": 62552.55078125,
"learning_rate": 1.9731800766283527e-05,
"loss": 1.6826,
"step": 6820
},
{
"epoch": 3.1215722120658134,
"grad_norm": 126776.40625,
"learning_rate": 1.968390804597701e-05,
"loss": 1.6274,
"step": 6830
},
{
"epoch": 3.126142595978062,
"grad_norm": 71035.0234375,
"learning_rate": 1.9636015325670498e-05,
"loss": 1.8007,
"step": 6840
},
{
"epoch": 3.1307129798903106,
"grad_norm": 78976.2265625,
"learning_rate": 1.9588122605363985e-05,
"loss": 1.727,
"step": 6850
},
{
"epoch": 3.1352833638025595,
"grad_norm": 83373.7265625,
"learning_rate": 1.9540229885057475e-05,
"loss": 1.7554,
"step": 6860
},
{
"epoch": 3.139853747714808,
"grad_norm": 165484.453125,
"learning_rate": 1.949233716475096e-05,
"loss": 1.621,
"step": 6870
},
{
"epoch": 3.1444241316270567,
"grad_norm": 62350.18359375,
"learning_rate": 1.9444444444444445e-05,
"loss": 1.8118,
"step": 6880
},
{
"epoch": 3.148994515539305,
"grad_norm": 64155.27734375,
"learning_rate": 1.9396551724137932e-05,
"loss": 1.7245,
"step": 6890
},
{
"epoch": 3.153564899451554,
"grad_norm": 69873.4375,
"learning_rate": 1.934865900383142e-05,
"loss": 1.7768,
"step": 6900
},
{
"epoch": 3.1581352833638023,
"grad_norm": 122451.703125,
"learning_rate": 1.9300766283524903e-05,
"loss": 1.7436,
"step": 6910
},
{
"epoch": 3.162705667276051,
"grad_norm": 50876.828125,
"learning_rate": 1.925287356321839e-05,
"loss": 1.7615,
"step": 6920
},
{
"epoch": 3.1672760511883,
"grad_norm": 68587.875,
"learning_rate": 1.920498084291188e-05,
"loss": 1.6389,
"step": 6930
},
{
"epoch": 3.1718464351005484,
"grad_norm": 111575.640625,
"learning_rate": 1.9157088122605367e-05,
"loss": 1.7055,
"step": 6940
},
{
"epoch": 3.1764168190127973,
"grad_norm": 68116.921875,
"learning_rate": 1.910919540229885e-05,
"loss": 1.7635,
"step": 6950
},
{
"epoch": 3.1809872029250457,
"grad_norm": 75995.734375,
"learning_rate": 1.9061302681992337e-05,
"loss": 1.7208,
"step": 6960
},
{
"epoch": 3.1855575868372945,
"grad_norm": 97217.1796875,
"learning_rate": 1.9013409961685824e-05,
"loss": 1.6942,
"step": 6970
},
{
"epoch": 3.190127970749543,
"grad_norm": 125494.984375,
"learning_rate": 1.896551724137931e-05,
"loss": 1.7897,
"step": 6980
},
{
"epoch": 3.1946983546617918,
"grad_norm": 102539.3046875,
"learning_rate": 1.8917624521072798e-05,
"loss": 1.7186,
"step": 6990
},
{
"epoch": 3.19926873857404,
"grad_norm": 92514.1640625,
"learning_rate": 1.8869731800766285e-05,
"loss": 1.7805,
"step": 7000
},
{
"epoch": 3.203839122486289,
"grad_norm": 86951.125,
"learning_rate": 1.882183908045977e-05,
"loss": 1.738,
"step": 7010
},
{
"epoch": 3.2084095063985374,
"grad_norm": 99123.890625,
"learning_rate": 1.8773946360153258e-05,
"loss": 1.6605,
"step": 7020
},
{
"epoch": 3.2129798903107862,
"grad_norm": 61411.390625,
"learning_rate": 1.8726053639846745e-05,
"loss": 1.6291,
"step": 7030
},
{
"epoch": 3.2175502742230346,
"grad_norm": 69628.1953125,
"learning_rate": 1.867816091954023e-05,
"loss": 1.7693,
"step": 7040
},
{
"epoch": 3.2221206581352835,
"grad_norm": 74736.0390625,
"learning_rate": 1.8630268199233716e-05,
"loss": 1.6428,
"step": 7050
},
{
"epoch": 3.226691042047532,
"grad_norm": 93917.203125,
"learning_rate": 1.8582375478927206e-05,
"loss": 1.7189,
"step": 7060
},
{
"epoch": 3.2312614259597807,
"grad_norm": 80656.5546875,
"learning_rate": 1.8534482758620693e-05,
"loss": 1.7485,
"step": 7070
},
{
"epoch": 3.235831809872029,
"grad_norm": 72125.5546875,
"learning_rate": 1.8486590038314176e-05,
"loss": 1.644,
"step": 7080
},
{
"epoch": 3.240402193784278,
"grad_norm": 86443.671875,
"learning_rate": 1.8438697318007663e-05,
"loss": 1.6747,
"step": 7090
},
{
"epoch": 3.2449725776965264,
"grad_norm": 93107.2734375,
"learning_rate": 1.839080459770115e-05,
"loss": 1.705,
"step": 7100
},
{
"epoch": 3.249542961608775,
"grad_norm": 68446.5078125,
"learning_rate": 1.8342911877394637e-05,
"loss": 1.7647,
"step": 7110
},
{
"epoch": 3.2541133455210236,
"grad_norm": 91024.3125,
"learning_rate": 1.8295019157088124e-05,
"loss": 1.7323,
"step": 7120
},
{
"epoch": 3.2586837294332724,
"grad_norm": 63286.3828125,
"learning_rate": 1.824712643678161e-05,
"loss": 1.7359,
"step": 7130
},
{
"epoch": 3.263254113345521,
"grad_norm": 94550.0234375,
"learning_rate": 1.8199233716475097e-05,
"loss": 1.7065,
"step": 7140
},
{
"epoch": 3.2678244972577697,
"grad_norm": 66774.734375,
"learning_rate": 1.8151340996168584e-05,
"loss": 1.6688,
"step": 7150
},
{
"epoch": 3.272394881170018,
"grad_norm": 52998.1484375,
"learning_rate": 1.810344827586207e-05,
"loss": 1.7237,
"step": 7160
},
{
"epoch": 3.276965265082267,
"grad_norm": 70673.5078125,
"learning_rate": 1.8055555555555555e-05,
"loss": 1.7609,
"step": 7170
},
{
"epoch": 3.2815356489945158,
"grad_norm": 109352.7421875,
"learning_rate": 1.800766283524904e-05,
"loss": 1.6908,
"step": 7180
},
{
"epoch": 3.286106032906764,
"grad_norm": 66609.03125,
"learning_rate": 1.795977011494253e-05,
"loss": 1.7241,
"step": 7190
},
{
"epoch": 3.2906764168190126,
"grad_norm": 74225.8984375,
"learning_rate": 1.791187739463602e-05,
"loss": 1.7827,
"step": 7200
},
{
"epoch": 3.2952468007312614,
"grad_norm": 116946.515625,
"learning_rate": 1.7863984674329502e-05,
"loss": 1.6128,
"step": 7210
},
{
"epoch": 3.2998171846435103,
"grad_norm": 76768.5234375,
"learning_rate": 1.781609195402299e-05,
"loss": 1.7406,
"step": 7220
},
{
"epoch": 3.3043875685557587,
"grad_norm": 107767.0625,
"learning_rate": 1.7768199233716476e-05,
"loss": 1.6913,
"step": 7230
},
{
"epoch": 3.3089579524680075,
"grad_norm": 76932.5703125,
"learning_rate": 1.7720306513409963e-05,
"loss": 1.751,
"step": 7240
},
{
"epoch": 3.313528336380256,
"grad_norm": 132700.34375,
"learning_rate": 1.767241379310345e-05,
"loss": 1.7305,
"step": 7250
},
{
"epoch": 3.3180987202925047,
"grad_norm": 148178.984375,
"learning_rate": 1.7624521072796933e-05,
"loss": 1.6116,
"step": 7260
},
{
"epoch": 3.322669104204753,
"grad_norm": 84747.203125,
"learning_rate": 1.7576628352490424e-05,
"loss": 1.7728,
"step": 7270
},
{
"epoch": 3.327239488117002,
"grad_norm": 88323.5078125,
"learning_rate": 1.752873563218391e-05,
"loss": 1.6676,
"step": 7280
},
{
"epoch": 3.3318098720292504,
"grad_norm": 55836.046875,
"learning_rate": 1.7480842911877397e-05,
"loss": 1.7439,
"step": 7290
},
{
"epoch": 3.3363802559414992,
"grad_norm": 107173.328125,
"learning_rate": 1.743295019157088e-05,
"loss": 1.746,
"step": 7300
},
{
"epoch": 3.3409506398537476,
"grad_norm": 63958.0078125,
"learning_rate": 1.7385057471264368e-05,
"loss": 1.6936,
"step": 7310
},
{
"epoch": 3.3455210237659965,
"grad_norm": 100361.1484375,
"learning_rate": 1.7337164750957855e-05,
"loss": 1.6349,
"step": 7320
},
{
"epoch": 3.350091407678245,
"grad_norm": 64250.95703125,
"learning_rate": 1.728927203065134e-05,
"loss": 1.715,
"step": 7330
},
{
"epoch": 3.3546617915904937,
"grad_norm": 66948.6953125,
"learning_rate": 1.7241379310344828e-05,
"loss": 1.6291,
"step": 7340
},
{
"epoch": 3.359232175502742,
"grad_norm": 83300.3125,
"learning_rate": 1.7193486590038315e-05,
"loss": 1.6724,
"step": 7350
},
{
"epoch": 3.363802559414991,
"grad_norm": 88165.8125,
"learning_rate": 1.7145593869731802e-05,
"loss": 1.7536,
"step": 7360
},
{
"epoch": 3.3683729433272394,
"grad_norm": 67886.234375,
"learning_rate": 1.709770114942529e-05,
"loss": 1.6798,
"step": 7370
},
{
"epoch": 3.372943327239488,
"grad_norm": 64415.46484375,
"learning_rate": 1.7049808429118776e-05,
"loss": 1.7546,
"step": 7380
},
{
"epoch": 3.3775137111517366,
"grad_norm": 75445.15625,
"learning_rate": 1.700191570881226e-05,
"loss": 1.6932,
"step": 7390
},
{
"epoch": 3.3820840950639854,
"grad_norm": 122763.609375,
"learning_rate": 1.6954022988505746e-05,
"loss": 1.7015,
"step": 7400
},
{
"epoch": 3.386654478976234,
"grad_norm": 113570.5546875,
"learning_rate": 1.6906130268199236e-05,
"loss": 1.7188,
"step": 7410
},
{
"epoch": 3.3912248628884827,
"grad_norm": 103909.40625,
"learning_rate": 1.6858237547892723e-05,
"loss": 1.6516,
"step": 7420
},
{
"epoch": 3.395795246800731,
"grad_norm": 72607.9375,
"learning_rate": 1.6810344827586207e-05,
"loss": 1.704,
"step": 7430
},
{
"epoch": 3.40036563071298,
"grad_norm": 154061.578125,
"learning_rate": 1.6762452107279694e-05,
"loss": 1.769,
"step": 7440
},
{
"epoch": 3.4049360146252283,
"grad_norm": 122622.734375,
"learning_rate": 1.671455938697318e-05,
"loss": 1.754,
"step": 7450
},
{
"epoch": 3.409506398537477,
"grad_norm": 70141.734375,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.7117,
"step": 7460
},
{
"epoch": 3.414076782449726,
"grad_norm": 82556.859375,
"learning_rate": 1.6618773946360154e-05,
"loss": 1.7317,
"step": 7470
},
{
"epoch": 3.4186471663619744,
"grad_norm": 83208.140625,
"learning_rate": 1.657088122605364e-05,
"loss": 1.7346,
"step": 7480
},
{
"epoch": 3.423217550274223,
"grad_norm": 102812.6171875,
"learning_rate": 1.6522988505747128e-05,
"loss": 1.7444,
"step": 7490
},
{
"epoch": 3.4277879341864717,
"grad_norm": 90799.8359375,
"learning_rate": 1.6475095785440615e-05,
"loss": 1.7343,
"step": 7500
},
{
"epoch": 3.4323583180987205,
"grad_norm": 91094.5390625,
"learning_rate": 1.6427203065134102e-05,
"loss": 1.7624,
"step": 7510
},
{
"epoch": 3.436928702010969,
"grad_norm": 69841.4296875,
"learning_rate": 1.6379310344827585e-05,
"loss": 1.7264,
"step": 7520
},
{
"epoch": 3.4414990859232177,
"grad_norm": 122445.3515625,
"learning_rate": 1.6331417624521072e-05,
"loss": 1.8214,
"step": 7530
},
{
"epoch": 3.446069469835466,
"grad_norm": 85346.1875,
"learning_rate": 1.628352490421456e-05,
"loss": 1.7169,
"step": 7540
},
{
"epoch": 3.450639853747715,
"grad_norm": 46006.20703125,
"learning_rate": 1.623563218390805e-05,
"loss": 1.6682,
"step": 7550
},
{
"epoch": 3.4552102376599634,
"grad_norm": 61423.13671875,
"learning_rate": 1.6187739463601533e-05,
"loss": 1.7414,
"step": 7560
},
{
"epoch": 3.4597806215722122,
"grad_norm": 73209.5625,
"learning_rate": 1.613984674329502e-05,
"loss": 1.699,
"step": 7570
},
{
"epoch": 3.4643510054844606,
"grad_norm": 81988.2265625,
"learning_rate": 1.6091954022988507e-05,
"loss": 1.6394,
"step": 7580
},
{
"epoch": 3.4689213893967095,
"grad_norm": 128754.4140625,
"learning_rate": 1.6044061302681994e-05,
"loss": 1.7366,
"step": 7590
},
{
"epoch": 3.473491773308958,
"grad_norm": 92045.3828125,
"learning_rate": 1.5996168582375477e-05,
"loss": 1.7727,
"step": 7600
},
{
"epoch": 3.4780621572212067,
"grad_norm": 144295.390625,
"learning_rate": 1.5948275862068967e-05,
"loss": 1.6927,
"step": 7610
},
{
"epoch": 3.482632541133455,
"grad_norm": 54716.375,
"learning_rate": 1.5900383141762454e-05,
"loss": 1.7297,
"step": 7620
},
{
"epoch": 3.487202925045704,
"grad_norm": 80850.1328125,
"learning_rate": 1.585249042145594e-05,
"loss": 1.639,
"step": 7630
},
{
"epoch": 3.4917733089579523,
"grad_norm": 114330.296875,
"learning_rate": 1.5804597701149425e-05,
"loss": 1.7673,
"step": 7640
},
{
"epoch": 3.496343692870201,
"grad_norm": 58934.4921875,
"learning_rate": 1.575670498084291e-05,
"loss": 1.6763,
"step": 7650
},
{
"epoch": 3.5009140767824496,
"grad_norm": 123695.609375,
"learning_rate": 1.5708812260536398e-05,
"loss": 1.7135,
"step": 7660
},
{
"epoch": 3.5054844606946984,
"grad_norm": 83289.6640625,
"learning_rate": 1.5660919540229885e-05,
"loss": 1.688,
"step": 7670
},
{
"epoch": 3.510054844606947,
"grad_norm": 100226.015625,
"learning_rate": 1.5613026819923372e-05,
"loss": 1.7304,
"step": 7680
},
{
"epoch": 3.5146252285191957,
"grad_norm": 88909.984375,
"learning_rate": 1.556513409961686e-05,
"loss": 1.6723,
"step": 7690
},
{
"epoch": 3.519195612431444,
"grad_norm": 66940.3515625,
"learning_rate": 1.5517241379310346e-05,
"loss": 1.6159,
"step": 7700
},
{
"epoch": 3.523765996343693,
"grad_norm": 88044.171875,
"learning_rate": 1.5469348659003833e-05,
"loss": 1.7853,
"step": 7710
},
{
"epoch": 3.5283363802559418,
"grad_norm": 85045.421875,
"learning_rate": 1.542145593869732e-05,
"loss": 1.7666,
"step": 7720
},
{
"epoch": 3.53290676416819,
"grad_norm": 60147.796875,
"learning_rate": 1.5373563218390803e-05,
"loss": 1.6683,
"step": 7730
},
{
"epoch": 3.5374771480804386,
"grad_norm": 82411.9609375,
"learning_rate": 1.532567049808429e-05,
"loss": 1.7323,
"step": 7740
},
{
"epoch": 3.5420475319926874,
"grad_norm": 66054.59375,
"learning_rate": 1.527777777777778e-05,
"loss": 1.6714,
"step": 7750
},
{
"epoch": 3.5466179159049362,
"grad_norm": 72301.625,
"learning_rate": 1.5229885057471265e-05,
"loss": 1.6496,
"step": 7760
},
{
"epoch": 3.5511882998171846,
"grad_norm": 104870.84375,
"learning_rate": 1.5181992337164752e-05,
"loss": 1.729,
"step": 7770
},
{
"epoch": 3.555758683729433,
"grad_norm": 155170.828125,
"learning_rate": 1.5134099616858237e-05,
"loss": 1.6892,
"step": 7780
},
{
"epoch": 3.560329067641682,
"grad_norm": 103218.2578125,
"learning_rate": 1.5086206896551724e-05,
"loss": 1.68,
"step": 7790
},
{
"epoch": 3.5648994515539307,
"grad_norm": 77247.625,
"learning_rate": 1.5038314176245211e-05,
"loss": 1.8163,
"step": 7800
},
{
"epoch": 3.569469835466179,
"grad_norm": 62777.0859375,
"learning_rate": 1.4990421455938696e-05,
"loss": 1.7311,
"step": 7810
},
{
"epoch": 3.5740402193784275,
"grad_norm": 79059.1328125,
"learning_rate": 1.4942528735632185e-05,
"loss": 1.714,
"step": 7820
},
{
"epoch": 3.5786106032906764,
"grad_norm": 75726.578125,
"learning_rate": 1.4894636015325672e-05,
"loss": 1.724,
"step": 7830
},
{
"epoch": 3.583180987202925,
"grad_norm": 62909.81640625,
"learning_rate": 1.4846743295019159e-05,
"loss": 1.6383,
"step": 7840
},
{
"epoch": 3.5877513711151736,
"grad_norm": 80657.5234375,
"learning_rate": 1.4798850574712644e-05,
"loss": 1.7269,
"step": 7850
},
{
"epoch": 3.5923217550274225,
"grad_norm": 73948.4375,
"learning_rate": 1.475095785440613e-05,
"loss": 1.6603,
"step": 7860
},
{
"epoch": 3.596892138939671,
"grad_norm": 88141.875,
"learning_rate": 1.4703065134099616e-05,
"loss": 1.6842,
"step": 7870
},
{
"epoch": 3.6014625228519197,
"grad_norm": 78391.890625,
"learning_rate": 1.4655172413793103e-05,
"loss": 1.6908,
"step": 7880
},
{
"epoch": 3.606032906764168,
"grad_norm": 65879.515625,
"learning_rate": 1.4607279693486591e-05,
"loss": 1.7438,
"step": 7890
},
{
"epoch": 3.610603290676417,
"grad_norm": 137562.15625,
"learning_rate": 1.4559386973180078e-05,
"loss": 1.704,
"step": 7900
},
{
"epoch": 3.6151736745886653,
"grad_norm": 99103.4375,
"learning_rate": 1.4511494252873564e-05,
"loss": 1.7265,
"step": 7910
},
{
"epoch": 3.619744058500914,
"grad_norm": 67414.4609375,
"learning_rate": 1.446360153256705e-05,
"loss": 1.7134,
"step": 7920
},
{
"epoch": 3.6243144424131626,
"grad_norm": 109987.7265625,
"learning_rate": 1.4415708812260537e-05,
"loss": 1.6406,
"step": 7930
},
{
"epoch": 3.6288848263254114,
"grad_norm": 143015.703125,
"learning_rate": 1.4367816091954022e-05,
"loss": 1.7694,
"step": 7940
},
{
"epoch": 3.63345521023766,
"grad_norm": 87812.765625,
"learning_rate": 1.431992337164751e-05,
"loss": 1.7357,
"step": 7950
},
{
"epoch": 3.6380255941499087,
"grad_norm": 84844.578125,
"learning_rate": 1.4272030651340998e-05,
"loss": 1.7893,
"step": 7960
},
{
"epoch": 3.642595978062157,
"grad_norm": 72731.703125,
"learning_rate": 1.4224137931034485e-05,
"loss": 1.6586,
"step": 7970
},
{
"epoch": 3.647166361974406,
"grad_norm": 81956.890625,
"learning_rate": 1.417624521072797e-05,
"loss": 1.7693,
"step": 7980
},
{
"epoch": 3.6517367458866543,
"grad_norm": 69217.53125,
"learning_rate": 1.4128352490421457e-05,
"loss": 1.7928,
"step": 7990
},
{
"epoch": 3.656307129798903,
"grad_norm": 54634.5703125,
"learning_rate": 1.4080459770114942e-05,
"loss": 1.7918,
"step": 8000
},
{
"epoch": 3.660877513711152,
"grad_norm": 63817.8359375,
"learning_rate": 1.4032567049808429e-05,
"loss": 1.6763,
"step": 8010
},
{
"epoch": 3.6654478976234004,
"grad_norm": 118554.0859375,
"learning_rate": 1.3984674329501916e-05,
"loss": 1.7032,
"step": 8020
},
{
"epoch": 3.670018281535649,
"grad_norm": 96849.4453125,
"learning_rate": 1.3936781609195404e-05,
"loss": 1.6672,
"step": 8030
},
{
"epoch": 3.6745886654478976,
"grad_norm": 138688.09375,
"learning_rate": 1.388888888888889e-05,
"loss": 1.6499,
"step": 8040
},
{
"epoch": 3.6791590493601465,
"grad_norm": 71032.484375,
"learning_rate": 1.3840996168582376e-05,
"loss": 1.7859,
"step": 8050
},
{
"epoch": 3.683729433272395,
"grad_norm": 105990.21875,
"learning_rate": 1.3793103448275863e-05,
"loss": 1.6542,
"step": 8060
},
{
"epoch": 3.6882998171846433,
"grad_norm": 119098.0859375,
"learning_rate": 1.3745210727969348e-05,
"loss": 1.7186,
"step": 8070
},
{
"epoch": 3.692870201096892,
"grad_norm": 61243.96484375,
"learning_rate": 1.3697318007662835e-05,
"loss": 1.7328,
"step": 8080
},
{
"epoch": 3.697440585009141,
"grad_norm": 78637.296875,
"learning_rate": 1.3649425287356324e-05,
"loss": 1.6125,
"step": 8090
},
{
"epoch": 3.7020109689213894,
"grad_norm": 88676.828125,
"learning_rate": 1.360153256704981e-05,
"loss": 1.6784,
"step": 8100
},
{
"epoch": 3.7065813528336378,
"grad_norm": 80246.65625,
"learning_rate": 1.3553639846743296e-05,
"loss": 1.6786,
"step": 8110
},
{
"epoch": 3.7111517367458866,
"grad_norm": 79097.7734375,
"learning_rate": 1.3505747126436783e-05,
"loss": 1.6875,
"step": 8120
},
{
"epoch": 3.7157221206581355,
"grad_norm": 75883.453125,
"learning_rate": 1.3457854406130268e-05,
"loss": 1.6788,
"step": 8130
},
{
"epoch": 3.720292504570384,
"grad_norm": 87841.7734375,
"learning_rate": 1.3409961685823755e-05,
"loss": 1.7938,
"step": 8140
},
{
"epoch": 3.7248628884826327,
"grad_norm": 60471.46875,
"learning_rate": 1.336206896551724e-05,
"loss": 1.7017,
"step": 8150
},
{
"epoch": 3.729433272394881,
"grad_norm": 117315.484375,
"learning_rate": 1.331417624521073e-05,
"loss": 1.652,
"step": 8160
},
{
"epoch": 3.73400365630713,
"grad_norm": 81507.8984375,
"learning_rate": 1.3266283524904216e-05,
"loss": 1.7639,
"step": 8170
},
{
"epoch": 3.7385740402193783,
"grad_norm": 110054.6328125,
"learning_rate": 1.3218390804597702e-05,
"loss": 1.7429,
"step": 8180
},
{
"epoch": 3.743144424131627,
"grad_norm": 65638.3828125,
"learning_rate": 1.3170498084291188e-05,
"loss": 1.7599,
"step": 8190
},
{
"epoch": 3.7477148080438756,
"grad_norm": 116608.078125,
"learning_rate": 1.3122605363984675e-05,
"loss": 1.7329,
"step": 8200
},
{
"epoch": 3.7522851919561244,
"grad_norm": 87637.5078125,
"learning_rate": 1.3074712643678161e-05,
"loss": 1.7624,
"step": 8210
},
{
"epoch": 3.756855575868373,
"grad_norm": 94494.8125,
"learning_rate": 1.3026819923371647e-05,
"loss": 1.7554,
"step": 8220
},
{
"epoch": 3.7614259597806217,
"grad_norm": 69636.9296875,
"learning_rate": 1.2978927203065135e-05,
"loss": 1.758,
"step": 8230
},
{
"epoch": 3.76599634369287,
"grad_norm": 73185.421875,
"learning_rate": 1.2931034482758622e-05,
"loss": 1.7235,
"step": 8240
},
{
"epoch": 3.770566727605119,
"grad_norm": 94298.2265625,
"learning_rate": 1.2883141762452109e-05,
"loss": 1.6726,
"step": 8250
},
{
"epoch": 3.7751371115173673,
"grad_norm": 99814.8671875,
"learning_rate": 1.2835249042145594e-05,
"loss": 1.6912,
"step": 8260
},
{
"epoch": 3.779707495429616,
"grad_norm": 68422.03125,
"learning_rate": 1.2787356321839081e-05,
"loss": 1.7406,
"step": 8270
},
{
"epoch": 3.7842778793418645,
"grad_norm": 82088.296875,
"learning_rate": 1.2739463601532566e-05,
"loss": 1.7526,
"step": 8280
},
{
"epoch": 3.7888482632541134,
"grad_norm": 77173.703125,
"learning_rate": 1.2691570881226053e-05,
"loss": 1.7784,
"step": 8290
},
{
"epoch": 3.7934186471663622,
"grad_norm": 60719.59375,
"learning_rate": 1.2643678160919542e-05,
"loss": 1.7124,
"step": 8300
},
{
"epoch": 3.7979890310786106,
"grad_norm": 71459.625,
"learning_rate": 1.2595785440613029e-05,
"loss": 1.6099,
"step": 8310
},
{
"epoch": 3.802559414990859,
"grad_norm": 57145.5546875,
"learning_rate": 1.2547892720306514e-05,
"loss": 1.7385,
"step": 8320
},
{
"epoch": 3.807129798903108,
"grad_norm": 72725.9296875,
"learning_rate": 1.25e-05,
"loss": 1.7214,
"step": 8330
},
{
"epoch": 3.8117001828153567,
"grad_norm": 135925.65625,
"learning_rate": 1.2452107279693487e-05,
"loss": 1.719,
"step": 8340
},
{
"epoch": 3.816270566727605,
"grad_norm": 48055.78515625,
"learning_rate": 1.2404214559386974e-05,
"loss": 1.7011,
"step": 8350
},
{
"epoch": 3.8208409506398535,
"grad_norm": 120826.3359375,
"learning_rate": 1.2356321839080461e-05,
"loss": 1.6556,
"step": 8360
},
{
"epoch": 3.8254113345521024,
"grad_norm": 98967.4453125,
"learning_rate": 1.2308429118773946e-05,
"loss": 1.6559,
"step": 8370
},
{
"epoch": 3.829981718464351,
"grad_norm": 67471.8125,
"learning_rate": 1.2260536398467433e-05,
"loss": 1.6941,
"step": 8380
},
{
"epoch": 3.8345521023765996,
"grad_norm": 73149.8359375,
"learning_rate": 1.221264367816092e-05,
"loss": 1.7197,
"step": 8390
},
{
"epoch": 3.839122486288848,
"grad_norm": 59362.51953125,
"learning_rate": 1.2164750957854407e-05,
"loss": 1.7067,
"step": 8400
},
{
"epoch": 3.843692870201097,
"grad_norm": 103423.5859375,
"learning_rate": 1.2116858237547892e-05,
"loss": 1.5936,
"step": 8410
},
{
"epoch": 3.8482632541133457,
"grad_norm": 68154.90625,
"learning_rate": 1.206896551724138e-05,
"loss": 1.7166,
"step": 8420
},
{
"epoch": 3.852833638025594,
"grad_norm": 81413.09375,
"learning_rate": 1.2021072796934866e-05,
"loss": 1.7183,
"step": 8430
},
{
"epoch": 3.857404021937843,
"grad_norm": 67458.328125,
"learning_rate": 1.1973180076628353e-05,
"loss": 1.7063,
"step": 8440
},
{
"epoch": 3.8619744058500913,
"grad_norm": 81477.78125,
"learning_rate": 1.192528735632184e-05,
"loss": 1.6622,
"step": 8450
},
{
"epoch": 3.86654478976234,
"grad_norm": 94965.6953125,
"learning_rate": 1.1877394636015327e-05,
"loss": 1.6777,
"step": 8460
},
{
"epoch": 3.8711151736745886,
"grad_norm": 64403.4375,
"learning_rate": 1.1829501915708814e-05,
"loss": 1.7471,
"step": 8470
},
{
"epoch": 3.8756855575868374,
"grad_norm": 72309.5859375,
"learning_rate": 1.1781609195402299e-05,
"loss": 1.7836,
"step": 8480
},
{
"epoch": 3.880255941499086,
"grad_norm": 80551.765625,
"learning_rate": 1.1733716475095787e-05,
"loss": 1.6478,
"step": 8490
},
{
"epoch": 3.8848263254113347,
"grad_norm": 86743.6015625,
"learning_rate": 1.1685823754789272e-05,
"loss": 1.6723,
"step": 8500
},
{
"epoch": 3.889396709323583,
"grad_norm": 60500.5,
"learning_rate": 1.163793103448276e-05,
"loss": 1.779,
"step": 8510
},
{
"epoch": 3.893967093235832,
"grad_norm": 81024.5703125,
"learning_rate": 1.1590038314176246e-05,
"loss": 1.5701,
"step": 8520
},
{
"epoch": 3.8985374771480803,
"grad_norm": 80383.0234375,
"learning_rate": 1.1542145593869733e-05,
"loss": 1.7806,
"step": 8530
},
{
"epoch": 3.903107861060329,
"grad_norm": 76206.0,
"learning_rate": 1.1494252873563218e-05,
"loss": 1.6846,
"step": 8540
},
{
"epoch": 3.9076782449725775,
"grad_norm": 105761.515625,
"learning_rate": 1.1446360153256705e-05,
"loss": 1.7613,
"step": 8550
},
{
"epoch": 3.9122486288848264,
"grad_norm": 76597.375,
"learning_rate": 1.1398467432950192e-05,
"loss": 1.6565,
"step": 8560
},
{
"epoch": 3.916819012797075,
"grad_norm": 73221.8984375,
"learning_rate": 1.1350574712643679e-05,
"loss": 1.7441,
"step": 8570
},
{
"epoch": 3.9213893967093236,
"grad_norm": 79467.125,
"learning_rate": 1.1302681992337164e-05,
"loss": 1.6712,
"step": 8580
},
{
"epoch": 3.9259597806215725,
"grad_norm": 72400.3125,
"learning_rate": 1.1254789272030653e-05,
"loss": 1.6967,
"step": 8590
},
{
"epoch": 3.930530164533821,
"grad_norm": 57804.4296875,
"learning_rate": 1.1206896551724138e-05,
"loss": 1.7333,
"step": 8600
},
{
"epoch": 3.9351005484460693,
"grad_norm": 67051.7734375,
"learning_rate": 1.1159003831417625e-05,
"loss": 1.6614,
"step": 8610
},
{
"epoch": 3.939670932358318,
"grad_norm": 104811.15625,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.7326,
"step": 8620
},
{
"epoch": 3.944241316270567,
"grad_norm": 110914.2265625,
"learning_rate": 1.1063218390804599e-05,
"loss": 1.6674,
"step": 8630
},
{
"epoch": 3.9488117001828154,
"grad_norm": 79537.1875,
"learning_rate": 1.1015325670498085e-05,
"loss": 1.6492,
"step": 8640
},
{
"epoch": 3.9533820840950638,
"grad_norm": 60097.46484375,
"learning_rate": 1.096743295019157e-05,
"loss": 1.6846,
"step": 8650
},
{
"epoch": 3.9579524680073126,
"grad_norm": 118254.2265625,
"learning_rate": 1.091954022988506e-05,
"loss": 1.6961,
"step": 8660
},
{
"epoch": 3.9625228519195614,
"grad_norm": 114773.8046875,
"learning_rate": 1.0871647509578544e-05,
"loss": 1.7114,
"step": 8670
},
{
"epoch": 3.96709323583181,
"grad_norm": 87263.125,
"learning_rate": 1.0823754789272031e-05,
"loss": 1.7308,
"step": 8680
},
{
"epoch": 3.9716636197440582,
"grad_norm": 66768.859375,
"learning_rate": 1.0775862068965516e-05,
"loss": 1.6712,
"step": 8690
},
{
"epoch": 3.976234003656307,
"grad_norm": 119385.375,
"learning_rate": 1.0727969348659005e-05,
"loss": 1.7558,
"step": 8700
},
{
"epoch": 3.980804387568556,
"grad_norm": 71487.484375,
"learning_rate": 1.068007662835249e-05,
"loss": 1.7924,
"step": 8710
},
{
"epoch": 3.9853747714808043,
"grad_norm": 81396.0234375,
"learning_rate": 1.0632183908045977e-05,
"loss": 1.6232,
"step": 8720
},
{
"epoch": 3.989945155393053,
"grad_norm": 89533.171875,
"learning_rate": 1.0584291187739464e-05,
"loss": 1.7174,
"step": 8730
},
{
"epoch": 3.9945155393053016,
"grad_norm": 72157.4765625,
"learning_rate": 1.053639846743295e-05,
"loss": 1.7366,
"step": 8740
},
{
"epoch": 3.9990859232175504,
"grad_norm": 71103.40625,
"learning_rate": 1.0488505747126438e-05,
"loss": 1.6741,
"step": 8750
},
{
"epoch": 4.0,
"eval_loss": 1.6915712356567383,
"eval_runtime": 345.8106,
"eval_samples_per_second": 43.376,
"eval_steps_per_second": 1.356,
"step": 8752
},
{
"epoch": 4.003656307129799,
"grad_norm": 75867.875,
"learning_rate": 1.0440613026819925e-05,
"loss": 1.6729,
"step": 8760
},
{
"epoch": 4.008226691042047,
"grad_norm": 72915.7890625,
"learning_rate": 1.0392720306513411e-05,
"loss": 1.7197,
"step": 8770
},
{
"epoch": 4.012797074954296,
"grad_norm": 92003.453125,
"learning_rate": 1.0344827586206897e-05,
"loss": 1.6728,
"step": 8780
},
{
"epoch": 4.017367458866545,
"grad_norm": 120198.3359375,
"learning_rate": 1.0296934865900384e-05,
"loss": 1.7737,
"step": 8790
},
{
"epoch": 4.021937842778794,
"grad_norm": 102790.4375,
"learning_rate": 1.024904214559387e-05,
"loss": 1.7166,
"step": 8800
},
{
"epoch": 4.026508226691042,
"grad_norm": 83954.15625,
"learning_rate": 1.0201149425287357e-05,
"loss": 1.6454,
"step": 8810
},
{
"epoch": 4.0310786106032905,
"grad_norm": 100407.6484375,
"learning_rate": 1.0153256704980842e-05,
"loss": 1.7711,
"step": 8820
},
{
"epoch": 4.035648994515539,
"grad_norm": 70433.90625,
"learning_rate": 1.0105363984674331e-05,
"loss": 1.7616,
"step": 8830
},
{
"epoch": 4.040219378427788,
"grad_norm": 73853.703125,
"learning_rate": 1.0057471264367816e-05,
"loss": 1.7407,
"step": 8840
},
{
"epoch": 4.044789762340036,
"grad_norm": 89838.96875,
"learning_rate": 1.0009578544061303e-05,
"loss": 1.6869,
"step": 8850
},
{
"epoch": 4.049360146252285,
"grad_norm": 98299.5859375,
"learning_rate": 9.96168582375479e-06,
"loss": 1.6584,
"step": 8860
},
{
"epoch": 4.053930530164534,
"grad_norm": 52650.97265625,
"learning_rate": 9.913793103448277e-06,
"loss": 1.7039,
"step": 8870
},
{
"epoch": 4.058500914076783,
"grad_norm": 98332.7890625,
"learning_rate": 9.865900383141764e-06,
"loss": 1.7345,
"step": 8880
},
{
"epoch": 4.063071297989031,
"grad_norm": 87076.296875,
"learning_rate": 9.818007662835249e-06,
"loss": 1.6923,
"step": 8890
},
{
"epoch": 4.0676416819012795,
"grad_norm": 54348.390625,
"learning_rate": 9.770114942528738e-06,
"loss": 1.7026,
"step": 8900
},
{
"epoch": 4.072212065813528,
"grad_norm": 57868.62109375,
"learning_rate": 9.722222222222223e-06,
"loss": 1.7003,
"step": 8910
},
{
"epoch": 4.076782449725777,
"grad_norm": 65227.4375,
"learning_rate": 9.67432950191571e-06,
"loss": 1.5978,
"step": 8920
},
{
"epoch": 4.081352833638025,
"grad_norm": 105873.453125,
"learning_rate": 9.626436781609195e-06,
"loss": 1.7575,
"step": 8930
},
{
"epoch": 4.085923217550274,
"grad_norm": 112255.6640625,
"learning_rate": 9.578544061302683e-06,
"loss": 1.6445,
"step": 8940
},
{
"epoch": 4.090493601462523,
"grad_norm": 65130.7109375,
"learning_rate": 9.530651340996169e-06,
"loss": 1.7077,
"step": 8950
},
{
"epoch": 4.095063985374772,
"grad_norm": 116178.6796875,
"learning_rate": 9.482758620689655e-06,
"loss": 1.7243,
"step": 8960
},
{
"epoch": 4.0996343692870205,
"grad_norm": 105348.765625,
"learning_rate": 9.434865900383142e-06,
"loss": 1.6588,
"step": 8970
},
{
"epoch": 4.1042047531992685,
"grad_norm": 64916.08203125,
"learning_rate": 9.386973180076629e-06,
"loss": 1.6274,
"step": 8980
},
{
"epoch": 4.108775137111517,
"grad_norm": 85616.9453125,
"learning_rate": 9.339080459770114e-06,
"loss": 1.7181,
"step": 8990
},
{
"epoch": 4.113345521023766,
"grad_norm": 56802.30859375,
"learning_rate": 9.291187739463603e-06,
"loss": 1.7205,
"step": 9000
},
{
"epoch": 4.117915904936015,
"grad_norm": 106071.6015625,
"learning_rate": 9.243295019157088e-06,
"loss": 1.663,
"step": 9010
},
{
"epoch": 4.122486288848263,
"grad_norm": 84213.0859375,
"learning_rate": 9.195402298850575e-06,
"loss": 1.6674,
"step": 9020
},
{
"epoch": 4.127056672760512,
"grad_norm": 83103.328125,
"learning_rate": 9.147509578544062e-06,
"loss": 1.7418,
"step": 9030
},
{
"epoch": 4.131627056672761,
"grad_norm": 45266.80859375,
"learning_rate": 9.099616858237549e-06,
"loss": 1.6215,
"step": 9040
},
{
"epoch": 4.1361974405850095,
"grad_norm": 83939.390625,
"learning_rate": 9.051724137931036e-06,
"loss": 1.7258,
"step": 9050
},
{
"epoch": 4.140767824497257,
"grad_norm": 98675.046875,
"learning_rate": 9.00383141762452e-06,
"loss": 1.6511,
"step": 9060
},
{
"epoch": 4.145338208409506,
"grad_norm": 78594.921875,
"learning_rate": 8.95593869731801e-06,
"loss": 1.6855,
"step": 9070
},
{
"epoch": 4.149908592321755,
"grad_norm": 78093.4609375,
"learning_rate": 8.908045977011495e-06,
"loss": 1.7284,
"step": 9080
},
{
"epoch": 4.154478976234004,
"grad_norm": 98573.2890625,
"learning_rate": 8.860153256704981e-06,
"loss": 1.686,
"step": 9090
},
{
"epoch": 4.159049360146252,
"grad_norm": 56181.88671875,
"learning_rate": 8.812260536398467e-06,
"loss": 1.7138,
"step": 9100
},
{
"epoch": 4.163619744058501,
"grad_norm": 75070.2421875,
"learning_rate": 8.764367816091955e-06,
"loss": 1.7444,
"step": 9110
},
{
"epoch": 4.16819012797075,
"grad_norm": 62708.078125,
"learning_rate": 8.71647509578544e-06,
"loss": 1.7641,
"step": 9120
},
{
"epoch": 4.1727605118829985,
"grad_norm": 98152.28125,
"learning_rate": 8.668582375478927e-06,
"loss": 1.7544,
"step": 9130
},
{
"epoch": 4.177330895795246,
"grad_norm": 59807.0546875,
"learning_rate": 8.620689655172414e-06,
"loss": 1.727,
"step": 9140
},
{
"epoch": 4.181901279707495,
"grad_norm": 61243.06640625,
"learning_rate": 8.572796934865901e-06,
"loss": 1.6752,
"step": 9150
},
{
"epoch": 4.186471663619744,
"grad_norm": 69440.40625,
"learning_rate": 8.524904214559388e-06,
"loss": 1.7704,
"step": 9160
},
{
"epoch": 4.191042047531993,
"grad_norm": 132799.5625,
"learning_rate": 8.477011494252873e-06,
"loss": 1.7243,
"step": 9170
},
{
"epoch": 4.195612431444241,
"grad_norm": 60873.515625,
"learning_rate": 8.429118773946362e-06,
"loss": 1.7021,
"step": 9180
},
{
"epoch": 4.20018281535649,
"grad_norm": 56033.59375,
"learning_rate": 8.381226053639847e-06,
"loss": 1.6721,
"step": 9190
},
{
"epoch": 4.204753199268739,
"grad_norm": 65075.58984375,
"learning_rate": 8.333333333333334e-06,
"loss": 1.7053,
"step": 9200
},
{
"epoch": 4.209323583180987,
"grad_norm": 65212.20703125,
"learning_rate": 8.28544061302682e-06,
"loss": 1.6667,
"step": 9210
},
{
"epoch": 4.213893967093236,
"grad_norm": 98034.7109375,
"learning_rate": 8.237547892720307e-06,
"loss": 1.7291,
"step": 9220
},
{
"epoch": 4.218464351005484,
"grad_norm": 112262.515625,
"learning_rate": 8.189655172413793e-06,
"loss": 1.7307,
"step": 9230
},
{
"epoch": 4.223034734917733,
"grad_norm": 42643.26171875,
"learning_rate": 8.14176245210728e-06,
"loss": 1.7146,
"step": 9240
},
{
"epoch": 4.227605118829982,
"grad_norm": 120319.0703125,
"learning_rate": 8.093869731800766e-06,
"loss": 1.5791,
"step": 9250
},
{
"epoch": 4.232175502742231,
"grad_norm": 111697.4765625,
"learning_rate": 8.045977011494253e-06,
"loss": 1.7305,
"step": 9260
},
{
"epoch": 4.236745886654479,
"grad_norm": 82615.4453125,
"learning_rate": 7.998084291187739e-06,
"loss": 1.6441,
"step": 9270
},
{
"epoch": 4.2413162705667276,
"grad_norm": 112459.7890625,
"learning_rate": 7.950191570881227e-06,
"loss": 1.6667,
"step": 9280
},
{
"epoch": 4.245886654478976,
"grad_norm": 89633.421875,
"learning_rate": 7.902298850574712e-06,
"loss": 1.7402,
"step": 9290
},
{
"epoch": 4.250457038391225,
"grad_norm": 73259.6953125,
"learning_rate": 7.854406130268199e-06,
"loss": 1.725,
"step": 9300
},
{
"epoch": 4.255027422303473,
"grad_norm": 87751.640625,
"learning_rate": 7.806513409961686e-06,
"loss": 1.7017,
"step": 9310
},
{
"epoch": 4.259597806215722,
"grad_norm": 130956.0546875,
"learning_rate": 7.758620689655173e-06,
"loss": 1.6762,
"step": 9320
},
{
"epoch": 4.264168190127971,
"grad_norm": 54888.203125,
"learning_rate": 7.71072796934866e-06,
"loss": 1.772,
"step": 9330
},
{
"epoch": 4.26873857404022,
"grad_norm": 55581.7109375,
"learning_rate": 7.662835249042145e-06,
"loss": 1.7176,
"step": 9340
},
{
"epoch": 4.273308957952468,
"grad_norm": 89327.875,
"learning_rate": 7.614942528735633e-06,
"loss": 1.7243,
"step": 9350
},
{
"epoch": 4.2778793418647165,
"grad_norm": 76867.40625,
"learning_rate": 7.567049808429119e-06,
"loss": 1.7136,
"step": 9360
},
{
"epoch": 4.282449725776965,
"grad_norm": 131182.859375,
"learning_rate": 7.519157088122606e-06,
"loss": 1.7012,
"step": 9370
},
{
"epoch": 4.287020109689214,
"grad_norm": 80961.5546875,
"learning_rate": 7.4712643678160925e-06,
"loss": 1.7052,
"step": 9380
},
{
"epoch": 4.291590493601462,
"grad_norm": 64866.6796875,
"learning_rate": 7.423371647509579e-06,
"loss": 1.7689,
"step": 9390
},
{
"epoch": 4.296160877513711,
"grad_norm": 65769.59375,
"learning_rate": 7.375478927203065e-06,
"loss": 1.7517,
"step": 9400
},
{
"epoch": 4.30073126142596,
"grad_norm": 54188.25,
"learning_rate": 7.3275862068965514e-06,
"loss": 1.7222,
"step": 9410
},
{
"epoch": 4.305301645338209,
"grad_norm": 122926.7734375,
"learning_rate": 7.279693486590039e-06,
"loss": 1.6745,
"step": 9420
},
{
"epoch": 4.309872029250457,
"grad_norm": 69720.5078125,
"learning_rate": 7.231800766283525e-06,
"loss": 1.7206,
"step": 9430
},
{
"epoch": 4.3144424131627055,
"grad_norm": 127599.6953125,
"learning_rate": 7.183908045977011e-06,
"loss": 1.6919,
"step": 9440
},
{
"epoch": 4.319012797074954,
"grad_norm": 75789.5546875,
"learning_rate": 7.136015325670499e-06,
"loss": 1.725,
"step": 9450
},
{
"epoch": 4.323583180987203,
"grad_norm": 99748.046875,
"learning_rate": 7.088122605363985e-06,
"loss": 1.7164,
"step": 9460
},
{
"epoch": 4.328153564899452,
"grad_norm": 85926.734375,
"learning_rate": 7.040229885057471e-06,
"loss": 1.7377,
"step": 9470
},
{
"epoch": 4.3327239488117,
"grad_norm": 85478.2109375,
"learning_rate": 6.992337164750958e-06,
"loss": 1.7319,
"step": 9480
},
{
"epoch": 4.337294332723949,
"grad_norm": 72827.2265625,
"learning_rate": 6.944444444444445e-06,
"loss": 1.7371,
"step": 9490
},
{
"epoch": 4.341864716636198,
"grad_norm": 93393.625,
"learning_rate": 6.896551724137932e-06,
"loss": 1.6363,
"step": 9500
},
{
"epoch": 4.346435100548446,
"grad_norm": 90090.8359375,
"learning_rate": 6.848659003831418e-06,
"loss": 1.6988,
"step": 9510
},
{
"epoch": 4.3510054844606945,
"grad_norm": 85922.3203125,
"learning_rate": 6.800766283524905e-06,
"loss": 1.6765,
"step": 9520
},
{
"epoch": 4.355575868372943,
"grad_norm": 94569.2109375,
"learning_rate": 6.7528735632183914e-06,
"loss": 1.6928,
"step": 9530
},
{
"epoch": 4.360146252285192,
"grad_norm": 86991.3984375,
"learning_rate": 6.7049808429118775e-06,
"loss": 1.7658,
"step": 9540
},
{
"epoch": 4.364716636197441,
"grad_norm": 129308.6171875,
"learning_rate": 6.657088122605365e-06,
"loss": 1.7118,
"step": 9550
},
{
"epoch": 4.369287020109689,
"grad_norm": 58696.4765625,
"learning_rate": 6.609195402298851e-06,
"loss": 1.7136,
"step": 9560
},
{
"epoch": 4.373857404021938,
"grad_norm": 72559.2265625,
"learning_rate": 6.561302681992337e-06,
"loss": 1.7245,
"step": 9570
},
{
"epoch": 4.378427787934187,
"grad_norm": 96479.890625,
"learning_rate": 6.513409961685823e-06,
"loss": 1.7426,
"step": 9580
},
{
"epoch": 4.3829981718464355,
"grad_norm": 52625.5859375,
"learning_rate": 6.465517241379311e-06,
"loss": 1.818,
"step": 9590
},
{
"epoch": 4.387568555758683,
"grad_norm": 74031.9765625,
"learning_rate": 6.417624521072797e-06,
"loss": 1.688,
"step": 9600
},
{
"epoch": 4.392138939670932,
"grad_norm": 70750.546875,
"learning_rate": 6.369731800766283e-06,
"loss": 1.6336,
"step": 9610
},
{
"epoch": 4.396709323583181,
"grad_norm": 79010.375,
"learning_rate": 6.321839080459771e-06,
"loss": 1.6344,
"step": 9620
},
{
"epoch": 4.40127970749543,
"grad_norm": 52663.4765625,
"learning_rate": 6.273946360153257e-06,
"loss": 1.7024,
"step": 9630
},
{
"epoch": 4.405850091407678,
"grad_norm": 88580.4375,
"learning_rate": 6.226053639846744e-06,
"loss": 1.7704,
"step": 9640
},
{
"epoch": 4.410420475319927,
"grad_norm": 75858.3828125,
"learning_rate": 6.178160919540231e-06,
"loss": 1.7163,
"step": 9650
},
{
"epoch": 4.414990859232176,
"grad_norm": 85468.9296875,
"learning_rate": 6.130268199233717e-06,
"loss": 1.8792,
"step": 9660
},
{
"epoch": 4.4195612431444244,
"grad_norm": 149377.140625,
"learning_rate": 6.0823754789272035e-06,
"loss": 1.7492,
"step": 9670
},
{
"epoch": 4.424131627056672,
"grad_norm": 96749.546875,
"learning_rate": 6.03448275862069e-06,
"loss": 1.641,
"step": 9680
},
{
"epoch": 4.428702010968921,
"grad_norm": 114815.234375,
"learning_rate": 5.9865900383141764e-06,
"loss": 1.7394,
"step": 9690
},
{
"epoch": 4.43327239488117,
"grad_norm": 117656.1953125,
"learning_rate": 5.938697318007663e-06,
"loss": 1.7659,
"step": 9700
},
{
"epoch": 4.437842778793419,
"grad_norm": 91634.71875,
"learning_rate": 5.890804597701149e-06,
"loss": 1.806,
"step": 9710
},
{
"epoch": 4.442413162705667,
"grad_norm": 59309.10546875,
"learning_rate": 5.842911877394636e-06,
"loss": 1.7264,
"step": 9720
},
{
"epoch": 4.446983546617916,
"grad_norm": 102864.578125,
"learning_rate": 5.795019157088123e-06,
"loss": 1.6442,
"step": 9730
},
{
"epoch": 4.451553930530165,
"grad_norm": 48123.75,
"learning_rate": 5.747126436781609e-06,
"loss": 1.7215,
"step": 9740
},
{
"epoch": 4.456124314442413,
"grad_norm": 59340.78125,
"learning_rate": 5.699233716475096e-06,
"loss": 1.7105,
"step": 9750
},
{
"epoch": 4.460694698354661,
"grad_norm": 47793.0078125,
"learning_rate": 5.651340996168582e-06,
"loss": 1.773,
"step": 9760
},
{
"epoch": 4.46526508226691,
"grad_norm": 68314.3828125,
"learning_rate": 5.603448275862069e-06,
"loss": 1.8067,
"step": 9770
},
{
"epoch": 4.469835466179159,
"grad_norm": 127164.171875,
"learning_rate": 5.555555555555556e-06,
"loss": 1.712,
"step": 9780
},
{
"epoch": 4.474405850091408,
"grad_norm": 108175.9296875,
"learning_rate": 5.507662835249043e-06,
"loss": 1.6775,
"step": 9790
},
{
"epoch": 4.478976234003657,
"grad_norm": 83982.234375,
"learning_rate": 5.45977011494253e-06,
"loss": 1.7235,
"step": 9800
},
{
"epoch": 4.483546617915905,
"grad_norm": 116926.3515625,
"learning_rate": 5.411877394636016e-06,
"loss": 1.7325,
"step": 9810
},
{
"epoch": 4.4881170018281535,
"grad_norm": 85041.0234375,
"learning_rate": 5.3639846743295025e-06,
"loss": 1.7853,
"step": 9820
},
{
"epoch": 4.492687385740402,
"grad_norm": 67453.5859375,
"learning_rate": 5.3160919540229885e-06,
"loss": 1.7712,
"step": 9830
},
{
"epoch": 4.497257769652651,
"grad_norm": 120161.9140625,
"learning_rate": 5.268199233716475e-06,
"loss": 1.7047,
"step": 9840
},
{
"epoch": 4.501828153564899,
"grad_norm": 91166.8984375,
"learning_rate": 5.220306513409962e-06,
"loss": 1.7825,
"step": 9850
},
{
"epoch": 4.506398537477148,
"grad_norm": 80539.265625,
"learning_rate": 5.172413793103448e-06,
"loss": 1.7293,
"step": 9860
},
{
"epoch": 4.510968921389397,
"grad_norm": 89111.5390625,
"learning_rate": 5.124521072796935e-06,
"loss": 1.6972,
"step": 9870
},
{
"epoch": 4.515539305301646,
"grad_norm": 106499.9453125,
"learning_rate": 5.076628352490421e-06,
"loss": 1.6945,
"step": 9880
},
{
"epoch": 4.520109689213894,
"grad_norm": 81342.203125,
"learning_rate": 5.028735632183908e-06,
"loss": 1.7311,
"step": 9890
},
{
"epoch": 4.5246800731261425,
"grad_norm": 63680.8359375,
"learning_rate": 4.980842911877395e-06,
"loss": 1.6724,
"step": 9900
},
{
"epoch": 4.529250457038391,
"grad_norm": 80776.640625,
"learning_rate": 4.932950191570882e-06,
"loss": 1.7283,
"step": 9910
},
{
"epoch": 4.53382084095064,
"grad_norm": 107851.6328125,
"learning_rate": 4.885057471264369e-06,
"loss": 1.7028,
"step": 9920
},
{
"epoch": 4.538391224862888,
"grad_norm": 79906.65625,
"learning_rate": 4.837164750957855e-06,
"loss": 1.6608,
"step": 9930
},
{
"epoch": 4.542961608775137,
"grad_norm": 67892.0,
"learning_rate": 4.789272030651342e-06,
"loss": 1.6717,
"step": 9940
},
{
"epoch": 4.547531992687386,
"grad_norm": 79051.953125,
"learning_rate": 4.741379310344828e-06,
"loss": 1.6859,
"step": 9950
},
{
"epoch": 4.552102376599635,
"grad_norm": 107722.7109375,
"learning_rate": 4.6934865900383146e-06,
"loss": 1.768,
"step": 9960
},
{
"epoch": 4.556672760511883,
"grad_norm": 73130.265625,
"learning_rate": 4.6455938697318015e-06,
"loss": 1.623,
"step": 9970
},
{
"epoch": 4.5612431444241315,
"grad_norm": 96307.8984375,
"learning_rate": 4.5977011494252875e-06,
"loss": 1.7251,
"step": 9980
},
{
"epoch": 4.56581352833638,
"grad_norm": 92756.6484375,
"learning_rate": 4.549808429118774e-06,
"loss": 1.7256,
"step": 9990
},
{
"epoch": 4.570383912248629,
"grad_norm": 99314.5390625,
"learning_rate": 4.50191570881226e-06,
"loss": 1.7193,
"step": 10000
},
{
"epoch": 4.574954296160877,
"grad_norm": 89754.9453125,
"learning_rate": 4.454022988505747e-06,
"loss": 1.7082,
"step": 10010
},
{
"epoch": 4.579524680073126,
"grad_norm": 75742.890625,
"learning_rate": 4.406130268199233e-06,
"loss": 1.7386,
"step": 10020
},
{
"epoch": 4.584095063985375,
"grad_norm": 95144.9921875,
"learning_rate": 4.35823754789272e-06,
"loss": 1.7612,
"step": 10030
},
{
"epoch": 4.588665447897624,
"grad_norm": 74380.0546875,
"learning_rate": 4.310344827586207e-06,
"loss": 1.7231,
"step": 10040
},
{
"epoch": 4.5932358318098725,
"grad_norm": 75351.3515625,
"learning_rate": 4.262452107279694e-06,
"loss": 1.6865,
"step": 10050
},
{
"epoch": 4.5978062157221204,
"grad_norm": 126452.8359375,
"learning_rate": 4.214559386973181e-06,
"loss": 1.7256,
"step": 10060
},
{
"epoch": 4.602376599634369,
"grad_norm": 50301.078125,
"learning_rate": 4.166666666666667e-06,
"loss": 1.7692,
"step": 10070
},
{
"epoch": 4.606946983546618,
"grad_norm": 72251.7265625,
"learning_rate": 4.118773946360154e-06,
"loss": 1.7593,
"step": 10080
},
{
"epoch": 4.611517367458866,
"grad_norm": 63932.359375,
"learning_rate": 4.07088122605364e-06,
"loss": 1.768,
"step": 10090
},
{
"epoch": 4.616087751371115,
"grad_norm": 71254.5390625,
"learning_rate": 4.022988505747127e-06,
"loss": 1.736,
"step": 10100
},
{
"epoch": 4.620658135283364,
"grad_norm": 105173.09375,
"learning_rate": 3.9750957854406135e-06,
"loss": 1.7123,
"step": 10110
},
{
"epoch": 4.625228519195613,
"grad_norm": 55138.73828125,
"learning_rate": 3.9272030651340996e-06,
"loss": 1.7062,
"step": 10120
},
{
"epoch": 4.6297989031078615,
"grad_norm": 56330.21484375,
"learning_rate": 3.8793103448275865e-06,
"loss": 1.6406,
"step": 10130
},
{
"epoch": 4.634369287020109,
"grad_norm": 134460.59375,
"learning_rate": 3.8314176245210725e-06,
"loss": 1.7209,
"step": 10140
},
{
"epoch": 4.638939670932358,
"grad_norm": 67907.453125,
"learning_rate": 3.7835249042145594e-06,
"loss": 1.7233,
"step": 10150
},
{
"epoch": 4.643510054844607,
"grad_norm": 100771.4921875,
"learning_rate": 3.7356321839080462e-06,
"loss": 1.6853,
"step": 10160
},
{
"epoch": 4.648080438756856,
"grad_norm": 54433.70703125,
"learning_rate": 3.6877394636015327e-06,
"loss": 1.5971,
"step": 10170
},
{
"epoch": 4.652650822669104,
"grad_norm": 74801.8515625,
"learning_rate": 3.6398467432950196e-06,
"loss": 1.6912,
"step": 10180
},
{
"epoch": 4.657221206581353,
"grad_norm": 89413.6328125,
"learning_rate": 3.5919540229885056e-06,
"loss": 1.7757,
"step": 10190
},
{
"epoch": 4.661791590493602,
"grad_norm": 55121.12109375,
"learning_rate": 3.5440613026819925e-06,
"loss": 1.7154,
"step": 10200
},
{
"epoch": 4.66636197440585,
"grad_norm": 71017.0078125,
"learning_rate": 3.496168582375479e-06,
"loss": 1.6741,
"step": 10210
},
{
"epoch": 4.670932358318098,
"grad_norm": 70393.5859375,
"learning_rate": 3.448275862068966e-06,
"loss": 1.7049,
"step": 10220
},
{
"epoch": 4.675502742230347,
"grad_norm": 68136.203125,
"learning_rate": 3.4003831417624527e-06,
"loss": 1.7331,
"step": 10230
},
{
"epoch": 4.680073126142596,
"grad_norm": 92633.125,
"learning_rate": 3.3524904214559387e-06,
"loss": 1.7386,
"step": 10240
},
{
"epoch": 4.684643510054845,
"grad_norm": 52863.0234375,
"learning_rate": 3.3045977011494256e-06,
"loss": 1.6676,
"step": 10250
},
{
"epoch": 4.689213893967093,
"grad_norm": 52217.04296875,
"learning_rate": 3.2567049808429117e-06,
"loss": 1.6496,
"step": 10260
},
{
"epoch": 4.693784277879342,
"grad_norm": 71589.21875,
"learning_rate": 3.2088122605363985e-06,
"loss": 1.7128,
"step": 10270
},
{
"epoch": 4.698354661791591,
"grad_norm": 63616.85546875,
"learning_rate": 3.1609195402298854e-06,
"loss": 1.8258,
"step": 10280
},
{
"epoch": 4.702925045703839,
"grad_norm": 51611.06640625,
"learning_rate": 3.113026819923372e-06,
"loss": 1.6953,
"step": 10290
},
{
"epoch": 4.707495429616088,
"grad_norm": 65978.15625,
"learning_rate": 3.0651340996168583e-06,
"loss": 1.7285,
"step": 10300
},
{
"epoch": 4.712065813528336,
"grad_norm": 68540.25,
"learning_rate": 3.017241379310345e-06,
"loss": 1.666,
"step": 10310
},
{
"epoch": 4.716636197440585,
"grad_norm": 121031.6015625,
"learning_rate": 2.9693486590038317e-06,
"loss": 1.7578,
"step": 10320
},
{
"epoch": 4.721206581352834,
"grad_norm": 90117.9296875,
"learning_rate": 2.921455938697318e-06,
"loss": 1.7317,
"step": 10330
},
{
"epoch": 4.725776965265082,
"grad_norm": 113749.5390625,
"learning_rate": 2.8735632183908046e-06,
"loss": 1.7508,
"step": 10340
},
{
"epoch": 4.730347349177331,
"grad_norm": 51193.17578125,
"learning_rate": 2.825670498084291e-06,
"loss": 1.6362,
"step": 10350
},
{
"epoch": 4.7349177330895795,
"grad_norm": 90305.71875,
"learning_rate": 2.777777777777778e-06,
"loss": 1.7621,
"step": 10360
},
{
"epoch": 4.739488117001828,
"grad_norm": 74391.09375,
"learning_rate": 2.729885057471265e-06,
"loss": 1.6677,
"step": 10370
},
{
"epoch": 4.744058500914077,
"grad_norm": 82829.6875,
"learning_rate": 2.6819923371647512e-06,
"loss": 1.8085,
"step": 10380
},
{
"epoch": 4.748628884826325,
"grad_norm": 102442.8515625,
"learning_rate": 2.6340996168582377e-06,
"loss": 1.6941,
"step": 10390
},
{
"epoch": 4.753199268738574,
"grad_norm": 84212.0546875,
"learning_rate": 2.586206896551724e-06,
"loss": 1.6369,
"step": 10400
},
{
"epoch": 4.757769652650823,
"grad_norm": 87777.5234375,
"learning_rate": 2.5383141762452106e-06,
"loss": 1.6629,
"step": 10410
},
{
"epoch": 4.762340036563071,
"grad_norm": 72210.796875,
"learning_rate": 2.4904214559386975e-06,
"loss": 1.7738,
"step": 10420
},
{
"epoch": 4.76691042047532,
"grad_norm": 72672.140625,
"learning_rate": 2.4425287356321844e-06,
"loss": 1.743,
"step": 10430
},
{
"epoch": 4.7714808043875685,
"grad_norm": 52435.03515625,
"learning_rate": 2.394636015325671e-06,
"loss": 1.6503,
"step": 10440
},
{
"epoch": 4.776051188299817,
"grad_norm": 81516.578125,
"learning_rate": 2.3467432950191573e-06,
"loss": 1.7072,
"step": 10450
},
{
"epoch": 4.780621572212066,
"grad_norm": 64971.1328125,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.7349,
"step": 10460
},
{
"epoch": 4.785191956124314,
"grad_norm": 90203.71875,
"learning_rate": 2.25095785440613e-06,
"loss": 1.7305,
"step": 10470
},
{
"epoch": 4.789762340036563,
"grad_norm": 85017.875,
"learning_rate": 2.2030651340996167e-06,
"loss": 1.642,
"step": 10480
},
{
"epoch": 4.794332723948812,
"grad_norm": 88660.4765625,
"learning_rate": 2.1551724137931035e-06,
"loss": 1.7387,
"step": 10490
},
{
"epoch": 4.798903107861061,
"grad_norm": 81671.3984375,
"learning_rate": 2.1072796934865904e-06,
"loss": 1.6963,
"step": 10500
},
{
"epoch": 4.803473491773309,
"grad_norm": 82145.9453125,
"learning_rate": 2.059386973180077e-06,
"loss": 1.689,
"step": 10510
},
{
"epoch": 4.8080438756855575,
"grad_norm": 60723.84375,
"learning_rate": 2.0114942528735633e-06,
"loss": 1.7065,
"step": 10520
},
{
"epoch": 4.812614259597806,
"grad_norm": 79342.8515625,
"learning_rate": 1.9636015325670498e-06,
"loss": 1.6952,
"step": 10530
},
{
"epoch": 4.817184643510055,
"grad_norm": 66902.59375,
"learning_rate": 1.9157088122605362e-06,
"loss": 1.6965,
"step": 10540
},
{
"epoch": 4.821755027422303,
"grad_norm": 57767.9453125,
"learning_rate": 1.8678160919540231e-06,
"loss": 1.7267,
"step": 10550
},
{
"epoch": 4.826325411334552,
"grad_norm": 57847.31640625,
"learning_rate": 1.8199233716475098e-06,
"loss": 1.5725,
"step": 10560
},
{
"epoch": 4.830895795246801,
"grad_norm": 71927.8984375,
"learning_rate": 1.7720306513409962e-06,
"loss": 1.6501,
"step": 10570
},
{
"epoch": 4.83546617915905,
"grad_norm": 74899.0234375,
"learning_rate": 1.724137931034483e-06,
"loss": 1.6678,
"step": 10580
},
{
"epoch": 4.840036563071298,
"grad_norm": 92564.6875,
"learning_rate": 1.6762452107279694e-06,
"loss": 1.6991,
"step": 10590
},
{
"epoch": 4.844606946983546,
"grad_norm": 65702.7109375,
"learning_rate": 1.6283524904214558e-06,
"loss": 1.7309,
"step": 10600
},
{
"epoch": 4.849177330895795,
"grad_norm": 72665.1796875,
"learning_rate": 1.5804597701149427e-06,
"loss": 1.6152,
"step": 10610
},
{
"epoch": 4.853747714808044,
"grad_norm": 94501.109375,
"learning_rate": 1.5325670498084292e-06,
"loss": 1.6927,
"step": 10620
},
{
"epoch": 4.858318098720293,
"grad_norm": 113540.65625,
"learning_rate": 1.4846743295019158e-06,
"loss": 1.7562,
"step": 10630
},
{
"epoch": 4.862888482632541,
"grad_norm": 60082.1484375,
"learning_rate": 1.4367816091954023e-06,
"loss": 1.743,
"step": 10640
},
{
"epoch": 4.86745886654479,
"grad_norm": 77984.4921875,
"learning_rate": 1.388888888888889e-06,
"loss": 1.597,
"step": 10650
},
{
"epoch": 4.872029250457039,
"grad_norm": 77285.3515625,
"learning_rate": 1.3409961685823756e-06,
"loss": 1.6401,
"step": 10660
},
{
"epoch": 4.876599634369287,
"grad_norm": 62164.921875,
"learning_rate": 1.293103448275862e-06,
"loss": 1.7019,
"step": 10670
},
{
"epoch": 4.881170018281535,
"grad_norm": 82581.1875,
"learning_rate": 1.2452107279693487e-06,
"loss": 1.6698,
"step": 10680
},
{
"epoch": 4.885740402193784,
"grad_norm": 191003.421875,
"learning_rate": 1.1973180076628354e-06,
"loss": 1.7425,
"step": 10690
},
{
"epoch": 4.890310786106033,
"grad_norm": 70496.2421875,
"learning_rate": 1.1494252873563219e-06,
"loss": 1.7575,
"step": 10700
},
{
"epoch": 4.894881170018282,
"grad_norm": 96979.078125,
"learning_rate": 1.1015325670498083e-06,
"loss": 1.7798,
"step": 10710
},
{
"epoch": 4.89945155393053,
"grad_norm": 75589.8046875,
"learning_rate": 1.0536398467432952e-06,
"loss": 1.6209,
"step": 10720
},
{
"epoch": 4.904021937842779,
"grad_norm": 146879.5,
"learning_rate": 1.0057471264367817e-06,
"loss": 1.7368,
"step": 10730
},
{
"epoch": 4.908592321755028,
"grad_norm": 80107.3125,
"learning_rate": 9.578544061302681e-07,
"loss": 1.7019,
"step": 10740
},
{
"epoch": 4.913162705667276,
"grad_norm": 66398.5703125,
"learning_rate": 9.099616858237549e-07,
"loss": 1.7885,
"step": 10750
},
{
"epoch": 4.917733089579524,
"grad_norm": 76734.59375,
"learning_rate": 8.620689655172415e-07,
"loss": 1.7817,
"step": 10760
},
{
"epoch": 4.922303473491773,
"grad_norm": 79165.3203125,
"learning_rate": 8.141762452107279e-07,
"loss": 1.7004,
"step": 10770
},
{
"epoch": 4.926873857404022,
"grad_norm": 61589.2109375,
"learning_rate": 7.662835249042146e-07,
"loss": 1.7177,
"step": 10780
},
{
"epoch": 4.931444241316271,
"grad_norm": 91896.1015625,
"learning_rate": 7.183908045977011e-07,
"loss": 1.7116,
"step": 10790
},
{
"epoch": 4.936014625228519,
"grad_norm": 76825.4296875,
"learning_rate": 6.704980842911878e-07,
"loss": 1.6501,
"step": 10800
},
{
"epoch": 4.940585009140768,
"grad_norm": 79197.75,
"learning_rate": 6.226053639846744e-07,
"loss": 1.7284,
"step": 10810
},
{
"epoch": 4.9451553930530165,
"grad_norm": 87847.4609375,
"learning_rate": 5.747126436781609e-07,
"loss": 1.7757,
"step": 10820
},
{
"epoch": 4.949725776965265,
"grad_norm": 105324.3125,
"learning_rate": 5.268199233716476e-07,
"loss": 1.7017,
"step": 10830
},
{
"epoch": 4.954296160877513,
"grad_norm": 78621.59375,
"learning_rate": 4.789272030651341e-07,
"loss": 1.6659,
"step": 10840
},
{
"epoch": 4.958866544789762,
"grad_norm": 161616.046875,
"learning_rate": 4.3103448275862073e-07,
"loss": 1.7586,
"step": 10850
},
{
"epoch": 4.963436928702011,
"grad_norm": 87202.765625,
"learning_rate": 3.831417624521073e-07,
"loss": 1.6147,
"step": 10860
},
{
"epoch": 4.96800731261426,
"grad_norm": 87169.0390625,
"learning_rate": 3.352490421455939e-07,
"loss": 1.6565,
"step": 10870
},
{
"epoch": 4.972577696526509,
"grad_norm": 93609.015625,
"learning_rate": 2.8735632183908047e-07,
"loss": 1.8099,
"step": 10880
},
{
"epoch": 4.977148080438757,
"grad_norm": 120831.265625,
"learning_rate": 2.3946360153256703e-07,
"loss": 1.7018,
"step": 10890
},
{
"epoch": 4.9817184643510055,
"grad_norm": 69430.453125,
"learning_rate": 1.9157088122605365e-07,
"loss": 1.6799,
"step": 10900
},
{
"epoch": 4.986288848263254,
"grad_norm": 65192.02734375,
"learning_rate": 1.4367816091954023e-07,
"loss": 1.7379,
"step": 10910
},
{
"epoch": 4.990859232175502,
"grad_norm": 89309.6953125,
"learning_rate": 9.578544061302682e-08,
"loss": 1.7807,
"step": 10920
},
{
"epoch": 4.995429616087751,
"grad_norm": 65604.3203125,
"learning_rate": 4.789272030651341e-08,
"loss": 1.7125,
"step": 10930
},
{
"epoch": 5.0,
"grad_norm": 114907.515625,
"learning_rate": 0.0,
"loss": 1.7282,
"step": 10940
},
{
"epoch": 5.0,
"eval_loss": 1.6917415857315063,
"eval_runtime": 346.3088,
"eval_samples_per_second": 43.314,
"eval_steps_per_second": 1.354,
"step": 10940
}
],
"logging_steps": 10,
"max_steps": 10940,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}