{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 2932,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0068212824010914054,
"grad_norm": 2.541781765483875,
"learning_rate": 1.360544217687075e-06,
"loss": 0.8582,
"num_tokens": 3759146.0,
"step": 5
},
{
"epoch": 0.013642564802182811,
"grad_norm": 1.7880590777745466,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.852,
"num_tokens": 7668808.0,
"step": 10
},
{
"epoch": 0.020463847203274217,
"grad_norm": 1.1396843211846335,
"learning_rate": 4.7619047619047615e-06,
"loss": 0.7985,
"num_tokens": 11368873.0,
"step": 15
},
{
"epoch": 0.027285129604365622,
"grad_norm": 0.7380484470623474,
"learning_rate": 6.462585034013606e-06,
"loss": 0.7495,
"num_tokens": 15118063.0,
"step": 20
},
{
"epoch": 0.034106412005457026,
"grad_norm": 0.5940722286411614,
"learning_rate": 8.163265306122448e-06,
"loss": 0.7103,
"num_tokens": 18906839.0,
"step": 25
},
{
"epoch": 0.040927694406548434,
"grad_norm": 0.5396036096677221,
"learning_rate": 9.863945578231292e-06,
"loss": 0.6796,
"num_tokens": 22641755.0,
"step": 30
},
{
"epoch": 0.047748976807639835,
"grad_norm": 0.48176514966602646,
"learning_rate": 1.1564625850340138e-05,
"loss": 0.6702,
"num_tokens": 26636629.0,
"step": 35
},
{
"epoch": 0.054570259208731244,
"grad_norm": 0.4277566165637127,
"learning_rate": 1.3265306122448982e-05,
"loss": 0.6354,
"num_tokens": 30417967.0,
"step": 40
},
{
"epoch": 0.061391541609822645,
"grad_norm": 0.32948370527060555,
"learning_rate": 1.4965986394557824e-05,
"loss": 0.6353,
"num_tokens": 34231333.0,
"step": 45
},
{
"epoch": 0.06821282401091405,
"grad_norm": 0.30904400396091464,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.6331,
"num_tokens": 37961424.0,
"step": 50
},
{
"epoch": 0.07503410641200546,
"grad_norm": 0.3379471244414341,
"learning_rate": 1.836734693877551e-05,
"loss": 0.6307,
"num_tokens": 41826860.0,
"step": 55
},
{
"epoch": 0.08185538881309687,
"grad_norm": 0.3194776627652625,
"learning_rate": 2.0068027210884355e-05,
"loss": 0.6154,
"num_tokens": 45543403.0,
"step": 60
},
{
"epoch": 0.08867667121418826,
"grad_norm": 0.3576895809208668,
"learning_rate": 2.17687074829932e-05,
"loss": 0.6176,
"num_tokens": 49369486.0,
"step": 65
},
{
"epoch": 0.09549795361527967,
"grad_norm": 0.37184023838165053,
"learning_rate": 2.3469387755102043e-05,
"loss": 0.6053,
"num_tokens": 53010874.0,
"step": 70
},
{
"epoch": 0.10231923601637108,
"grad_norm": 0.3667912581195206,
"learning_rate": 2.5170068027210887e-05,
"loss": 0.6063,
"num_tokens": 56909889.0,
"step": 75
},
{
"epoch": 0.10914051841746249,
"grad_norm": 0.3730278875443016,
"learning_rate": 2.687074829931973e-05,
"loss": 0.5878,
"num_tokens": 60650570.0,
"step": 80
},
{
"epoch": 0.11596180081855388,
"grad_norm": 0.36241315719448003,
"learning_rate": 2.857142857142857e-05,
"loss": 0.5797,
"num_tokens": 64564660.0,
"step": 85
},
{
"epoch": 0.12278308321964529,
"grad_norm": 0.36828276580401526,
"learning_rate": 3.0272108843537418e-05,
"loss": 0.5973,
"num_tokens": 68426882.0,
"step": 90
},
{
"epoch": 0.1296043656207367,
"grad_norm": 0.34302064089420164,
"learning_rate": 3.1972789115646265e-05,
"loss": 0.5922,
"num_tokens": 72252819.0,
"step": 95
},
{
"epoch": 0.1364256480218281,
"grad_norm": 0.4550066149442531,
"learning_rate": 3.36734693877551e-05,
"loss": 0.5781,
"num_tokens": 76160914.0,
"step": 100
},
{
"epoch": 0.1432469304229195,
"grad_norm": 0.40436569138766876,
"learning_rate": 3.5374149659863946e-05,
"loss": 0.5795,
"num_tokens": 80152955.0,
"step": 105
},
{
"epoch": 0.15006821282401092,
"grad_norm": 0.3562171730848496,
"learning_rate": 3.707482993197279e-05,
"loss": 0.5826,
"num_tokens": 83901702.0,
"step": 110
},
{
"epoch": 0.15688949522510232,
"grad_norm": 0.42960332050781525,
"learning_rate": 3.8775510204081634e-05,
"loss": 0.5806,
"num_tokens": 87691018.0,
"step": 115
},
{
"epoch": 0.16371077762619374,
"grad_norm": 0.3710888003891392,
"learning_rate": 4.047619047619048e-05,
"loss": 0.5695,
"num_tokens": 91542428.0,
"step": 120
},
{
"epoch": 0.17053206002728513,
"grad_norm": 0.5761721385944898,
"learning_rate": 4.217687074829932e-05,
"loss": 0.5767,
"num_tokens": 95438170.0,
"step": 125
},
{
"epoch": 0.17735334242837653,
"grad_norm": 0.6241666610197041,
"learning_rate": 4.387755102040816e-05,
"loss": 0.5724,
"num_tokens": 99383692.0,
"step": 130
},
{
"epoch": 0.18417462482946795,
"grad_norm": 0.5871695733429951,
"learning_rate": 4.557823129251701e-05,
"loss": 0.5721,
"num_tokens": 103223537.0,
"step": 135
},
{
"epoch": 0.19099590723055934,
"grad_norm": 0.49124082309652556,
"learning_rate": 4.7278911564625856e-05,
"loss": 0.5849,
"num_tokens": 106901687.0,
"step": 140
},
{
"epoch": 0.19781718963165076,
"grad_norm": 0.4800174598604436,
"learning_rate": 4.89795918367347e-05,
"loss": 0.5564,
"num_tokens": 110840758.0,
"step": 145
},
{
"epoch": 0.20463847203274216,
"grad_norm": 0.44927716136774587,
"learning_rate": 4.9999942738637725e-05,
"loss": 0.5714,
"num_tokens": 114521504.0,
"step": 150
},
{
"epoch": 0.21145975443383355,
"grad_norm": 0.42337064215442904,
"learning_rate": 4.999929855165921e-05,
"loss": 0.5553,
"num_tokens": 118445759.0,
"step": 155
},
{
"epoch": 0.21828103683492497,
"grad_norm": 0.43044313054459166,
"learning_rate": 4.999793862156041e-05,
"loss": 0.5637,
"num_tokens": 122311693.0,
"step": 160
},
{
"epoch": 0.22510231923601637,
"grad_norm": 0.4339776202600696,
"learning_rate": 4.999586299160312e-05,
"loss": 0.566,
"num_tokens": 126094524.0,
"step": 165
},
{
"epoch": 0.23192360163710776,
"grad_norm": 0.40738667516921956,
"learning_rate": 4.999307172781686e-05,
"loss": 0.5592,
"num_tokens": 129971056.0,
"step": 170
},
{
"epoch": 0.23874488403819918,
"grad_norm": 0.40693766273399773,
"learning_rate": 4.998956491899676e-05,
"loss": 0.5735,
"num_tokens": 133765982.0,
"step": 175
},
{
"epoch": 0.24556616643929058,
"grad_norm": 0.45034001459143336,
"learning_rate": 4.9985342676700705e-05,
"loss": 0.5531,
"num_tokens": 137522281.0,
"step": 180
},
{
"epoch": 0.252387448840382,
"grad_norm": 0.36073152691456045,
"learning_rate": 4.998040513524581e-05,
"loss": 0.5474,
"num_tokens": 141196084.0,
"step": 185
},
{
"epoch": 0.2592087312414734,
"grad_norm": 0.3992148561894294,
"learning_rate": 4.997475245170414e-05,
"loss": 0.5583,
"num_tokens": 145136704.0,
"step": 190
},
{
"epoch": 0.2660300136425648,
"grad_norm": 0.38393616918462065,
"learning_rate": 4.996838480589772e-05,
"loss": 0.546,
"num_tokens": 148940122.0,
"step": 195
},
{
"epoch": 0.2728512960436562,
"grad_norm": 0.34915039592664654,
"learning_rate": 4.9961302400392804e-05,
"loss": 0.5555,
"num_tokens": 152809678.0,
"step": 200
},
{
"epoch": 0.27967257844474763,
"grad_norm": 0.4709177431986326,
"learning_rate": 4.9953505460493435e-05,
"loss": 0.549,
"num_tokens": 156683573.0,
"step": 205
},
{
"epoch": 0.286493860845839,
"grad_norm": 0.3989622124157632,
"learning_rate": 4.99449942342343e-05,
"loss": 0.5502,
"num_tokens": 160433326.0,
"step": 210
},
{
"epoch": 0.2933151432469304,
"grad_norm": 0.2963557816370656,
"learning_rate": 4.993576899237278e-05,
"loss": 0.5534,
"num_tokens": 164374316.0,
"step": 215
},
{
"epoch": 0.30013642564802184,
"grad_norm": 0.33146262474141547,
"learning_rate": 4.992583002838041e-05,
"loss": 0.5325,
"num_tokens": 168223299.0,
"step": 220
},
{
"epoch": 0.3069577080491132,
"grad_norm": 0.3804270204403636,
"learning_rate": 4.991517765843349e-05,
"loss": 0.5454,
"num_tokens": 172132000.0,
"step": 225
},
{
"epoch": 0.31377899045020463,
"grad_norm": 0.4050739455664775,
"learning_rate": 4.990381222140305e-05,
"loss": 0.541,
"num_tokens": 176086331.0,
"step": 230
},
{
"epoch": 0.32060027285129605,
"grad_norm": 0.438158122564832,
"learning_rate": 4.989173407884408e-05,
"loss": 0.533,
"num_tokens": 179917640.0,
"step": 235
},
{
"epoch": 0.3274215552523875,
"grad_norm": 0.37482443871129906,
"learning_rate": 4.987894361498399e-05,
"loss": 0.5457,
"num_tokens": 183746129.0,
"step": 240
},
{
"epoch": 0.33424283765347884,
"grad_norm": 0.40883725910500407,
"learning_rate": 4.9865441236710415e-05,
"loss": 0.5542,
"num_tokens": 187699370.0,
"step": 245
},
{
"epoch": 0.34106412005457026,
"grad_norm": 0.36840773537703164,
"learning_rate": 4.985122737355828e-05,
"loss": 0.5362,
"num_tokens": 191512142.0,
"step": 250
},
{
"epoch": 0.3478854024556617,
"grad_norm": 0.35196282145686897,
"learning_rate": 4.983630247769613e-05,
"loss": 0.5373,
"num_tokens": 195367749.0,
"step": 255
},
{
"epoch": 0.35470668485675305,
"grad_norm": 0.32453218633384984,
"learning_rate": 4.982066702391169e-05,
"loss": 0.5259,
"num_tokens": 199215371.0,
"step": 260
},
{
"epoch": 0.3615279672578445,
"grad_norm": 0.3333209805493323,
"learning_rate": 4.980432150959687e-05,
"loss": 0.5455,
"num_tokens": 202903048.0,
"step": 265
},
{
"epoch": 0.3683492496589359,
"grad_norm": 0.34300836131361134,
"learning_rate": 4.978726645473186e-05,
"loss": 0.5344,
"num_tokens": 206761213.0,
"step": 270
},
{
"epoch": 0.37517053206002726,
"grad_norm": 0.33687880661376,
"learning_rate": 4.976950240186857e-05,
"loss": 0.5302,
"num_tokens": 210585632.0,
"step": 275
},
{
"epoch": 0.3819918144611187,
"grad_norm": 0.3062190092375375,
"learning_rate": 4.975102991611348e-05,
"loss": 0.536,
"num_tokens": 214261738.0,
"step": 280
},
{
"epoch": 0.3888130968622101,
"grad_norm": 0.3228746457123477,
"learning_rate": 4.973184958510955e-05,
"loss": 0.5404,
"num_tokens": 218144024.0,
"step": 285
},
{
"epoch": 0.3956343792633015,
"grad_norm": 0.3198594107258337,
"learning_rate": 4.971196201901757e-05,
"loss": 0.5361,
"num_tokens": 222057295.0,
"step": 290
},
{
"epoch": 0.4024556616643929,
"grad_norm": 0.3994626681138991,
"learning_rate": 4.969136785049676e-05,
"loss": 0.5275,
"num_tokens": 225828573.0,
"step": 295
},
{
"epoch": 0.4092769440654843,
"grad_norm": 0.36849928732257725,
"learning_rate": 4.9670067734684625e-05,
"loss": 0.5272,
"num_tokens": 229710487.0,
"step": 300
},
{
"epoch": 0.41609822646657574,
"grad_norm": 0.3489875276180288,
"learning_rate": 4.9648062349176145e-05,
"loss": 0.5422,
"num_tokens": 233617525.0,
"step": 305
},
{
"epoch": 0.4229195088676671,
"grad_norm": 0.36792321807790457,
"learning_rate": 4.962535239400217e-05,
"loss": 0.5396,
"num_tokens": 237394471.0,
"step": 310
},
{
"epoch": 0.4297407912687585,
"grad_norm": 0.34720260466539804,
"learning_rate": 4.9601938591607175e-05,
"loss": 0.5338,
"num_tokens": 241114261.0,
"step": 315
},
{
"epoch": 0.43656207366984995,
"grad_norm": 0.3329495509148233,
"learning_rate": 4.9577821686826304e-05,
"loss": 0.5342,
"num_tokens": 244967987.0,
"step": 320
},
{
"epoch": 0.4433833560709413,
"grad_norm": 0.34562763504560073,
"learning_rate": 4.9553002446861634e-05,
"loss": 0.5406,
"num_tokens": 248751095.0,
"step": 325
},
{
"epoch": 0.45020463847203274,
"grad_norm": 0.385098545376304,
"learning_rate": 4.952748166125779e-05,
"loss": 0.528,
"num_tokens": 252607265.0,
"step": 330
},
{
"epoch": 0.45702592087312416,
"grad_norm": 0.32550400486163533,
"learning_rate": 4.950126014187683e-05,
"loss": 0.5344,
"num_tokens": 256417909.0,
"step": 335
},
{
"epoch": 0.4638472032742155,
"grad_norm": 0.3806963854916901,
"learning_rate": 4.9474338722872404e-05,
"loss": 0.5272,
"num_tokens": 260456429.0,
"step": 340
},
{
"epoch": 0.47066848567530695,
"grad_norm": 0.32980302955349,
"learning_rate": 4.9446718260663234e-05,
"loss": 0.5203,
"num_tokens": 264287905.0,
"step": 345
},
{
"epoch": 0.47748976807639837,
"grad_norm": 0.3603093816461743,
"learning_rate": 4.941839963390585e-05,
"loss": 0.529,
"num_tokens": 268098790.0,
"step": 350
},
{
"epoch": 0.4843110504774898,
"grad_norm": 0.3631235151389752,
"learning_rate": 4.9389383743466675e-05,
"loss": 0.5383,
"num_tokens": 271974065.0,
"step": 355
},
{
"epoch": 0.49113233287858116,
"grad_norm": 0.40273291547493417,
"learning_rate": 4.935967151239331e-05,
"loss": 0.5422,
"num_tokens": 275852045.0,
"step": 360
},
{
"epoch": 0.4979536152796726,
"grad_norm": 0.36255296270077164,
"learning_rate": 4.932926388588524e-05,
"loss": 0.5198,
"num_tokens": 279504484.0,
"step": 365
},
{
"epoch": 0.504774897680764,
"grad_norm": 0.3182608267836354,
"learning_rate": 4.92981618312637e-05,
"loss": 0.5217,
"num_tokens": 283461546.0,
"step": 370
},
{
"epoch": 0.5115961800818554,
"grad_norm": 0.36234286989341263,
"learning_rate": 4.9266366337940945e-05,
"loss": 0.53,
"num_tokens": 287371918.0,
"step": 375
},
{
"epoch": 0.5184174624829468,
"grad_norm": 0.4092201031412652,
"learning_rate": 4.923387841738875e-05,
"loss": 0.5162,
"num_tokens": 291057738.0,
"step": 380
},
{
"epoch": 0.5252387448840382,
"grad_norm": 0.30609433227409455,
"learning_rate": 4.920069910310625e-05,
"loss": 0.5255,
"num_tokens": 294881963.0,
"step": 385
},
{
"epoch": 0.5320600272851296,
"grad_norm": 0.3224125224406666,
"learning_rate": 4.9166829450587075e-05,
"loss": 0.5282,
"num_tokens": 298677895.0,
"step": 390
},
{
"epoch": 0.538881309686221,
"grad_norm": 0.2959811215364232,
"learning_rate": 4.9132270537285726e-05,
"loss": 0.512,
"num_tokens": 302387985.0,
"step": 395
},
{
"epoch": 0.5457025920873124,
"grad_norm": 20.354817642986546,
"learning_rate": 4.9097023462583345e-05,
"loss": 0.5424,
"num_tokens": 306130593.0,
"step": 400
},
{
"epoch": 0.5525238744884038,
"grad_norm": 0.42997615431371866,
"learning_rate": 4.906108934775272e-05,
"loss": 0.5344,
"num_tokens": 309952008.0,
"step": 405
},
{
"epoch": 0.5593451568894953,
"grad_norm": 0.43537708372363537,
"learning_rate": 4.902446933592261e-05,
"loss": 0.5181,
"num_tokens": 313725489.0,
"step": 410
},
{
"epoch": 0.5661664392905866,
"grad_norm": 0.3490812508662286,
"learning_rate": 4.8987164592041416e-05,
"loss": 0.5226,
"num_tokens": 317525148.0,
"step": 415
},
{
"epoch": 0.572987721691678,
"grad_norm": 0.32574253533741937,
"learning_rate": 4.894917630284007e-05,
"loss": 0.5223,
"num_tokens": 321410101.0,
"step": 420
},
{
"epoch": 0.5798090040927695,
"grad_norm": 0.3086970265131434,
"learning_rate": 4.891050567679433e-05,
"loss": 0.5174,
"num_tokens": 325102278.0,
"step": 425
},
{
"epoch": 0.5866302864938608,
"grad_norm": 0.3208378690386464,
"learning_rate": 4.88711539440863e-05,
"loss": 0.522,
"num_tokens": 328831071.0,
"step": 430
},
{
"epoch": 0.5934515688949522,
"grad_norm": 0.3106759301738243,
"learning_rate": 4.8831122356565323e-05,
"loss": 0.5116,
"num_tokens": 332767044.0,
"step": 435
},
{
"epoch": 0.6002728512960437,
"grad_norm": 0.3108281530425403,
"learning_rate": 4.8790412187708125e-05,
"loss": 0.509,
"num_tokens": 336595638.0,
"step": 440
},
{
"epoch": 0.607094133697135,
"grad_norm": 0.3147467882312625,
"learning_rate": 4.874902473257835e-05,
"loss": 0.5218,
"num_tokens": 340358925.0,
"step": 445
},
{
"epoch": 0.6139154160982264,
"grad_norm": 0.3366316381524695,
"learning_rate": 4.870696130778532e-05,
"loss": 0.5206,
"num_tokens": 344239058.0,
"step": 450
},
{
"epoch": 0.6207366984993179,
"grad_norm": 0.3171021318012896,
"learning_rate": 4.8664223251442154e-05,
"loss": 0.524,
"num_tokens": 348080585.0,
"step": 455
},
{
"epoch": 0.6275579809004093,
"grad_norm": 0.3178449068910906,
"learning_rate": 4.862081192312326e-05,
"loss": 0.5176,
"num_tokens": 351817695.0,
"step": 460
},
{
"epoch": 0.6343792633015006,
"grad_norm": 0.363976784318056,
"learning_rate": 4.8576728703820976e-05,
"loss": 0.5122,
"num_tokens": 355639183.0,
"step": 465
},
{
"epoch": 0.6412005457025921,
"grad_norm": 0.33984465023934035,
"learning_rate": 4.853197499590174e-05,
"loss": 0.5308,
"num_tokens": 359437581.0,
"step": 470
},
{
"epoch": 0.6480218281036835,
"grad_norm": 0.32388712017688437,
"learning_rate": 4.848655222306144e-05,
"loss": 0.5066,
"num_tokens": 363189983.0,
"step": 475
},
{
"epoch": 0.654843110504775,
"grad_norm": 0.29612685969086366,
"learning_rate": 4.844046183028009e-05,
"loss": 0.509,
"num_tokens": 366863209.0,
"step": 480
},
{
"epoch": 0.6616643929058663,
"grad_norm": 0.34029178906832475,
"learning_rate": 4.839370528377592e-05,
"loss": 0.5231,
"num_tokens": 370725860.0,
"step": 485
},
{
"epoch": 0.6684856753069577,
"grad_norm": 0.3103169185956682,
"learning_rate": 4.834628407095871e-05,
"loss": 0.5085,
"num_tokens": 374566515.0,
"step": 490
},
{
"epoch": 0.6753069577080492,
"grad_norm": 0.2800430805166771,
"learning_rate": 4.829819970038245e-05,
"loss": 0.5012,
"num_tokens": 378330489.0,
"step": 495
},
{
"epoch": 0.6821282401091405,
"grad_norm": 0.34884629801915895,
"learning_rate": 4.8249453701697385e-05,
"loss": 0.5059,
"num_tokens": 382103468.0,
"step": 500
},
{
"epoch": 0.6889495225102319,
"grad_norm": 0.3197637736232303,
"learning_rate": 4.820004762560134e-05,
"loss": 0.5144,
"num_tokens": 385837297.0,
"step": 505
},
{
"epoch": 0.6957708049113234,
"grad_norm": 0.3936823154590106,
"learning_rate": 4.814998304379036e-05,
"loss": 0.5117,
"num_tokens": 389586680.0,
"step": 510
},
{
"epoch": 0.7025920873124147,
"grad_norm": 0.3172798834226314,
"learning_rate": 4.8099261548908773e-05,
"loss": 0.5162,
"num_tokens": 393428760.0,
"step": 515
},
{
"epoch": 0.7094133697135061,
"grad_norm": 0.3160858881655075,
"learning_rate": 4.8047884754498495e-05,
"loss": 0.5279,
"num_tokens": 397158700.0,
"step": 520
},
{
"epoch": 0.7162346521145976,
"grad_norm": 0.37082219207657685,
"learning_rate": 4.799585429494768e-05,
"loss": 0.5123,
"num_tokens": 400923938.0,
"step": 525
},
{
"epoch": 0.723055934515689,
"grad_norm": 0.31992077884834463,
"learning_rate": 4.794317182543875e-05,
"loss": 0.506,
"num_tokens": 404685655.0,
"step": 530
},
{
"epoch": 0.7298772169167803,
"grad_norm": 0.29913070231582056,
"learning_rate": 4.7889839021895724e-05,
"loss": 0.5117,
"num_tokens": 408582799.0,
"step": 535
},
{
"epoch": 0.7366984993178718,
"grad_norm": 0.28227205669712635,
"learning_rate": 4.783585758093095e-05,
"loss": 0.517,
"num_tokens": 412386009.0,
"step": 540
},
{
"epoch": 0.7435197817189632,
"grad_norm": 0.3067668406718358,
"learning_rate": 4.778122921979104e-05,
"loss": 0.5201,
"num_tokens": 416372039.0,
"step": 545
},
{
"epoch": 0.7503410641200545,
"grad_norm": 0.3186838734464595,
"learning_rate": 4.772595567630237e-05,
"loss": 0.5046,
"num_tokens": 420051975.0,
"step": 550
},
{
"epoch": 0.757162346521146,
"grad_norm": 0.34804937870860536,
"learning_rate": 4.7670038708815676e-05,
"loss": 0.5051,
"num_tokens": 423685709.0,
"step": 555
},
{
"epoch": 0.7639836289222374,
"grad_norm": 0.33506793399336343,
"learning_rate": 4.761348009615018e-05,
"loss": 0.5011,
"num_tokens": 427405904.0,
"step": 560
},
{
"epoch": 0.7708049113233287,
"grad_norm": 0.34443735099415185,
"learning_rate": 4.7556281637536985e-05,
"loss": 0.5011,
"num_tokens": 431288378.0,
"step": 565
},
{
"epoch": 0.7776261937244202,
"grad_norm": 0.32414846184121815,
"learning_rate": 4.7498445152561864e-05,
"loss": 0.5042,
"num_tokens": 435077995.0,
"step": 570
},
{
"epoch": 0.7844474761255116,
"grad_norm": 0.3176009794001488,
"learning_rate": 4.743997248110733e-05,
"loss": 0.5041,
"num_tokens": 439006864.0,
"step": 575
},
{
"epoch": 0.791268758526603,
"grad_norm": 0.3149518360403247,
"learning_rate": 4.738086548329416e-05,
"loss": 0.5192,
"num_tokens": 443045688.0,
"step": 580
},
{
"epoch": 0.7980900409276944,
"grad_norm": 0.3207146432446857,
"learning_rate": 4.732112603942216e-05,
"loss": 0.5065,
"num_tokens": 446852018.0,
"step": 585
},
{
"epoch": 0.8049113233287858,
"grad_norm": 0.2888771649434731,
"learning_rate": 4.7260756049910406e-05,
"loss": 0.5203,
"num_tokens": 450687287.0,
"step": 590
},
{
"epoch": 0.8117326057298773,
"grad_norm": 0.3057129065721774,
"learning_rate": 4.7199757435236744e-05,
"loss": 0.5057,
"num_tokens": 454483896.0,
"step": 595
},
{
"epoch": 0.8185538881309686,
"grad_norm": 0.28756788730889826,
"learning_rate": 4.713813213587674e-05,
"loss": 0.5036,
"num_tokens": 458325861.0,
"step": 600
},
{
"epoch": 0.82537517053206,
"grad_norm": 0.30014832786980916,
"learning_rate": 4.70758821122419e-05,
"loss": 0.5092,
"num_tokens": 462055868.0,
"step": 605
},
{
"epoch": 0.8321964529331515,
"grad_norm": 0.3111436835827166,
"learning_rate": 4.701300934461736e-05,
"loss": 0.5032,
"num_tokens": 465973876.0,
"step": 610
},
{
"epoch": 0.8390177353342428,
"grad_norm": 0.32758808149750535,
"learning_rate": 4.6949515833098824e-05,
"loss": 0.5004,
"num_tokens": 469791498.0,
"step": 615
},
{
"epoch": 0.8458390177353342,
"grad_norm": 0.27029086656552975,
"learning_rate": 4.688540359752902e-05,
"loss": 0.4979,
"num_tokens": 473570581.0,
"step": 620
},
{
"epoch": 0.8526603001364257,
"grad_norm": 0.29349068259319094,
"learning_rate": 4.6820674677433376e-05,
"loss": 0.5139,
"num_tokens": 477401353.0,
"step": 625
},
{
"epoch": 0.859481582537517,
"grad_norm": 0.2866715279919294,
"learning_rate": 4.675533113195515e-05,
"loss": 0.5129,
"num_tokens": 481348892.0,
"step": 630
},
{
"epoch": 0.8663028649386084,
"grad_norm": 0.2864693588210585,
"learning_rate": 4.6689375039789954e-05,
"loss": 0.5108,
"num_tokens": 485171910.0,
"step": 635
},
{
"epoch": 0.8731241473396999,
"grad_norm": 0.27626775639483725,
"learning_rate": 4.6622808499119625e-05,
"loss": 0.4956,
"num_tokens": 489059548.0,
"step": 640
},
{
"epoch": 0.8799454297407913,
"grad_norm": 0.2946930273512276,
"learning_rate": 4.655563362754543e-05,
"loss": 0.4989,
"num_tokens": 492893029.0,
"step": 645
},
{
"epoch": 0.8867667121418826,
"grad_norm": 0.3238619095461243,
"learning_rate": 4.648785256202076e-05,
"loss": 0.5064,
"num_tokens": 496905674.0,
"step": 650
},
{
"epoch": 0.8935879945429741,
"grad_norm": 0.27166442143334074,
"learning_rate": 4.6419467458783125e-05,
"loss": 0.5012,
"num_tokens": 500864542.0,
"step": 655
},
{
"epoch": 0.9004092769440655,
"grad_norm": 0.35133448756025343,
"learning_rate": 4.635048049328555e-05,
"loss": 0.505,
"num_tokens": 504810366.0,
"step": 660
},
{
"epoch": 0.9072305593451568,
"grad_norm": 0.35555108957660303,
"learning_rate": 4.628089386012737e-05,
"loss": 0.5011,
"num_tokens": 508607232.0,
"step": 665
},
{
"epoch": 0.9140518417462483,
"grad_norm": 0.32064397314940585,
"learning_rate": 4.621070977298446e-05,
"loss": 0.508,
"num_tokens": 512474084.0,
"step": 670
},
{
"epoch": 0.9208731241473397,
"grad_norm": 0.28781771682272567,
"learning_rate": 4.613993046453875e-05,
"loss": 0.4986,
"num_tokens": 516216104.0,
"step": 675
},
{
"epoch": 0.927694406548431,
"grad_norm": 0.278681463872458,
"learning_rate": 4.606855818640724e-05,
"loss": 0.5079,
"num_tokens": 519966345.0,
"step": 680
},
{
"epoch": 0.9345156889495225,
"grad_norm": 0.2777129164143007,
"learning_rate": 4.5996595209070356e-05,
"loss": 0.4934,
"num_tokens": 523793837.0,
"step": 685
},
{
"epoch": 0.9413369713506139,
"grad_norm": 0.29049878841827675,
"learning_rate": 4.5924043821799734e-05,
"loss": 0.5069,
"num_tokens": 527666864.0,
"step": 690
},
{
"epoch": 0.9481582537517054,
"grad_norm": 0.30785475125106526,
"learning_rate": 4.585090633258539e-05,
"loss": 0.5042,
"num_tokens": 531466359.0,
"step": 695
},
{
"epoch": 0.9549795361527967,
"grad_norm": 0.29390951682226474,
"learning_rate": 4.577718506806228e-05,
"loss": 0.5066,
"num_tokens": 535253217.0,
"step": 700
},
{
"epoch": 0.9618008185538881,
"grad_norm": 0.33095337694320254,
"learning_rate": 4.570288237343632e-05,
"loss": 0.4904,
"num_tokens": 539137436.0,
"step": 705
},
{
"epoch": 0.9686221009549796,
"grad_norm": 0.2850378778466875,
"learning_rate": 4.562800061240975e-05,
"loss": 0.5009,
"num_tokens": 542824098.0,
"step": 710
},
{
"epoch": 0.975443383356071,
"grad_norm": 0.30760761583081375,
"learning_rate": 4.555254216710597e-05,
"loss": 0.5119,
"num_tokens": 546578047.0,
"step": 715
},
{
"epoch": 0.9822646657571623,
"grad_norm": 0.27149624139361506,
"learning_rate": 4.5476509437993726e-05,
"loss": 0.4976,
"num_tokens": 550377811.0,
"step": 720
},
{
"epoch": 0.9890859481582538,
"grad_norm": 0.30828520368635326,
"learning_rate": 4.5399904843810756e-05,
"loss": 0.4941,
"num_tokens": 554224024.0,
"step": 725
},
{
"epoch": 0.9959072305593452,
"grad_norm": 0.3464146216808222,
"learning_rate": 4.532273082148689e-05,
"loss": 0.5015,
"num_tokens": 558001366.0,
"step": 730
},
{
"epoch": 1.0027285129604366,
"grad_norm": 0.3490549767071155,
"learning_rate": 4.5244989826066444e-05,
"loss": 0.4889,
"num_tokens": 561842493.0,
"step": 735
},
{
"epoch": 1.009549795361528,
"grad_norm": 0.3148778232856733,
"learning_rate": 4.51666843306302e-05,
"loss": 0.4679,
"num_tokens": 565569218.0,
"step": 740
},
{
"epoch": 1.0163710777626194,
"grad_norm": 0.3731178499173407,
"learning_rate": 4.5087816826216695e-05,
"loss": 0.4764,
"num_tokens": 569401220.0,
"step": 745
},
{
"epoch": 1.0231923601637107,
"grad_norm": 0.34226324439808514,
"learning_rate": 4.500838982174297e-05,
"loss": 0.4766,
"num_tokens": 573382420.0,
"step": 750
},
{
"epoch": 1.030013642564802,
"grad_norm": 0.2805013542076892,
"learning_rate": 4.492840584392478e-05,
"loss": 0.4671,
"num_tokens": 577211964.0,
"step": 755
},
{
"epoch": 1.0368349249658937,
"grad_norm": 0.3065721441536721,
"learning_rate": 4.484786743719619e-05,
"loss": 0.4689,
"num_tokens": 581185526.0,
"step": 760
},
{
"epoch": 1.043656207366985,
"grad_norm": 0.2936712359839026,
"learning_rate": 4.4766777163628656e-05,
"loss": 0.4712,
"num_tokens": 584961788.0,
"step": 765
},
{
"epoch": 1.0504774897680764,
"grad_norm": 0.2764462935192823,
"learning_rate": 4.468513760284952e-05,
"loss": 0.4797,
"num_tokens": 588808530.0,
"step": 770
},
{
"epoch": 1.0572987721691678,
"grad_norm": 0.28592666412762696,
"learning_rate": 4.460295135195991e-05,
"loss": 0.4599,
"num_tokens": 592600556.0,
"step": 775
},
{
"epoch": 1.0641200545702592,
"grad_norm": 0.29631998771015455,
"learning_rate": 4.452022102545217e-05,
"loss": 0.4673,
"num_tokens": 596480393.0,
"step": 780
},
{
"epoch": 1.0709413369713505,
"grad_norm": 0.26505805501945173,
"learning_rate": 4.443694925512665e-05,
"loss": 0.4717,
"num_tokens": 600450795.0,
"step": 785
},
{
"epoch": 1.077762619372442,
"grad_norm": 0.28531533545747306,
"learning_rate": 4.4353138690008026e-05,
"loss": 0.474,
"num_tokens": 604180876.0,
"step": 790
},
{
"epoch": 1.0845839017735335,
"grad_norm": 0.2780327870404011,
"learning_rate": 4.426879199626098e-05,
"loss": 0.467,
"num_tokens": 607934610.0,
"step": 795
},
{
"epoch": 1.0914051841746248,
"grad_norm": 0.27715466802943606,
"learning_rate": 4.418391185710543e-05,
"loss": 0.4695,
"num_tokens": 611651241.0,
"step": 800
},
{
"epoch": 1.0982264665757162,
"grad_norm": 0.31591464715547607,
"learning_rate": 4.409850097273113e-05,
"loss": 0.4675,
"num_tokens": 615434308.0,
"step": 805
},
{
"epoch": 1.1050477489768076,
"grad_norm": 0.26233476106629194,
"learning_rate": 4.401256206021181e-05,
"loss": 0.4657,
"num_tokens": 619368833.0,
"step": 810
},
{
"epoch": 1.111869031377899,
"grad_norm": 0.27576262263266726,
"learning_rate": 4.39260978534187e-05,
"loss": 0.4639,
"num_tokens": 623126366.0,
"step": 815
},
{
"epoch": 1.1186903137789905,
"grad_norm": 0.2746654826395091,
"learning_rate": 4.383911110293363e-05,
"loss": 0.4765,
"num_tokens": 627107529.0,
"step": 820
},
{
"epoch": 1.125511596180082,
"grad_norm": 0.24775871464017238,
"learning_rate": 4.375160457596144e-05,
"loss": 0.4651,
"num_tokens": 630903442.0,
"step": 825
},
{
"epoch": 1.1323328785811733,
"grad_norm": 0.24655144689998953,
"learning_rate": 4.3663581056242e-05,
"loss": 0.4606,
"num_tokens": 634517705.0,
"step": 830
},
{
"epoch": 1.1391541609822646,
"grad_norm": 0.27895983869139385,
"learning_rate": 4.357504334396168e-05,
"loss": 0.4685,
"num_tokens": 638096091.0,
"step": 835
},
{
"epoch": 1.145975443383356,
"grad_norm": 0.31816753414625654,
"learning_rate": 4.348599425566422e-05,
"loss": 0.4684,
"num_tokens": 642002612.0,
"step": 840
},
{
"epoch": 1.1527967257844476,
"grad_norm": 0.26724451611429256,
"learning_rate": 4.3396436624161125e-05,
"loss": 0.4735,
"num_tokens": 645977691.0,
"step": 845
},
{
"epoch": 1.159618008185539,
"grad_norm": 0.32927796404239074,
"learning_rate": 4.330637329844162e-05,
"loss": 0.4667,
"num_tokens": 649827816.0,
"step": 850
},
{
"epoch": 1.1664392905866303,
"grad_norm": 0.34404133032544415,
"learning_rate": 4.321580714358193e-05,
"loss": 0.4765,
"num_tokens": 653510445.0,
"step": 855
},
{
"epoch": 1.1732605729877217,
"grad_norm": 0.27768133752278,
"learning_rate": 4.3124741040654217e-05,
"loss": 0.4715,
"num_tokens": 657384998.0,
"step": 860
},
{
"epoch": 1.180081855388813,
"grad_norm": 0.25025345440677405,
"learning_rate": 4.3033177886634845e-05,
"loss": 0.4665,
"num_tokens": 661293472.0,
"step": 865
},
{
"epoch": 1.1869031377899044,
"grad_norm": 0.30457046427971657,
"learning_rate": 4.2941120594312315e-05,
"loss": 0.4748,
"num_tokens": 665091673.0,
"step": 870
},
{
"epoch": 1.1937244201909958,
"grad_norm": 0.23877952147739753,
"learning_rate": 4.2848572092194513e-05,
"loss": 0.4728,
"num_tokens": 668922968.0,
"step": 875
},
{
"epoch": 1.2005457025920874,
"grad_norm": 0.2520844593923701,
"learning_rate": 4.275553532441562e-05,
"loss": 0.4644,
"num_tokens": 672846634.0,
"step": 880
},
{
"epoch": 1.2073669849931787,
"grad_norm": 0.3147581673447297,
"learning_rate": 4.266201325064242e-05,
"loss": 0.4627,
"num_tokens": 676655826.0,
"step": 885
},
{
"epoch": 1.21418826739427,
"grad_norm": 0.24800806438269005,
"learning_rate": 4.256800884598013e-05,
"loss": 0.4786,
"num_tokens": 680322801.0,
"step": 890
},
{
"epoch": 1.2210095497953615,
"grad_norm": 2.4907630647282,
"learning_rate": 4.2473525100877823e-05,
"loss": 0.4739,
"num_tokens": 684119272.0,
"step": 895
},
{
"epoch": 1.2278308321964528,
"grad_norm": 0.30229873985443184,
"learning_rate": 4.23785650210332e-05,
"loss": 0.467,
"num_tokens": 687991406.0,
"step": 900
},
{
"epoch": 1.2346521145975444,
"grad_norm": 0.26793479462106135,
"learning_rate": 4.228313162729706e-05,
"loss": 0.4768,
"num_tokens": 691695054.0,
"step": 905
},
{
"epoch": 1.2414733969986358,
"grad_norm": 0.2849637202449397,
"learning_rate": 4.218722795557717e-05,
"loss": 0.4681,
"num_tokens": 695621817.0,
"step": 910
},
{
"epoch": 1.2482946793997272,
"grad_norm": 0.25879554486736855,
"learning_rate": 4.2090857056741676e-05,
"loss": 0.4553,
"num_tokens": 699480008.0,
"step": 915
},
{
"epoch": 1.2551159618008185,
"grad_norm": 0.2878790201694137,
"learning_rate": 4.199402199652205e-05,
"loss": 0.4502,
"num_tokens": 703195130.0,
"step": 920
},
{
"epoch": 1.26193724420191,
"grad_norm": 0.30195551712906077,
"learning_rate": 4.189672585541558e-05,
"loss": 0.4686,
"num_tokens": 707113602.0,
"step": 925
},
{
"epoch": 1.2687585266030013,
"grad_norm": 0.2604967174312856,
"learning_rate": 4.1798971728587375e-05,
"loss": 0.4659,
"num_tokens": 710925367.0,
"step": 930
},
{
"epoch": 1.2755798090040928,
"grad_norm": 0.2781371955226694,
"learning_rate": 4.170076272577186e-05,
"loss": 0.464,
"num_tokens": 714722146.0,
"step": 935
},
{
"epoch": 1.2824010914051842,
"grad_norm": 0.27482737768170556,
"learning_rate": 4.160210197117392e-05,
"loss": 0.4608,
"num_tokens": 718555460.0,
"step": 940
},
{
"epoch": 1.2892223738062756,
"grad_norm": 0.31633385102337147,
"learning_rate": 4.150299260336947e-05,
"loss": 0.4638,
"num_tokens": 722379411.0,
"step": 945
},
{
"epoch": 1.296043656207367,
"grad_norm": 0.2982673873153893,
"learning_rate": 4.14034377752056e-05,
"loss": 0.4732,
"num_tokens": 726285777.0,
"step": 950
},
{
"epoch": 1.3028649386084583,
"grad_norm": 0.2770667418801685,
"learning_rate": 4.130344065370031e-05,
"loss": 0.4786,
"num_tokens": 730064545.0,
"step": 955
},
{
"epoch": 1.30968622100955,
"grad_norm": 0.26149171089033296,
"learning_rate": 4.120300441994172e-05,
"loss": 0.462,
"num_tokens": 733841294.0,
"step": 960
},
{
"epoch": 1.3165075034106413,
"grad_norm": 0.31050190782640036,
"learning_rate": 4.110213226898695e-05,
"loss": 0.4732,
"num_tokens": 737760285.0,
"step": 965
},
{
"epoch": 1.3233287858117326,
"grad_norm": 0.27836619089478065,
"learning_rate": 4.100082740976036e-05,
"loss": 0.4727,
"num_tokens": 741596226.0,
"step": 970
},
{
"epoch": 1.330150068212824,
"grad_norm": 0.2579696461553874,
"learning_rate": 4.08990930649516e-05,
"loss": 0.4649,
"num_tokens": 745437763.0,
"step": 975
},
{
"epoch": 1.3369713506139154,
"grad_norm": 0.2740627527065771,
"learning_rate": 4.079693247091302e-05,
"loss": 0.4645,
"num_tokens": 749302808.0,
"step": 980
},
{
"epoch": 1.3437926330150067,
"grad_norm": 0.2790716204058442,
"learning_rate": 4.069434887755667e-05,
"loss": 0.4689,
"num_tokens": 753127588.0,
"step": 985
},
{
"epoch": 1.350613915416098,
"grad_norm": 0.264519833216715,
"learning_rate": 4.059134554825104e-05,
"loss": 0.4686,
"num_tokens": 756971687.0,
"step": 990
},
{
"epoch": 1.3574351978171897,
"grad_norm": 0.28990937969651126,
"learning_rate": 4.048792575971713e-05,
"loss": 0.4598,
"num_tokens": 760697602.0,
"step": 995
},
{
"epoch": 1.364256480218281,
"grad_norm": 0.24357301496138234,
"learning_rate": 4.038409280192427e-05,
"loss": 0.4658,
"num_tokens": 764490647.0,
"step": 1000
},
{
"epoch": 1.3710777626193724,
"grad_norm": 0.26005750254814686,
"learning_rate": 4.0279849977985434e-05,
"loss": 0.4653,
"num_tokens": 768330897.0,
"step": 1005
},
{
"epoch": 1.3778990450204638,
"grad_norm": 0.2593892684840051,
"learning_rate": 4.01752006040522e-05,
"loss": 0.4665,
"num_tokens": 772161013.0,
"step": 1010
},
{
"epoch": 1.3847203274215554,
"grad_norm": 0.2554846973907493,
"learning_rate": 4.007014800920921e-05,
"loss": 0.4733,
"num_tokens": 776138853.0,
"step": 1015
},
{
"epoch": 1.3915416098226467,
"grad_norm": 0.2841717548251602,
"learning_rate": 3.9964695535368306e-05,
"loss": 0.4608,
"num_tokens": 779984056.0,
"step": 1020
},
{
"epoch": 1.398362892223738,
"grad_norm": 0.25032452540370875,
"learning_rate": 3.985884653716218e-05,
"loss": 0.4627,
"num_tokens": 783898134.0,
"step": 1025
},
{
"epoch": 1.4051841746248295,
"grad_norm": 0.29802875985167504,
"learning_rate": 3.9752604381837676e-05,
"loss": 0.4645,
"num_tokens": 787897711.0,
"step": 1030
},
{
"epoch": 1.4120054570259208,
"grad_norm": 0.2706705220606769,
"learning_rate": 3.96459724491487e-05,
"loss": 0.4704,
"num_tokens": 791719537.0,
"step": 1035
},
{
"epoch": 1.4188267394270122,
"grad_norm": 0.2840824587208515,
"learning_rate": 3.953895413124866e-05,
"loss": 0.465,
"num_tokens": 795317290.0,
"step": 1040
},
{
"epoch": 1.4256480218281036,
"grad_norm": 0.28504683741140885,
"learning_rate": 3.9431552832582544e-05,
"loss": 0.4645,
"num_tokens": 799156629.0,
"step": 1045
},
{
"epoch": 1.4324693042291952,
"grad_norm": 0.23464120441174716,
"learning_rate": 3.932377196977871e-05,
"loss": 0.4552,
"num_tokens": 803069636.0,
"step": 1050
},
{
"epoch": 1.4392905866302865,
"grad_norm": 0.27338514780251155,
"learning_rate": 3.9215614971540064e-05,
"loss": 0.4655,
"num_tokens": 807046765.0,
"step": 1055
},
{
"epoch": 1.446111869031378,
"grad_norm": 0.26309603739381,
"learning_rate": 3.9107085278535105e-05,
"loss": 0.4643,
"num_tokens": 810878242.0,
"step": 1060
},
{
"epoch": 1.4529331514324693,
"grad_norm": 0.28263057841312744,
"learning_rate": 3.8998186343288403e-05,
"loss": 0.464,
"num_tokens": 814684093.0,
"step": 1065
},
{
"epoch": 1.4597544338335606,
"grad_norm": 0.27658909194801223,
"learning_rate": 3.888892163007079e-05,
"loss": 0.4612,
"num_tokens": 818487158.0,
"step": 1070
},
{
"epoch": 1.4665757162346522,
"grad_norm": 0.31309736931041143,
"learning_rate": 3.877929461478915e-05,
"loss": 0.4612,
"num_tokens": 822359745.0,
"step": 1075
},
{
"epoch": 1.4733969986357436,
"grad_norm": 0.27357218874453054,
"learning_rate": 3.8669308784875855e-05,
"loss": 0.4593,
"num_tokens": 826031854.0,
"step": 1080
},
{
"epoch": 1.480218281036835,
"grad_norm": 0.2702103817079497,
"learning_rate": 3.8558967639177795e-05,
"loss": 0.4652,
"num_tokens": 829831729.0,
"step": 1085
},
{
"epoch": 1.4870395634379263,
"grad_norm": 0.28660991886409803,
"learning_rate": 3.844827468784513e-05,
"loss": 0.4635,
"num_tokens": 833559495.0,
"step": 1090
},
{
"epoch": 1.4938608458390177,
"grad_norm": 0.2831089788366953,
"learning_rate": 3.8337233452219554e-05,
"loss": 0.4742,
"num_tokens": 837452717.0,
"step": 1095
},
{
"epoch": 1.500682128240109,
"grad_norm": 0.2726925683985379,
"learning_rate": 3.822584746472232e-05,
"loss": 0.4685,
"num_tokens": 841494894.0,
"step": 1100
},
{
"epoch": 1.5075034106412004,
"grad_norm": 0.2945202746465597,
"learning_rate": 3.811412026874187e-05,
"loss": 0.4588,
"num_tokens": 845244637.0,
"step": 1105
},
{
"epoch": 1.514324693042292,
"grad_norm": 0.2947105252840782,
"learning_rate": 3.800205541852109e-05,
"loss": 0.4648,
"num_tokens": 849040114.0,
"step": 1110
},
{
"epoch": 1.5211459754433834,
"grad_norm": 0.26767448377121167,
"learning_rate": 3.788965647904426e-05,
"loss": 0.4639,
"num_tokens": 852855908.0,
"step": 1115
},
{
"epoch": 1.5279672578444747,
"grad_norm": 0.2728941868971984,
"learning_rate": 3.777692702592363e-05,
"loss": 0.4633,
"num_tokens": 856606710.0,
"step": 1120
},
{
"epoch": 1.5347885402455663,
"grad_norm": 0.3161592951745081,
"learning_rate": 3.76638706452857e-05,
"loss": 0.4717,
"num_tokens": 860530719.0,
"step": 1125
},
{
"epoch": 1.5416098226466577,
"grad_norm": 0.30392644830663357,
"learning_rate": 3.755049093365709e-05,
"loss": 0.4624,
"num_tokens": 864403759.0,
"step": 1130
},
{
"epoch": 1.548431105047749,
"grad_norm": 0.27628995163838427,
"learning_rate": 3.74367914978502e-05,
"loss": 0.4668,
"num_tokens": 868277615.0,
"step": 1135
},
{
"epoch": 1.5552523874488404,
"grad_norm": 0.24659333313063722,
"learning_rate": 3.73227759548484e-05,
"loss": 0.4762,
"num_tokens": 872188105.0,
"step": 1140
},
{
"epoch": 1.5620736698499318,
"grad_norm": 0.2541498419510143,
"learning_rate": 3.7208447931691034e-05,
"loss": 0.4656,
"num_tokens": 876046118.0,
"step": 1145
},
{
"epoch": 1.5688949522510232,
"grad_norm": 0.2483276804595362,
"learning_rate": 3.7093811065357934e-05,
"loss": 0.4646,
"num_tokens": 879952254.0,
"step": 1150
},
{
"epoch": 1.5757162346521145,
"grad_norm": 0.2736551066476727,
"learning_rate": 3.6978869002653884e-05,
"loss": 0.4684,
"num_tokens": 883721474.0,
"step": 1155
},
{
"epoch": 1.5825375170532059,
"grad_norm": 0.2606556505747233,
"learning_rate": 3.6863625400092407e-05,
"loss": 0.4849,
"num_tokens": 887512011.0,
"step": 1160
},
{
"epoch": 1.5893587994542973,
"grad_norm": 0.26523699927966526,
"learning_rate": 3.674808392377964e-05,
"loss": 0.456,
"num_tokens": 891332828.0,
"step": 1165
},
{
"epoch": 1.5961800818553888,
"grad_norm": 0.2413639875443443,
"learning_rate": 3.663224824929758e-05,
"loss": 0.4602,
"num_tokens": 895212128.0,
"step": 1170
},
{
"epoch": 1.6030013642564802,
"grad_norm": 0.22966581710695846,
"learning_rate": 3.6516122061587184e-05,
"loss": 0.461,
"num_tokens": 899092598.0,
"step": 1175
},
{
"epoch": 1.6098226466575716,
"grad_norm": 0.23035517437667388,
"learning_rate": 3.639970905483119e-05,
"loss": 0.4652,
"num_tokens": 902821411.0,
"step": 1180
},
{
"epoch": 1.6166439290586632,
"grad_norm": 0.25105961460570336,
"learning_rate": 3.628301293233653e-05,
"loss": 0.4631,
"num_tokens": 906650186.0,
"step": 1185
},
{
"epoch": 1.6234652114597545,
"grad_norm": 0.2671204882477136,
"learning_rate": 3.6166037406416586e-05,
"loss": 0.4667,
"num_tokens": 910561700.0,
"step": 1190
},
{
"epoch": 1.630286493860846,
"grad_norm": 0.24391887686783595,
"learning_rate": 3.604878619827306e-05,
"loss": 0.46,
"num_tokens": 914471979.0,
"step": 1195
},
{
"epoch": 1.6371077762619373,
"grad_norm": 0.24606602365286445,
"learning_rate": 3.593126303787758e-05,
"loss": 0.4628,
"num_tokens": 918407451.0,
"step": 1200
},
{
"epoch": 1.6439290586630286,
"grad_norm": 0.24488025231455127,
"learning_rate": 3.5813471663853086e-05,
"loss": 0.4527,
"num_tokens": 922149664.0,
"step": 1205
},
{
"epoch": 1.65075034106412,
"grad_norm": 0.29502679494434914,
"learning_rate": 3.569541582335487e-05,
"loss": 0.4651,
"num_tokens": 925906310.0,
"step": 1210
},
{
"epoch": 1.6575716234652114,
"grad_norm": 0.29080410988093525,
"learning_rate": 3.557709927195137e-05,
"loss": 0.4579,
"num_tokens": 929886517.0,
"step": 1215
},
{
"epoch": 1.6643929058663027,
"grad_norm": 0.27995909033633837,
"learning_rate": 3.545852577350472e-05,
"loss": 0.4504,
"num_tokens": 933572845.0,
"step": 1220
},
{
"epoch": 1.6712141882673943,
"grad_norm": 0.24168573385650868,
"learning_rate": 3.5339699100051e-05,
"loss": 0.4545,
"num_tokens": 937133457.0,
"step": 1225
},
{
"epoch": 1.6780354706684857,
"grad_norm": 0.25669243958426285,
"learning_rate": 3.522062303168021e-05,
"loss": 0.4575,
"num_tokens": 941038842.0,
"step": 1230
},
{
"epoch": 1.684856753069577,
"grad_norm": 0.2992047414518027,
"learning_rate": 3.510130135641608e-05,
"loss": 0.4656,
"num_tokens": 944824346.0,
"step": 1235
},
{
"epoch": 1.6916780354706686,
"grad_norm": 0.25421993454486885,
"learning_rate": 3.498173787009555e-05,
"loss": 0.4554,
"num_tokens": 948787821.0,
"step": 1240
},
{
"epoch": 1.69849931787176,
"grad_norm": 0.28639702078451146,
"learning_rate": 3.4861936376247994e-05,
"loss": 0.4624,
"num_tokens": 952546196.0,
"step": 1245
},
{
"epoch": 1.7053206002728514,
"grad_norm": 0.2858597251246765,
"learning_rate": 3.474190068597419e-05,
"loss": 0.4622,
"num_tokens": 956413312.0,
"step": 1250
},
{
"epoch": 1.7121418826739427,
"grad_norm": 0.25432985447412165,
"learning_rate": 3.4621634617825195e-05,
"loss": 0.4563,
"num_tokens": 960097484.0,
"step": 1255
},
{
"epoch": 1.718963165075034,
"grad_norm": 0.24911852049104072,
"learning_rate": 3.450114199768076e-05,
"loss": 0.4529,
"num_tokens": 963903477.0,
"step": 1260
},
{
"epoch": 1.7257844474761255,
"grad_norm": 0.2604051041082986,
"learning_rate": 3.4380426658627644e-05,
"loss": 0.4583,
"num_tokens": 967716958.0,
"step": 1265
},
{
"epoch": 1.7326057298772168,
"grad_norm": 0.25401591712991045,
"learning_rate": 3.425949244083775e-05,
"loss": 0.4423,
"num_tokens": 971436672.0,
"step": 1270
},
{
"epoch": 1.7394270122783082,
"grad_norm": 0.2400128552050296,
"learning_rate": 3.413834319144587e-05,
"loss": 0.4689,
"num_tokens": 975282872.0,
"step": 1275
},
{
"epoch": 1.7462482946793996,
"grad_norm": 0.24326085857094784,
"learning_rate": 3.401698276442732e-05,
"loss": 0.4616,
"num_tokens": 979113101.0,
"step": 1280
},
{
"epoch": 1.7530695770804912,
"grad_norm": 0.23363258646620005,
"learning_rate": 3.389541502047541e-05,
"loss": 0.4563,
"num_tokens": 983117582.0,
"step": 1285
},
{
"epoch": 1.7598908594815825,
"grad_norm": 0.25465374186874845,
"learning_rate": 3.377364382687852e-05,
"loss": 0.4673,
"num_tokens": 986965249.0,
"step": 1290
},
{
"epoch": 1.766712141882674,
"grad_norm": 0.24340135543067581,
"learning_rate": 3.365167305739717e-05,
"loss": 0.4603,
"num_tokens": 990643231.0,
"step": 1295
},
{
"epoch": 1.7735334242837655,
"grad_norm": 0.26565317786588566,
"learning_rate": 3.3529506592140724e-05,
"loss": 0.4518,
"num_tokens": 994547720.0,
"step": 1300
},
{
"epoch": 1.7803547066848568,
"grad_norm": 0.2430267355793424,
"learning_rate": 3.3407148317443986e-05,
"loss": 0.4542,
"num_tokens": 998141405.0,
"step": 1305
},
{
"epoch": 1.7871759890859482,
"grad_norm": 0.2500900864567087,
"learning_rate": 3.328460212574356e-05,
"loss": 0.4517,
"num_tokens": 1001828735.0,
"step": 1310
},
{
"epoch": 1.7939972714870396,
"grad_norm": 0.23416160552607826,
"learning_rate": 3.3161871915454045e-05,
"loss": 0.4649,
"num_tokens": 1005562335.0,
"step": 1315
},
{
"epoch": 1.800818553888131,
"grad_norm": 0.26708075086752775,
"learning_rate": 3.303896159084397e-05,
"loss": 0.4578,
"num_tokens": 1009141104.0,
"step": 1320
},
{
"epoch": 1.8076398362892223,
"grad_norm": 0.26198497586871977,
"learning_rate": 3.291587506191166e-05,
"loss": 0.4632,
"num_tokens": 1012723565.0,
"step": 1325
},
{
"epoch": 1.8144611186903137,
"grad_norm": 0.24433603104500154,
"learning_rate": 3.2792616244260774e-05,
"loss": 0.4545,
"num_tokens": 1016477651.0,
"step": 1330
},
{
"epoch": 1.821282401091405,
"grad_norm": 0.22663713003772984,
"learning_rate": 3.266918905897583e-05,
"loss": 0.4569,
"num_tokens": 1020301965.0,
"step": 1335
},
{
"epoch": 1.8281036834924966,
"grad_norm": 0.21349710127379007,
"learning_rate": 3.254559743249741e-05,
"loss": 0.4475,
"num_tokens": 1024157287.0,
"step": 1340
},
{
"epoch": 1.834924965893588,
"grad_norm": 0.2300837335568015,
"learning_rate": 3.2421845296497234e-05,
"loss": 0.4666,
"num_tokens": 1027955809.0,
"step": 1345
},
{
"epoch": 1.8417462482946794,
"grad_norm": 0.23573100830820137,
"learning_rate": 3.229793658775316e-05,
"loss": 0.4583,
"num_tokens": 1031820433.0,
"step": 1350
},
{
"epoch": 1.848567530695771,
"grad_norm": 0.2595808132513396,
"learning_rate": 3.217387524802387e-05,
"loss": 0.4596,
"num_tokens": 1035628174.0,
"step": 1355
},
{
"epoch": 1.8553888130968623,
"grad_norm": 0.25623846534093714,
"learning_rate": 3.204966522392355e-05,
"loss": 0.4684,
"num_tokens": 1039538259.0,
"step": 1360
},
{
"epoch": 1.8622100954979537,
"grad_norm": 0.2508726901197259,
"learning_rate": 3.1925310466796284e-05,
"loss": 0.457,
"num_tokens": 1043323479.0,
"step": 1365
},
{
"epoch": 1.869031377899045,
"grad_norm": 0.2421604869066339,
"learning_rate": 3.180081493259036e-05,
"loss": 0.4627,
"num_tokens": 1047143725.0,
"step": 1370
},
{
"epoch": 1.8758526603001364,
"grad_norm": 0.23737080041525266,
"learning_rate": 3.1676182581732454e-05,
"loss": 0.4578,
"num_tokens": 1051058729.0,
"step": 1375
},
{
"epoch": 1.8826739427012278,
"grad_norm": 0.27755810889548954,
"learning_rate": 3.155141737900162e-05,
"loss": 0.4529,
"num_tokens": 1054707187.0,
"step": 1380
},
{
"epoch": 1.8894952251023192,
"grad_norm": 0.24769759366411626,
"learning_rate": 3.142652329340319e-05,
"loss": 0.4525,
"num_tokens": 1058505830.0,
"step": 1385
},
{
"epoch": 1.8963165075034105,
"grad_norm": 0.24993371360135405,
"learning_rate": 3.1301504298042464e-05,
"loss": 0.4463,
"num_tokens": 1062382498.0,
"step": 1390
},
{
"epoch": 1.9031377899045019,
"grad_norm": 0.26275979680991063,
"learning_rate": 3.117636436999835e-05,
"loss": 0.4641,
"num_tokens": 1066230098.0,
"step": 1395
},
{
"epoch": 1.9099590723055935,
"grad_norm": 0.2432200698117419,
"learning_rate": 3.105110749019684e-05,
"loss": 0.4549,
"num_tokens": 1070100345.0,
"step": 1400
},
{
"epoch": 1.9167803547066848,
"grad_norm": 0.26061462825889503,
"learning_rate": 3.0925737643284405e-05,
"loss": 0.4542,
"num_tokens": 1073919676.0,
"step": 1405
},
{
"epoch": 1.9236016371077762,
"grad_norm": 0.23086737216549277,
"learning_rate": 3.080025881750116e-05,
"loss": 0.4606,
"num_tokens": 1077765957.0,
"step": 1410
},
{
"epoch": 1.9304229195088678,
"grad_norm": 0.24203650893105913,
"learning_rate": 3.067467500455404e-05,
"loss": 0.4505,
"num_tokens": 1081540834.0,
"step": 1415
},
{
"epoch": 1.9372442019099592,
"grad_norm": 0.23619147847327127,
"learning_rate": 3.054899019948984e-05,
"loss": 0.4654,
"num_tokens": 1085254168.0,
"step": 1420
},
{
"epoch": 1.9440654843110505,
"grad_norm": 0.22649221738372532,
"learning_rate": 3.042320840056807e-05,
"loss": 0.4583,
"num_tokens": 1089204273.0,
"step": 1425
},
{
"epoch": 1.950886766712142,
"grad_norm": 0.23354218616389152,
"learning_rate": 3.0297333609133806e-05,
"loss": 0.4571,
"num_tokens": 1093059842.0,
"step": 1430
},
{
"epoch": 1.9577080491132333,
"grad_norm": 0.25797343469814343,
"learning_rate": 3.017136982949035e-05,
"loss": 0.4627,
"num_tokens": 1096876504.0,
"step": 1435
},
{
"epoch": 1.9645293315143246,
"grad_norm": 0.26072636154582024,
"learning_rate": 3.004532106877191e-05,
"loss": 0.4508,
"num_tokens": 1100833806.0,
"step": 1440
},
{
"epoch": 1.971350613915416,
"grad_norm": 0.24072481782703328,
"learning_rate": 2.9919191336816094e-05,
"loss": 0.4516,
"num_tokens": 1104630572.0,
"step": 1445
},
{
"epoch": 1.9781718963165074,
"grad_norm": 0.2571530398132939,
"learning_rate": 2.9792984646036336e-05,
"loss": 0.4466,
"num_tokens": 1108388900.0,
"step": 1450
},
{
"epoch": 1.984993178717599,
"grad_norm": 0.2526050017737328,
"learning_rate": 2.966670501129427e-05,
"loss": 0.4617,
"num_tokens": 1112326009.0,
"step": 1455
},
{
"epoch": 1.9918144611186903,
"grad_norm": 0.25326046223431775,
"learning_rate": 2.9540356449772034e-05,
"loss": 0.4584,
"num_tokens": 1116108995.0,
"step": 1460
},
{
"epoch": 1.9986357435197817,
"grad_norm": 0.2564772607600422,
"learning_rate": 2.941394298084441e-05,
"loss": 0.4484,
"num_tokens": 1119855365.0,
"step": 1465
},
{
"epoch": 2.0054570259208733,
"grad_norm": 0.3147035418636063,
"learning_rate": 2.9287468625951025e-05,
"loss": 0.4223,
"num_tokens": 1123684625.0,
"step": 1470
},
{
"epoch": 2.0122783083219646,
"grad_norm": 0.2565619666869903,
"learning_rate": 2.9160937408468396e-05,
"loss": 0.416,
"num_tokens": 1127572305.0,
"step": 1475
},
{
"epoch": 2.019099590723056,
"grad_norm": 0.32184315313361184,
"learning_rate": 2.9034353353581956e-05,
"loss": 0.4247,
"num_tokens": 1131358945.0,
"step": 1480
},
{
"epoch": 2.0259208731241474,
"grad_norm": 0.27261892484724387,
"learning_rate": 2.8907720488157948e-05,
"loss": 0.4194,
"num_tokens": 1135163182.0,
"step": 1485
},
{
"epoch": 2.0327421555252387,
"grad_norm": 0.23968605532571555,
"learning_rate": 2.87810428406154e-05,
"loss": 0.4321,
"num_tokens": 1139156691.0,
"step": 1490
},
{
"epoch": 2.03956343792633,
"grad_norm": 0.23673185470909455,
"learning_rate": 2.8654324440797948e-05,
"loss": 0.4215,
"num_tokens": 1142963803.0,
"step": 1495
},
{
"epoch": 2.0463847203274215,
"grad_norm": 0.2491861063584158,
"learning_rate": 2.8527569319845597e-05,
"loss": 0.4089,
"num_tokens": 1146661529.0,
"step": 1500
},
{
"epoch": 2.053206002728513,
"grad_norm": 0.23530867040961626,
"learning_rate": 2.8400781510066536e-05,
"loss": 0.4281,
"num_tokens": 1150631571.0,
"step": 1505
},
{
"epoch": 2.060027285129604,
"grad_norm": 0.2631434011973304,
"learning_rate": 2.8273965044808864e-05,
"loss": 0.4176,
"num_tokens": 1154372520.0,
"step": 1510
},
{
"epoch": 2.0668485675306956,
"grad_norm": 0.2284833410164006,
"learning_rate": 2.8147123958332216e-05,
"loss": 0.4229,
"num_tokens": 1158377799.0,
"step": 1515
},
{
"epoch": 2.0736698499317874,
"grad_norm": 0.2678035655026217,
"learning_rate": 2.8020262285679523e-05,
"loss": 0.4258,
"num_tokens": 1162137345.0,
"step": 1520
},
{
"epoch": 2.0804911323328787,
"grad_norm": 0.2712552795411336,
"learning_rate": 2.7893384062548554e-05,
"loss": 0.4254,
"num_tokens": 1165931384.0,
"step": 1525
},
{
"epoch": 2.08731241473397,
"grad_norm": 0.22877487888785064,
"learning_rate": 2.7766493325163606e-05,
"loss": 0.418,
"num_tokens": 1169758424.0,
"step": 1530
},
{
"epoch": 2.0941336971350615,
"grad_norm": 0.23596015312602944,
"learning_rate": 2.7639594110147073e-05,
"loss": 0.4233,
"num_tokens": 1173501069.0,
"step": 1535
},
{
"epoch": 2.100954979536153,
"grad_norm": 0.24515986360845682,
"learning_rate": 2.7512690454391032e-05,
"loss": 0.4303,
"num_tokens": 1177335233.0,
"step": 1540
},
{
"epoch": 2.107776261937244,
"grad_norm": 0.22909429894313327,
"learning_rate": 2.7385786394928827e-05,
"loss": 0.4194,
"num_tokens": 1181125439.0,
"step": 1545
},
{
"epoch": 2.1145975443383356,
"grad_norm": 0.2336846178361885,
"learning_rate": 2.725888596880666e-05,
"loss": 0.4286,
"num_tokens": 1185004310.0,
"step": 1550
},
{
"epoch": 2.121418826739427,
"grad_norm": 0.24571070856958216,
"learning_rate": 2.7131993212955126e-05,
"loss": 0.4342,
"num_tokens": 1188730220.0,
"step": 1555
},
{
"epoch": 2.1282401091405183,
"grad_norm": 0.22792129868881894,
"learning_rate": 2.7005112164060832e-05,
"loss": 0.4132,
"num_tokens": 1192452885.0,
"step": 1560
},
{
"epoch": 2.1350613915416097,
"grad_norm": 0.2410539139234191,
"learning_rate": 2.6878246858437957e-05,
"loss": 0.42,
"num_tokens": 1196235271.0,
"step": 1565
},
{
"epoch": 2.141882673942701,
"grad_norm": 0.25078836647709346,
"learning_rate": 2.675140133189986e-05,
"loss": 0.4163,
"num_tokens": 1200064083.0,
"step": 1570
},
{
"epoch": 2.148703956343793,
"grad_norm": 0.22906696585845734,
"learning_rate": 2.66245796196307e-05,
"loss": 0.4242,
"num_tokens": 1203975750.0,
"step": 1575
},
{
"epoch": 2.155525238744884,
"grad_norm": 0.24781657545420474,
"learning_rate": 2.649778575605706e-05,
"loss": 0.4281,
"num_tokens": 1207941601.0,
"step": 1580
},
{
"epoch": 2.1623465211459756,
"grad_norm": 0.24090982192799237,
"learning_rate": 2.6371023774719595e-05,
"loss": 0.4182,
"num_tokens": 1211722253.0,
"step": 1585
},
{
"epoch": 2.169167803547067,
"grad_norm": 0.25749659183735263,
"learning_rate": 2.624429770814473e-05,
"loss": 0.425,
"num_tokens": 1215472991.0,
"step": 1590
},
{
"epoch": 2.1759890859481583,
"grad_norm": 0.2472677398883936,
"learning_rate": 2.6117611587716384e-05,
"loss": 0.4288,
"num_tokens": 1219314367.0,
"step": 1595
},
{
"epoch": 2.1828103683492497,
"grad_norm": 0.25092458443075377,
"learning_rate": 2.599096944354772e-05,
"loss": 0.4301,
"num_tokens": 1223192538.0,
"step": 1600
},
{
"epoch": 2.189631650750341,
"grad_norm": 0.247907240580044,
"learning_rate": 2.5864375304352918e-05,
"loss": 0.4074,
"num_tokens": 1226911643.0,
"step": 1605
},
{
"epoch": 2.1964529331514324,
"grad_norm": 0.23295476928581843,
"learning_rate": 2.5737833197319062e-05,
"loss": 0.4225,
"num_tokens": 1230700907.0,
"step": 1610
},
{
"epoch": 2.203274215552524,
"grad_norm": 0.24438835145296037,
"learning_rate": 2.5611347147977982e-05,
"loss": 0.4263,
"num_tokens": 1234481818.0,
"step": 1615
},
{
"epoch": 2.210095497953615,
"grad_norm": 0.24089894593772532,
"learning_rate": 2.5484921180078213e-05,
"loss": 0.4254,
"num_tokens": 1238353778.0,
"step": 1620
},
{
"epoch": 2.2169167803547065,
"grad_norm": 0.21464574119985996,
"learning_rate": 2.5358559315456993e-05,
"loss": 0.4227,
"num_tokens": 1242159504.0,
"step": 1625
},
{
"epoch": 2.223738062755798,
"grad_norm": 0.22839035226091423,
"learning_rate": 2.5232265573912327e-05,
"loss": 0.4244,
"num_tokens": 1246049292.0,
"step": 1630
},
{
"epoch": 2.2305593451568897,
"grad_norm": 0.22402310267370926,
"learning_rate": 2.5106043973075076e-05,
"loss": 0.4328,
"num_tokens": 1249978353.0,
"step": 1635
},
{
"epoch": 2.237380627557981,
"grad_norm": 0.23206367405395442,
"learning_rate": 2.4979898528281214e-05,
"loss": 0.4211,
"num_tokens": 1253553694.0,
"step": 1640
},
{
"epoch": 2.2442019099590724,
"grad_norm": 0.2333554639072696,
"learning_rate": 2.485383325244403e-05,
"loss": 0.4246,
"num_tokens": 1257413581.0,
"step": 1645
},
{
"epoch": 2.251023192360164,
"grad_norm": 0.23454156642634885,
"learning_rate": 2.4727852155926497e-05,
"loss": 0.4279,
"num_tokens": 1261372211.0,
"step": 1650
},
{
"epoch": 2.257844474761255,
"grad_norm": 0.21421952276925463,
"learning_rate": 2.4601959246413696e-05,
"loss": 0.4232,
"num_tokens": 1265194682.0,
"step": 1655
},
{
"epoch": 2.2646657571623465,
"grad_norm": 0.22105746329030013,
"learning_rate": 2.447615852878533e-05,
"loss": 0.421,
"num_tokens": 1268958243.0,
"step": 1660
},
{
"epoch": 2.271487039563438,
"grad_norm": 0.23675555949897692,
"learning_rate": 2.4350454004988283e-05,
"loss": 0.4132,
"num_tokens": 1272721552.0,
"step": 1665
},
{
"epoch": 2.2783083219645293,
"grad_norm": 0.23313280614805948,
"learning_rate": 2.4224849673909374e-05,
"loss": 0.4201,
"num_tokens": 1276637460.0,
"step": 1670
},
{
"epoch": 2.2851296043656206,
"grad_norm": 0.24879594808091673,
"learning_rate": 2.409934953124809e-05,
"loss": 0.4277,
"num_tokens": 1280437716.0,
"step": 1675
},
{
"epoch": 2.291950886766712,
"grad_norm": 0.23629762626791306,
"learning_rate": 2.3973957569389503e-05,
"loss": 0.4225,
"num_tokens": 1284303404.0,
"step": 1680
},
{
"epoch": 2.2987721691678034,
"grad_norm": 0.23426015282967028,
"learning_rate": 2.3848677777277278e-05,
"loss": 0.4207,
"num_tokens": 1288266060.0,
"step": 1685
},
{
"epoch": 2.305593451568895,
"grad_norm": 0.2474806470725563,
"learning_rate": 2.3723514140286734e-05,
"loss": 0.4263,
"num_tokens": 1292088844.0,
"step": 1690
},
{
"epoch": 2.3124147339699865,
"grad_norm": 0.26321888944320493,
"learning_rate": 2.359847064009808e-05,
"loss": 0.419,
"num_tokens": 1295907279.0,
"step": 1695
},
{
"epoch": 2.319236016371078,
"grad_norm": 0.2330696578889981,
"learning_rate": 2.3473551254569794e-05,
"loss": 0.4132,
"num_tokens": 1299636811.0,
"step": 1700
},
{
"epoch": 2.3260572987721693,
"grad_norm": 0.23243524196557758,
"learning_rate": 2.3348759957611998e-05,
"loss": 0.4282,
"num_tokens": 1303503660.0,
"step": 1705
},
{
"epoch": 2.3328785811732606,
"grad_norm": 0.2353602509324825,
"learning_rate": 2.3224100719060127e-05,
"loss": 0.4286,
"num_tokens": 1307218513.0,
"step": 1710
},
{
"epoch": 2.339699863574352,
"grad_norm": 0.2233695529503808,
"learning_rate": 2.309957750454858e-05,
"loss": 0.4157,
"num_tokens": 1311018055.0,
"step": 1715
},
{
"epoch": 2.3465211459754434,
"grad_norm": 0.22408309313922292,
"learning_rate": 2.2975194275384594e-05,
"loss": 0.4175,
"num_tokens": 1314633285.0,
"step": 1720
},
{
"epoch": 2.3533424283765347,
"grad_norm": 0.22428384354773337,
"learning_rate": 2.2850954988422207e-05,
"loss": 0.4171,
"num_tokens": 1318426856.0,
"step": 1725
},
{
"epoch": 2.360163710777626,
"grad_norm": 0.22843262783520524,
"learning_rate": 2.272686359593642e-05,
"loss": 0.417,
"num_tokens": 1322084417.0,
"step": 1730
},
{
"epoch": 2.3669849931787175,
"grad_norm": 0.21581659433548242,
"learning_rate": 2.2602924045497425e-05,
"loss": 0.4214,
"num_tokens": 1325960721.0,
"step": 1735
},
{
"epoch": 2.373806275579809,
"grad_norm": 0.22722846051005088,
"learning_rate": 2.247914027984505e-05,
"loss": 0.41,
"num_tokens": 1329717684.0,
"step": 1740
},
{
"epoch": 2.3806275579809,
"grad_norm": 0.22477800449961008,
"learning_rate": 2.2355516236763324e-05,
"loss": 0.4138,
"num_tokens": 1333545659.0,
"step": 1745
},
{
"epoch": 2.3874488403819916,
"grad_norm": 0.22050953995659528,
"learning_rate": 2.2232055848955248e-05,
"loss": 0.4198,
"num_tokens": 1337260747.0,
"step": 1750
},
{
"epoch": 2.3942701227830834,
"grad_norm": 0.22917354851814678,
"learning_rate": 2.2108763043917608e-05,
"loss": 0.4211,
"num_tokens": 1341043858.0,
"step": 1755
},
{
"epoch": 2.4010914051841747,
"grad_norm": 0.2284572798651079,
"learning_rate": 2.1985641743816105e-05,
"loss": 0.4319,
"num_tokens": 1344938218.0,
"step": 1760
},
{
"epoch": 2.407912687585266,
"grad_norm": 0.21929109499301586,
"learning_rate": 2.1862695865360554e-05,
"loss": 0.4303,
"num_tokens": 1348788919.0,
"step": 1765
},
{
"epoch": 2.4147339699863575,
"grad_norm": 0.2256346107569212,
"learning_rate": 2.17399293196803e-05,
"loss": 0.4214,
"num_tokens": 1352691877.0,
"step": 1770
},
{
"epoch": 2.421555252387449,
"grad_norm": 0.23019155066996747,
"learning_rate": 2.1617346012199778e-05,
"loss": 0.4235,
"num_tokens": 1356448019.0,
"step": 1775
},
{
"epoch": 2.42837653478854,
"grad_norm": 0.24309913308062492,
"learning_rate": 2.1494949842514288e-05,
"loss": 0.4272,
"num_tokens": 1360291674.0,
"step": 1780
},
{
"epoch": 2.4351978171896316,
"grad_norm": 0.2173000207873259,
"learning_rate": 2.137274470426596e-05,
"loss": 0.4182,
"num_tokens": 1364175739.0,
"step": 1785
},
{
"epoch": 2.442019099590723,
"grad_norm": 0.21860172592053967,
"learning_rate": 2.125073448501985e-05,
"loss": 0.4264,
"num_tokens": 1368149466.0,
"step": 1790
},
{
"epoch": 2.4488403819918143,
"grad_norm": 0.2176204658866239,
"learning_rate": 2.11289230661403e-05,
"loss": 0.4198,
"num_tokens": 1371952154.0,
"step": 1795
},
{
"epoch": 2.4556616643929057,
"grad_norm": 0.2354590603024919,
"learning_rate": 2.1007314322667436e-05,
"loss": 0.4232,
"num_tokens": 1375728846.0,
"step": 1800
},
{
"epoch": 2.4624829467939975,
"grad_norm": 0.21712276199851965,
"learning_rate": 2.0885912123193945e-05,
"loss": 0.4157,
"num_tokens": 1379383893.0,
"step": 1805
},
{
"epoch": 2.469304229195089,
"grad_norm": 0.22832909360309003,
"learning_rate": 2.0764720329741953e-05,
"loss": 0.4229,
"num_tokens": 1383235087.0,
"step": 1810
},
{
"epoch": 2.47612551159618,
"grad_norm": 0.22951251208399123,
"learning_rate": 2.064374279764022e-05,
"loss": 0.4132,
"num_tokens": 1387022452.0,
"step": 1815
},
{
"epoch": 2.4829467939972716,
"grad_norm": 0.22838348706069087,
"learning_rate": 2.052298337540142e-05,
"loss": 0.4199,
"num_tokens": 1390822247.0,
"step": 1820
},
{
"epoch": 2.489768076398363,
"grad_norm": 0.21887085907417755,
"learning_rate": 2.0402445904599827e-05,
"loss": 0.4191,
"num_tokens": 1394564010.0,
"step": 1825
},
{
"epoch": 2.4965893587994543,
"grad_norm": 0.23482857590479367,
"learning_rate": 2.0282134219748983e-05,
"loss": 0.4149,
"num_tokens": 1398382007.0,
"step": 1830
},
{
"epoch": 2.5034106412005457,
"grad_norm": 0.24606812348910376,
"learning_rate": 2.0162052148179798e-05,
"loss": 0.4205,
"num_tokens": 1402127581.0,
"step": 1835
},
{
"epoch": 2.510231923601637,
"grad_norm": 0.21575272826753958,
"learning_rate": 2.0042203509918768e-05,
"loss": 0.4267,
"num_tokens": 1406014675.0,
"step": 1840
},
{
"epoch": 2.5170532060027284,
"grad_norm": 0.21192778882004046,
"learning_rate": 1.992259211756645e-05,
"loss": 0.4175,
"num_tokens": 1409722672.0,
"step": 1845
},
{
"epoch": 2.52387448840382,
"grad_norm": 0.22811496251814561,
"learning_rate": 1.98032217761762e-05,
"loss": 0.4261,
"num_tokens": 1413666158.0,
"step": 1850
},
{
"epoch": 2.530695770804911,
"grad_norm": 0.22908632960325515,
"learning_rate": 1.9684096283133084e-05,
"loss": 0.4269,
"num_tokens": 1417402571.0,
"step": 1855
},
{
"epoch": 2.5375170532060025,
"grad_norm": 0.22178872945976558,
"learning_rate": 1.9565219428033127e-05,
"loss": 0.4163,
"num_tokens": 1421340412.0,
"step": 1860
},
{
"epoch": 2.544338335607094,
"grad_norm": 0.22694151179861227,
"learning_rate": 1.9446594992562716e-05,
"loss": 0.4249,
"num_tokens": 1425199953.0,
"step": 1865
},
{
"epoch": 2.5511596180081857,
"grad_norm": 0.236572889052039,
"learning_rate": 1.932822675037833e-05,
"loss": 0.4181,
"num_tokens": 1429272848.0,
"step": 1870
},
{
"epoch": 2.557980900409277,
"grad_norm": 0.23388899031526594,
"learning_rate": 1.921011846698646e-05,
"loss": 0.4147,
"num_tokens": 1433091944.0,
"step": 1875
},
{
"epoch": 2.5648021828103684,
"grad_norm": 0.23974990168673324,
"learning_rate": 1.9092273899623864e-05,
"loss": 0.42,
"num_tokens": 1436974840.0,
"step": 1880
},
{
"epoch": 2.57162346521146,
"grad_norm": 0.2309497009298231,
"learning_rate": 1.8974696797137996e-05,
"loss": 0.4254,
"num_tokens": 1440774914.0,
"step": 1885
},
{
"epoch": 2.578444747612551,
"grad_norm": 0.23368473252531424,
"learning_rate": 1.885739089986779e-05,
"loss": 0.418,
"num_tokens": 1444556279.0,
"step": 1890
},
{
"epoch": 2.5852660300136425,
"grad_norm": 0.21692135999687384,
"learning_rate": 1.8740359939524655e-05,
"loss": 0.4231,
"num_tokens": 1448438583.0,
"step": 1895
},
{
"epoch": 2.592087312414734,
"grad_norm": 0.21687707639488846,
"learning_rate": 1.8623607639073743e-05,
"loss": 0.4274,
"num_tokens": 1452265791.0,
"step": 1900
},
{
"epoch": 2.5989085948158253,
"grad_norm": 0.25141521675980416,
"learning_rate": 1.8507137712615553e-05,
"loss": 0.4257,
"num_tokens": 1455960099.0,
"step": 1905
},
{
"epoch": 2.6057298772169166,
"grad_norm": 0.2487430740484725,
"learning_rate": 1.8390953865267756e-05,
"loss": 0.4223,
"num_tokens": 1459864902.0,
"step": 1910
},
{
"epoch": 2.6125511596180084,
"grad_norm": 0.2348067182488432,
"learning_rate": 1.8275059793047318e-05,
"loss": 0.4113,
"num_tokens": 1463560724.0,
"step": 1915
},
{
"epoch": 2.6193724420191,
"grad_norm": 0.20831466486828573,
"learning_rate": 1.8159459182752958e-05,
"loss": 0.4153,
"num_tokens": 1467318533.0,
"step": 1920
},
{
"epoch": 2.626193724420191,
"grad_norm": 0.221088737686775,
"learning_rate": 1.8044155711847833e-05,
"loss": 0.4189,
"num_tokens": 1471225951.0,
"step": 1925
},
{
"epoch": 2.6330150068212825,
"grad_norm": 0.23573336270806225,
"learning_rate": 1.792915304834256e-05,
"loss": 0.414,
"num_tokens": 1474976948.0,
"step": 1930
},
{
"epoch": 2.639836289222374,
"grad_norm": 0.23412991747166595,
"learning_rate": 1.781445485067854e-05,
"loss": 0.4181,
"num_tokens": 1478830865.0,
"step": 1935
},
{
"epoch": 2.6466575716234653,
"grad_norm": 0.22999200373661702,
"learning_rate": 1.770006476761157e-05,
"loss": 0.4309,
"num_tokens": 1482679267.0,
"step": 1940
},
{
"epoch": 2.6534788540245566,
"grad_norm": 0.2311723989723882,
"learning_rate": 1.7585986438095763e-05,
"loss": 0.4237,
"num_tokens": 1486426058.0,
"step": 1945
},
{
"epoch": 2.660300136425648,
"grad_norm": 0.22020293262737,
"learning_rate": 1.7472223491167767e-05,
"loss": 0.4101,
"num_tokens": 1490166733.0,
"step": 1950
},
{
"epoch": 2.6671214188267394,
"grad_norm": 0.21795987471310851,
"learning_rate": 1.735877954583139e-05,
"loss": 0.4245,
"num_tokens": 1493992930.0,
"step": 1955
},
{
"epoch": 2.6739427012278307,
"grad_norm": 0.23745451187143324,
"learning_rate": 1.724565821094239e-05,
"loss": 0.4098,
"num_tokens": 1497730531.0,
"step": 1960
},
{
"epoch": 2.680763983628922,
"grad_norm": 0.226178899379112,
"learning_rate": 1.7132863085093728e-05,
"loss": 0.425,
"num_tokens": 1501593876.0,
"step": 1965
},
{
"epoch": 2.6875852660300135,
"grad_norm": 0.231976304295542,
"learning_rate": 1.7020397756501062e-05,
"loss": 0.4215,
"num_tokens": 1505411865.0,
"step": 1970
},
{
"epoch": 2.694406548431105,
"grad_norm": 0.2081553021114587,
"learning_rate": 1.6908265802888605e-05,
"loss": 0.4144,
"num_tokens": 1509039077.0,
"step": 1975
},
{
"epoch": 2.701227830832196,
"grad_norm": 0.23286471006088036,
"learning_rate": 1.6796470791375302e-05,
"loss": 0.4158,
"num_tokens": 1512763400.0,
"step": 1980
},
{
"epoch": 2.708049113233288,
"grad_norm": 0.21963420285235397,
"learning_rate": 1.668501627836138e-05,
"loss": 0.4221,
"num_tokens": 1516724445.0,
"step": 1985
},
{
"epoch": 2.7148703956343794,
"grad_norm": 0.2102372089422378,
"learning_rate": 1.657390580941521e-05,
"loss": 0.4149,
"num_tokens": 1520614981.0,
"step": 1990
},
{
"epoch": 2.7216916780354707,
"grad_norm": 0.2103099247946458,
"learning_rate": 1.646314291916045e-05,
"loss": 0.4225,
"num_tokens": 1524397796.0,
"step": 1995
},
{
"epoch": 2.728512960436562,
"grad_norm": 0.20651298615731473,
"learning_rate": 1.6352731131163724e-05,
"loss": 0.4176,
"num_tokens": 1528158624.0,
"step": 2000
},
{
"epoch": 2.7353342428376535,
"grad_norm": 0.22256304912813424,
"learning_rate": 1.624267395782242e-05,
"loss": 0.4413,
"num_tokens": 1532032964.0,
"step": 2005
},
{
"epoch": 2.742155525238745,
"grad_norm": 0.2141320219657911,
"learning_rate": 1.6132974900252988e-05,
"loss": 0.42,
"num_tokens": 1535931108.0,
"step": 2010
},
{
"epoch": 2.748976807639836,
"grad_norm": 0.21399803252659633,
"learning_rate": 1.6023637448179608e-05,
"loss": 0.4079,
"num_tokens": 1539611481.0,
"step": 2015
},
{
"epoch": 2.7557980900409276,
"grad_norm": 0.21363759608230212,
"learning_rate": 1.591466507982312e-05,
"loss": 0.4285,
"num_tokens": 1543769258.0,
"step": 2020
},
{
"epoch": 2.762619372442019,
"grad_norm": 0.2085416000025761,
"learning_rate": 1.580606126179038e-05,
"loss": 0.4202,
"num_tokens": 1547704714.0,
"step": 2025
},
{
"epoch": 2.7694406548431107,
"grad_norm": 0.21003686502862656,
"learning_rate": 1.569782944896402e-05,
"loss": 0.4189,
"num_tokens": 1551607979.0,
"step": 2030
},
{
"epoch": 2.776261937244202,
"grad_norm": 0.22506606570671495,
"learning_rate": 1.5589973084392513e-05,
"loss": 0.4233,
"num_tokens": 1555548130.0,
"step": 2035
},
{
"epoch": 2.7830832196452935,
"grad_norm": 0.24072762404877676,
"learning_rate": 1.5482495599180637e-05,
"loss": 0.4094,
"num_tokens": 1559374280.0,
"step": 2040
},
{
"epoch": 2.789904502046385,
"grad_norm": 0.20730508474415332,
"learning_rate": 1.5375400412380347e-05,
"loss": 0.421,
"num_tokens": 1563231974.0,
"step": 2045
},
{
"epoch": 2.796725784447476,
"grad_norm": 0.23906963103039622,
"learning_rate": 1.5268690930882e-05,
"loss": 0.4261,
"num_tokens": 1567090495.0,
"step": 2050
},
{
"epoch": 2.8035470668485676,
"grad_norm": 0.22062602067558976,
"learning_rate": 1.5162370549305962e-05,
"loss": 0.4308,
"num_tokens": 1570887007.0,
"step": 2055
},
{
"epoch": 2.810368349249659,
"grad_norm": 0.20431590447856307,
"learning_rate": 1.505644264989464e-05,
"loss": 0.4136,
"num_tokens": 1574806787.0,
"step": 2060
},
{
"epoch": 2.8171896316507503,
"grad_norm": 0.24109171460042456,
"learning_rate": 1.4950910602404886e-05,
"loss": 0.4191,
"num_tokens": 1578732597.0,
"step": 2065
},
{
"epoch": 2.8240109140518417,
"grad_norm": 0.21187950729408436,
"learning_rate": 1.4845777764000757e-05,
"loss": 0.423,
"num_tokens": 1582527990.0,
"step": 2070
},
{
"epoch": 2.830832196452933,
"grad_norm": 0.21453562191523357,
"learning_rate": 1.4741047479146803e-05,
"loss": 0.4108,
"num_tokens": 1586163936.0,
"step": 2075
},
{
"epoch": 2.8376534788540244,
"grad_norm": 0.22588422097050712,
"learning_rate": 1.463672307950159e-05,
"loss": 0.4087,
"num_tokens": 1589894282.0,
"step": 2080
},
{
"epoch": 2.844474761255116,
"grad_norm": 0.2261754562696023,
"learning_rate": 1.4532807883811745e-05,
"loss": 0.4241,
"num_tokens": 1593669447.0,
"step": 2085
},
{
"epoch": 2.851296043656207,
"grad_norm": 0.20337839289851203,
"learning_rate": 1.4429305197806386e-05,
"loss": 0.4164,
"num_tokens": 1597406094.0,
"step": 2090
},
{
"epoch": 2.8581173260572985,
"grad_norm": 0.19221798653016375,
"learning_rate": 1.4326218314091971e-05,
"loss": 0.4197,
"num_tokens": 1601353966.0,
"step": 2095
},
{
"epoch": 2.8649386084583903,
"grad_norm": 0.21350869825951038,
"learning_rate": 1.4223550512047517e-05,
"loss": 0.4238,
"num_tokens": 1605354453.0,
"step": 2100
},
{
"epoch": 2.8717598908594817,
"grad_norm": 0.21516182131932057,
"learning_rate": 1.4121305057720305e-05,
"loss": 0.4277,
"num_tokens": 1609346678.0,
"step": 2105
},
{
"epoch": 2.878581173260573,
"grad_norm": 0.20487482373295057,
"learning_rate": 1.4019485203722004e-05,
"loss": 0.4233,
"num_tokens": 1613213477.0,
"step": 2110
},
{
"epoch": 2.8854024556616644,
"grad_norm": 0.19971661497573895,
"learning_rate": 1.391809418912513e-05,
"loss": 0.4134,
"num_tokens": 1616990063.0,
"step": 2115
},
{
"epoch": 2.892223738062756,
"grad_norm": 0.21479965480668722,
"learning_rate": 1.3817135239360079e-05,
"loss": 0.4122,
"num_tokens": 1620738563.0,
"step": 2120
},
{
"epoch": 2.899045020463847,
"grad_norm": 0.21149002401342695,
"learning_rate": 1.371661156611247e-05,
"loss": 0.4254,
"num_tokens": 1624503206.0,
"step": 2125
},
{
"epoch": 2.9058663028649385,
"grad_norm": 0.2064220920477961,
"learning_rate": 1.3616526367220999e-05,
"loss": 0.4218,
"num_tokens": 1628389551.0,
"step": 2130
},
{
"epoch": 2.91268758526603,
"grad_norm": 0.224336611095809,
"learning_rate": 1.3516882826575699e-05,
"loss": 0.4168,
"num_tokens": 1632099673.0,
"step": 2135
},
{
"epoch": 2.9195088676671213,
"grad_norm": 0.20231394272285003,
"learning_rate": 1.3417684114016682e-05,
"loss": 0.4208,
"num_tokens": 1636056127.0,
"step": 2140
},
{
"epoch": 2.926330150068213,
"grad_norm": 0.2342068060643603,
"learning_rate": 1.3318933385233252e-05,
"loss": 0.4134,
"num_tokens": 1639823774.0,
"step": 2145
},
{
"epoch": 2.9331514324693044,
"grad_norm": 0.22071791865366464,
"learning_rate": 1.3220633781663561e-05,
"loss": 0.4205,
"num_tokens": 1643643243.0,
"step": 2150
},
{
"epoch": 2.939972714870396,
"grad_norm": 0.20280836857510123,
"learning_rate": 1.3122788430394659e-05,
"loss": 0.4137,
"num_tokens": 1647326744.0,
"step": 2155
},
{
"epoch": 2.946793997271487,
"grad_norm": 0.21111204058607796,
"learning_rate": 1.3025400444062991e-05,
"loss": 0.4128,
"num_tokens": 1651209275.0,
"step": 2160
},
{
"epoch": 2.9536152796725785,
"grad_norm": 0.22752544012779208,
"learning_rate": 1.2928472920755427e-05,
"loss": 0.4197,
"num_tokens": 1654945680.0,
"step": 2165
},
{
"epoch": 2.96043656207367,
"grad_norm": 0.20457377739676993,
"learning_rate": 1.2832008943910679e-05,
"loss": 0.4126,
"num_tokens": 1658670255.0,
"step": 2170
},
{
"epoch": 2.9672578444747613,
"grad_norm": 0.215676001216208,
"learning_rate": 1.273601158222118e-05,
"loss": 0.421,
"num_tokens": 1662421467.0,
"step": 2175
},
{
"epoch": 2.9740791268758526,
"grad_norm": 0.20052662912803418,
"learning_rate": 1.2640483889535548e-05,
"loss": 0.4155,
"num_tokens": 1666233128.0,
"step": 2180
},
{
"epoch": 2.980900409276944,
"grad_norm": 0.22018742684050363,
"learning_rate": 1.2545428904761358e-05,
"loss": 0.4206,
"num_tokens": 1670143796.0,
"step": 2185
},
{
"epoch": 2.9877216916780354,
"grad_norm": 0.20843817841523296,
"learning_rate": 1.2450849651768482e-05,
"loss": 0.4166,
"num_tokens": 1674122886.0,
"step": 2190
},
{
"epoch": 2.9945429740791267,
"grad_norm": 0.21312098205498078,
"learning_rate": 1.2356749139292936e-05,
"loss": 0.4191,
"num_tokens": 1677987646.0,
"step": 2195
},
{
"epoch": 3.001364256480218,
"grad_norm": 0.25313048875354344,
"learning_rate": 1.2263130360841133e-05,
"loss": 0.4077,
"num_tokens": 1681710780.0,
"step": 2200
},
{
"epoch": 3.00818553888131,
"grad_norm": 0.2518733265600338,
"learning_rate": 1.2169996294594647e-05,
"loss": 0.3943,
"num_tokens": 1685570373.0,
"step": 2205
},
{
"epoch": 3.0150068212824013,
"grad_norm": 0.21608953499384176,
"learning_rate": 1.2077349903315494e-05,
"loss": 0.3942,
"num_tokens": 1689499335.0,
"step": 2210
},
{
"epoch": 3.0218281036834926,
"grad_norm": 0.21094067692735943,
"learning_rate": 1.1985194134251893e-05,
"loss": 0.3885,
"num_tokens": 1693267116.0,
"step": 2215
},
{
"epoch": 3.028649386084584,
"grad_norm": 0.22097130594934408,
"learning_rate": 1.1893531919044455e-05,
"loss": 0.392,
"num_tokens": 1696911966.0,
"step": 2220
},
{
"epoch": 3.0354706684856754,
"grad_norm": 0.23081007114913413,
"learning_rate": 1.1802366173632978e-05,
"loss": 0.3911,
"num_tokens": 1700717213.0,
"step": 2225
},
{
"epoch": 3.0422919508867667,
"grad_norm": 0.22034774494074957,
"learning_rate": 1.1711699798163662e-05,
"loss": 0.3914,
"num_tokens": 1704583362.0,
"step": 2230
},
{
"epoch": 3.049113233287858,
"grad_norm": 0.20795664906099545,
"learning_rate": 1.1621535676896832e-05,
"loss": 0.3913,
"num_tokens": 1708328029.0,
"step": 2235
},
{
"epoch": 3.0559345156889495,
"grad_norm": 0.20846407848313514,
"learning_rate": 1.153187667811523e-05,
"loss": 0.3981,
"num_tokens": 1712138483.0,
"step": 2240
},
{
"epoch": 3.062755798090041,
"grad_norm": 0.20753109208216994,
"learning_rate": 1.1442725654032726e-05,
"loss": 0.3941,
"num_tokens": 1715891615.0,
"step": 2245
},
{
"epoch": 3.069577080491132,
"grad_norm": 0.20038731471295618,
"learning_rate": 1.1354085440703613e-05,
"loss": 0.4021,
"num_tokens": 1719889011.0,
"step": 2250
},
{
"epoch": 3.0763983628922236,
"grad_norm": 0.21209283757585357,
"learning_rate": 1.1265958857932374e-05,
"loss": 0.3896,
"num_tokens": 1723608607.0,
"step": 2255
},
{
"epoch": 3.083219645293315,
"grad_norm": 0.23047949629485057,
"learning_rate": 1.1178348709183984e-05,
"loss": 0.4023,
"num_tokens": 1727372956.0,
"step": 2260
},
{
"epoch": 3.0900409276944067,
"grad_norm": 0.20869366897756297,
"learning_rate": 1.1091257781494702e-05,
"loss": 0.3921,
"num_tokens": 1731315261.0,
"step": 2265
},
{
"epoch": 3.096862210095498,
"grad_norm": 0.21058145322360297,
"learning_rate": 1.1004688845383456e-05,
"loss": 0.3963,
"num_tokens": 1735219718.0,
"step": 2270
},
{
"epoch": 3.1036834924965895,
"grad_norm": 0.2074326101986663,
"learning_rate": 1.0918644654763688e-05,
"loss": 0.3896,
"num_tokens": 1739047080.0,
"step": 2275
},
{
"epoch": 3.110504774897681,
"grad_norm": 0.20481355721124894,
"learning_rate": 1.0833127946855707e-05,
"loss": 0.3826,
"num_tokens": 1742971837.0,
"step": 2280
},
{
"epoch": 3.117326057298772,
"grad_norm": 0.19813330790330563,
"learning_rate": 1.0748141442099694e-05,
"loss": 0.3878,
"num_tokens": 1746927074.0,
"step": 2285
},
{
"epoch": 3.1241473396998636,
"grad_norm": 0.2116417089035285,
"learning_rate": 1.0663687844069093e-05,
"loss": 0.3878,
"num_tokens": 1750733072.0,
"step": 2290
},
{
"epoch": 3.130968622100955,
"grad_norm": 0.22256327065708215,
"learning_rate": 1.0579769839384614e-05,
"loss": 0.3946,
"num_tokens": 1754529527.0,
"step": 2295
},
{
"epoch": 3.1377899045020463,
"grad_norm": 0.2041840929582227,
"learning_rate": 1.0496390097628808e-05,
"loss": 0.3935,
"num_tokens": 1758441793.0,
"step": 2300
},
{
"epoch": 3.1446111869031377,
"grad_norm": 0.19564312042697185,
"learning_rate": 1.0413551271261101e-05,
"loss": 0.3836,
"num_tokens": 1762275922.0,
"step": 2305
},
{
"epoch": 3.151432469304229,
"grad_norm": 0.2068432147143155,
"learning_rate": 1.0331255995533418e-05,
"loss": 0.3906,
"num_tokens": 1766113412.0,
"step": 2310
},
{
"epoch": 3.1582537517053204,
"grad_norm": 0.22243916899267124,
"learning_rate": 1.0249506888406379e-05,
"loss": 0.3948,
"num_tokens": 1769906086.0,
"step": 2315
},
{
"epoch": 3.1650750341064118,
"grad_norm": 0.2048461202536205,
"learning_rate": 1.0168306550465994e-05,
"loss": 0.3966,
"num_tokens": 1773669464.0,
"step": 2320
},
{
"epoch": 3.1718963165075036,
"grad_norm": 0.21617012487326628,
"learning_rate": 1.0087657564840935e-05,
"loss": 0.3938,
"num_tokens": 1777499688.0,
"step": 2325
},
{
"epoch": 3.178717598908595,
"grad_norm": 0.20554623338455671,
"learning_rate": 1.000756249712037e-05,
"loss": 0.399,
"num_tokens": 1781484075.0,
"step": 2330
},
{
"epoch": 3.1855388813096863,
"grad_norm": 0.19715452218076857,
"learning_rate": 9.928023895272351e-06,
"loss": 0.3949,
"num_tokens": 1785359652.0,
"step": 2335
},
{
"epoch": 3.1923601637107777,
"grad_norm": 0.2213446767833458,
"learning_rate": 9.849044289562725e-06,
"loss": 0.3933,
"num_tokens": 1789122591.0,
"step": 2340
},
{
"epoch": 3.199181446111869,
"grad_norm": 0.20417351278770474,
"learning_rate": 9.770626192474689e-06,
"loss": 0.402,
"num_tokens": 1793107659.0,
"step": 2345
},
{
"epoch": 3.2060027285129604,
"grad_norm": 0.20314880208976857,
"learning_rate": 9.692772098628843e-06,
"loss": 0.391,
"num_tokens": 1796804821.0,
"step": 2350
},
{
"epoch": 3.212824010914052,
"grad_norm": 0.19634303877612064,
"learning_rate": 9.615484484703807e-06,
"loss": 0.3875,
"num_tokens": 1800706093.0,
"step": 2355
},
{
"epoch": 3.219645293315143,
"grad_norm": 0.20356531552961224,
"learning_rate": 9.53876580935749e-06,
"loss": 0.3892,
"num_tokens": 1804621910.0,
"step": 2360
},
{
"epoch": 3.2264665757162345,
"grad_norm": 0.20498462055547575,
"learning_rate": 9.462618513148825e-06,
"loss": 0.3898,
"num_tokens": 1808407310.0,
"step": 2365
},
{
"epoch": 3.233287858117326,
"grad_norm": 0.21014352436728398,
"learning_rate": 9.387045018460136e-06,
"loss": 0.3808,
"num_tokens": 1812224911.0,
"step": 2370
},
{
"epoch": 3.2401091405184177,
"grad_norm": 0.21372227802314,
"learning_rate": 9.312047729420112e-06,
"loss": 0.389,
"num_tokens": 1816033009.0,
"step": 2375
},
{
"epoch": 3.246930422919509,
"grad_norm": 0.21668432624109216,
"learning_rate": 9.237629031827294e-06,
"loss": 0.3909,
"num_tokens": 1819832724.0,
"step": 2380
},
{
"epoch": 3.2537517053206004,
"grad_norm": 0.2119446997252824,
"learning_rate": 9.163791293074183e-06,
"loss": 0.3951,
"num_tokens": 1823734105.0,
"step": 2385
},
{
"epoch": 3.260572987721692,
"grad_norm": 0.20740892554253706,
"learning_rate": 9.09053686207194e-06,
"loss": 0.3974,
"num_tokens": 1827553738.0,
"step": 2390
},
{
"epoch": 3.267394270122783,
"grad_norm": 0.2059967519948797,
"learning_rate": 9.017868069175678e-06,
"loss": 0.3914,
"num_tokens": 1831290342.0,
"step": 2395
},
{
"epoch": 3.2742155525238745,
"grad_norm": 0.21234381046184714,
"learning_rate": 8.945787226110273e-06,
"loss": 0.3965,
"num_tokens": 1835037993.0,
"step": 2400
},
{
"epoch": 3.281036834924966,
"grad_norm": 0.20237389246899862,
"learning_rate": 8.874296625896888e-06,
"loss": 0.3861,
"num_tokens": 1838807356.0,
"step": 2405
},
{
"epoch": 3.2878581173260573,
"grad_norm": 0.19098693481860202,
"learning_rate": 8.803398542779994e-06,
"loss": 0.4008,
"num_tokens": 1842745596.0,
"step": 2410
},
{
"epoch": 3.2946793997271486,
"grad_norm": 0.21338378176043743,
"learning_rate": 8.73309523215502e-06,
"loss": 0.3911,
"num_tokens": 1846517419.0,
"step": 2415
},
{
"epoch": 3.30150068212824,
"grad_norm": 0.20688191017456503,
"learning_rate": 8.663388930496616e-06,
"loss": 0.397,
"num_tokens": 1850310616.0,
"step": 2420
},
{
"epoch": 3.3083219645293314,
"grad_norm": 0.19608884555809325,
"learning_rate": 8.594281855287512e-06,
"loss": 0.3896,
"num_tokens": 1854227804.0,
"step": 2425
},
{
"epoch": 3.3151432469304227,
"grad_norm": 0.20671461791807003,
"learning_rate": 8.525776204947961e-06,
"loss": 0.3844,
"num_tokens": 1858022696.0,
"step": 2430
},
{
"epoch": 3.321964529331514,
"grad_norm": 0.20319736048116888,
"learning_rate": 8.45787415876581e-06,
"loss": 0.3831,
"num_tokens": 1861807749.0,
"step": 2435
},
{
"epoch": 3.328785811732606,
"grad_norm": 0.2050817882021739,
"learning_rate": 8.390577876827183e-06,
"loss": 0.4052,
"num_tokens": 1865749161.0,
"step": 2440
},
{
"epoch": 3.3356070941336973,
"grad_norm": 0.21735850171481977,
"learning_rate": 8.323889499947733e-06,
"loss": 0.3865,
"num_tokens": 1869507562.0,
"step": 2445
},
{
"epoch": 3.3424283765347886,
"grad_norm": 0.20478772687116364,
"learning_rate": 8.257811149604578e-06,
"loss": 0.3903,
"num_tokens": 1873354894.0,
"step": 2450
},
{
"epoch": 3.34924965893588,
"grad_norm": 0.22023262268774746,
"learning_rate": 8.1923449278688e-06,
"loss": 0.3926,
"num_tokens": 1877168182.0,
"step": 2455
},
{
"epoch": 3.3560709413369714,
"grad_norm": 0.2253966050871356,
"learning_rate": 8.127492917338545e-06,
"loss": 0.3969,
"num_tokens": 1881013690.0,
"step": 2460
},
{
"epoch": 3.3628922237380627,
"grad_norm": 0.20017364506269658,
"learning_rate": 8.063257181072827e-06,
"loss": 0.3949,
"num_tokens": 1884975840.0,
"step": 2465
},
{
"epoch": 3.369713506139154,
"grad_norm": 0.2043509066053362,
"learning_rate": 7.999639762525855e-06,
"loss": 0.3902,
"num_tokens": 1888839607.0,
"step": 2470
},
{
"epoch": 3.3765347885402455,
"grad_norm": 0.20210724992772502,
"learning_rate": 7.936642685482029e-06,
"loss": 0.3924,
"num_tokens": 1892784948.0,
"step": 2475
},
{
"epoch": 3.383356070941337,
"grad_norm": 0.21968340995681965,
"learning_rate": 7.874267953991589e-06,
"loss": 0.3933,
"num_tokens": 1896452542.0,
"step": 2480
},
{
"epoch": 3.390177353342428,
"grad_norm": 0.2094714463856819,
"learning_rate": 7.812517552306842e-06,
"loss": 0.3939,
"num_tokens": 1900341954.0,
"step": 2485
},
{
"epoch": 3.39699863574352,
"grad_norm": 0.2091973568099767,
"learning_rate": 7.751393444819021e-06,
"loss": 0.3964,
"num_tokens": 1904200124.0,
"step": 2490
},
{
"epoch": 3.4038199181446114,
"grad_norm": 0.20170653729583482,
"learning_rate": 7.690897575995838e-06,
"loss": 0.3843,
"num_tokens": 1908080037.0,
"step": 2495
},
{
"epoch": 3.4106412005457027,
"grad_norm": 0.23178079669342305,
"learning_rate": 7.63103187031961e-06,
"loss": 0.3838,
"num_tokens": 1911799657.0,
"step": 2500
},
{
"epoch": 3.417462482946794,
"grad_norm": 0.23412806397059693,
"learning_rate": 7.571798232226003e-06,
"loss": 0.3951,
"num_tokens": 1915633789.0,
"step": 2505
},
{
"epoch": 3.4242837653478855,
"grad_norm": 0.2262063695861437,
"learning_rate": 7.5131985460434985e-06,
"loss": 0.3917,
"num_tokens": 1919464952.0,
"step": 2510
},
{
"epoch": 3.431105047748977,
"grad_norm": 0.21906232852290922,
"learning_rate": 7.4552346759334285e-06,
"loss": 0.3893,
"num_tokens": 1923208468.0,
"step": 2515
},
{
"epoch": 3.437926330150068,
"grad_norm": 0.19736609233735516,
"learning_rate": 7.3979084658306535e-06,
"loss": 0.3944,
"num_tokens": 1927046873.0,
"step": 2520
},
{
"epoch": 3.4447476125511596,
"grad_norm": 0.20436848581764777,
"learning_rate": 7.34122173938495e-06,
"loss": 0.3842,
"num_tokens": 1930903387.0,
"step": 2525
},
{
"epoch": 3.451568894952251,
"grad_norm": 0.20859725244629027,
"learning_rate": 7.285176299902956e-06,
"loss": 0.3849,
"num_tokens": 1934670669.0,
"step": 2530
},
{
"epoch": 3.4583901773533423,
"grad_norm": 0.20852983402642666,
"learning_rate": 7.229773930290816e-06,
"loss": 0.3904,
"num_tokens": 1938387280.0,
"step": 2535
},
{
"epoch": 3.4652114597544337,
"grad_norm": 0.19522927512353125,
"learning_rate": 7.175016392997473e-06,
"loss": 0.379,
"num_tokens": 1942343131.0,
"step": 2540
},
{
"epoch": 3.472032742155525,
"grad_norm": 0.20050884496880403,
"learning_rate": 7.1209054299585965e-06,
"loss": 0.3876,
"num_tokens": 1945923779.0,
"step": 2545
},
{
"epoch": 3.4788540245566164,
"grad_norm": 0.2057634574949913,
"learning_rate": 7.0674427625411585e-06,
"loss": 0.3923,
"num_tokens": 1949676825.0,
"step": 2550
},
{
"epoch": 3.485675306957708,
"grad_norm": 0.1972141823138041,
"learning_rate": 7.014630091488686e-06,
"loss": 0.3816,
"num_tokens": 1953537310.0,
"step": 2555
},
{
"epoch": 3.4924965893587996,
"grad_norm": 0.20211921503834646,
"learning_rate": 6.962469096867162e-06,
"loss": 0.3885,
"num_tokens": 1957305222.0,
"step": 2560
},
{
"epoch": 3.499317871759891,
"grad_norm": 0.21038414370930605,
"learning_rate": 6.910961438011552e-06,
"loss": 0.385,
"num_tokens": 1960958556.0,
"step": 2565
},
{
"epoch": 3.5061391541609823,
"grad_norm": 0.21665071977354775,
"learning_rate": 6.860108753473055e-06,
"loss": 0.3941,
"num_tokens": 1964999544.0,
"step": 2570
},
{
"epoch": 3.5129604365620737,
"grad_norm": 0.19728776121424704,
"learning_rate": 6.809912660966959e-06,
"loss": 0.3882,
"num_tokens": 1968883238.0,
"step": 2575
},
{
"epoch": 3.519781718963165,
"grad_norm": 0.22398814515162693,
"learning_rate": 6.760374757321162e-06,
"loss": 0.3949,
"num_tokens": 1972701391.0,
"step": 2580
},
{
"epoch": 3.5266030013642564,
"grad_norm": 0.2002701521202298,
"learning_rate": 6.711496618425414e-06,
"loss": 0.3921,
"num_tokens": 1976508939.0,
"step": 2585
},
{
"epoch": 3.533424283765348,
"grad_norm": 0.19910839515627227,
"learning_rate": 6.663279799181149e-06,
"loss": 0.3956,
"num_tokens": 1980362572.0,
"step": 2590
},
{
"epoch": 3.540245566166439,
"grad_norm": 0.2041823149699508,
"learning_rate": 6.6157258334520285e-06,
"loss": 0.3973,
"num_tokens": 1984261238.0,
"step": 2595
},
{
"epoch": 3.547066848567531,
"grad_norm": 0.2000905521095183,
"learning_rate": 6.568836234015172e-06,
"loss": 0.3897,
"num_tokens": 1988149913.0,
"step": 2600
},
{
"epoch": 3.5538881309686223,
"grad_norm": 0.18563333401941437,
"learning_rate": 6.522612492512997e-06,
"loss": 0.3869,
"num_tokens": 1992084412.0,
"step": 2605
},
{
"epoch": 3.5607094133697137,
"grad_norm": 0.20277027008106682,
"learning_rate": 6.477056079405794e-06,
"loss": 0.3886,
"num_tokens": 1995921814.0,
"step": 2610
},
{
"epoch": 3.567530695770805,
"grad_norm": 0.20852766092177452,
"learning_rate": 6.432168443924929e-06,
"loss": 0.3883,
"num_tokens": 1999809637.0,
"step": 2615
},
{
"epoch": 3.5743519781718964,
"grad_norm": 0.21203977784508607,
"learning_rate": 6.387951014026755e-06,
"loss": 0.384,
"num_tokens": 2003526409.0,
"step": 2620
},
{
"epoch": 3.581173260572988,
"grad_norm": 0.2069840864292817,
"learning_rate": 6.3444051963471806e-06,
"loss": 0.3979,
"num_tokens": 2007455906.0,
"step": 2625
},
{
"epoch": 3.587994542974079,
"grad_norm": 0.20048934543362684,
"learning_rate": 6.301532376156921e-06,
"loss": 0.3934,
"num_tokens": 2011337751.0,
"step": 2630
},
{
"epoch": 3.5948158253751705,
"grad_norm": 0.19834381113774482,
"learning_rate": 6.259333917317436e-06,
"loss": 0.3992,
"num_tokens": 2015319415.0,
"step": 2635
},
{
"epoch": 3.601637107776262,
"grad_norm": 0.21282358337605378,
"learning_rate": 6.21781116223753e-06,
"loss": 0.4006,
"num_tokens": 2019219253.0,
"step": 2640
},
{
"epoch": 3.6084583901773533,
"grad_norm": 0.2029670555965298,
"learning_rate": 6.176965431830666e-06,
"loss": 0.3984,
"num_tokens": 2023102339.0,
"step": 2645
},
{
"epoch": 3.6152796725784446,
"grad_norm": 0.21004596795864844,
"learning_rate": 6.136798025472937e-06,
"loss": 0.3904,
"num_tokens": 2026777557.0,
"step": 2650
},
{
"epoch": 3.622100954979536,
"grad_norm": 0.20450488637042483,
"learning_rate": 6.097310220961715e-06,
"loss": 0.3926,
"num_tokens": 2030544635.0,
"step": 2655
},
{
"epoch": 3.6289222373806274,
"grad_norm": 0.20780498338670383,
"learning_rate": 6.058503274475029e-06,
"loss": 0.3959,
"num_tokens": 2034457342.0,
"step": 2660
},
{
"epoch": 3.6357435197817187,
"grad_norm": 0.20299175264729932,
"learning_rate": 6.020378420531589e-06,
"loss": 0.3935,
"num_tokens": 2038160685.0,
"step": 2665
},
{
"epoch": 3.64256480218281,
"grad_norm": 0.203714903198161,
"learning_rate": 5.982936871951507e-06,
"loss": 0.3893,
"num_tokens": 2042057043.0,
"step": 2670
},
{
"epoch": 3.649386084583902,
"grad_norm": 0.18615329351658916,
"learning_rate": 5.946179819817731e-06,
"loss": 0.381,
"num_tokens": 2045931129.0,
"step": 2675
},
{
"epoch": 3.6562073669849933,
"grad_norm": 0.2045589475870847,
"learning_rate": 5.910108433438151e-06,
"loss": 0.3942,
"num_tokens": 2049650512.0,
"step": 2680
},
{
"epoch": 3.6630286493860846,
"grad_norm": 0.20203376952951235,
"learning_rate": 5.874723860308384e-06,
"loss": 0.3822,
"num_tokens": 2053328762.0,
"step": 2685
},
{
"epoch": 3.669849931787176,
"grad_norm": 0.19587913638609752,
"learning_rate": 5.840027226075295e-06,
"loss": 0.3862,
"num_tokens": 2057162099.0,
"step": 2690
},
{
"epoch": 3.6766712141882674,
"grad_norm": 0.20886702731734277,
"learning_rate": 5.806019634501175e-06,
"loss": 0.3961,
"num_tokens": 2060972297.0,
"step": 2695
},
{
"epoch": 3.6834924965893587,
"grad_norm": 0.21067356253291641,
"learning_rate": 5.772702167428618e-06,
"loss": 0.3802,
"num_tokens": 2064586221.0,
"step": 2700
},
{
"epoch": 3.69031377899045,
"grad_norm": 0.20718394506562157,
"learning_rate": 5.7400758847461315e-06,
"loss": 0.3983,
"num_tokens": 2068459718.0,
"step": 2705
},
{
"epoch": 3.6971350613915415,
"grad_norm": 0.19555241045212834,
"learning_rate": 5.7081418243544e-06,
"loss": 0.3895,
"num_tokens": 2072325609.0,
"step": 2710
},
{
"epoch": 3.7039563437926333,
"grad_norm": 0.1963361038784244,
"learning_rate": 5.676901002133273e-06,
"loss": 0.3929,
"num_tokens": 2076390764.0,
"step": 2715
},
{
"epoch": 3.7107776261937246,
"grad_norm": 0.20786774861006216,
"learning_rate": 5.646354411909446e-06,
"loss": 0.4022,
"num_tokens": 2080252567.0,
"step": 2720
},
{
"epoch": 3.717598908594816,
"grad_norm": 0.20572707253355813,
"learning_rate": 5.616503025424856e-06,
"loss": 0.3877,
"num_tokens": 2084020477.0,
"step": 2725
},
{
"epoch": 3.7244201909959074,
"grad_norm": 0.20508204818855866,
"learning_rate": 5.587347792305745e-06,
"loss": 0.3832,
"num_tokens": 2087723825.0,
"step": 2730
},
{
"epoch": 3.7312414733969987,
"grad_norm": 0.19829256497358866,
"learning_rate": 5.558889640032476e-06,
"loss": 0.3959,
"num_tokens": 2091548707.0,
"step": 2735
},
{
"epoch": 3.73806275579809,
"grad_norm": 0.20585067559570674,
"learning_rate": 5.531129473910013e-06,
"loss": 0.395,
"num_tokens": 2095277696.0,
"step": 2740
},
{
"epoch": 3.7448840381991815,
"grad_norm": 0.20584618251918524,
"learning_rate": 5.504068177039132e-06,
"loss": 0.3843,
"num_tokens": 2099065289.0,
"step": 2745
},
{
"epoch": 3.751705320600273,
"grad_norm": 0.1961381624274159,
"learning_rate": 5.477706610288317e-06,
"loss": 0.3914,
"num_tokens": 2102889019.0,
"step": 2750
},
{
"epoch": 3.758526603001364,
"grad_norm": 0.20088944941627254,
"learning_rate": 5.45204561226638e-06,
"loss": 0.3895,
"num_tokens": 2106610423.0,
"step": 2755
},
{
"epoch": 3.7653478854024556,
"grad_norm": 0.19125391439188097,
"learning_rate": 5.42708599929578e-06,
"loss": 0.396,
"num_tokens": 2110417885.0,
"step": 2760
},
{
"epoch": 3.772169167803547,
"grad_norm": 0.21608047273020625,
"learning_rate": 5.402828565386665e-06,
"loss": 0.3968,
"num_tokens": 2114204416.0,
"step": 2765
},
{
"epoch": 3.7789904502046383,
"grad_norm": 0.1942534200680033,
"learning_rate": 5.3792740822116025e-06,
"loss": 0.3886,
"num_tokens": 2118096713.0,
"step": 2770
},
{
"epoch": 3.7858117326057297,
"grad_norm": 0.19802316385817678,
"learning_rate": 5.356423299081025e-06,
"loss": 0.4007,
"num_tokens": 2121992916.0,
"step": 2775
},
{
"epoch": 3.792633015006821,
"grad_norm": 0.20270167705867925,
"learning_rate": 5.33427694291941e-06,
"loss": 0.393,
"num_tokens": 2125730480.0,
"step": 2780
},
{
"epoch": 3.799454297407913,
"grad_norm": 0.20021094011750865,
"learning_rate": 5.31283571824215e-06,
"loss": 0.3879,
"num_tokens": 2129639321.0,
"step": 2785
},
{
"epoch": 3.806275579809004,
"grad_norm": 0.20135100813170187,
"learning_rate": 5.292100307133135e-06,
"loss": 0.3876,
"num_tokens": 2133458245.0,
"step": 2790
},
{
"epoch": 3.8130968622100956,
"grad_norm": 0.20308852278461587,
"learning_rate": 5.27207136922305e-06,
"loss": 0.3924,
"num_tokens": 2137312410.0,
"step": 2795
},
{
"epoch": 3.819918144611187,
"grad_norm": 0.19442660123469885,
"learning_rate": 5.25274954166841e-06,
"loss": 0.3905,
"num_tokens": 2141197422.0,
"step": 2800
},
{
"epoch": 3.8267394270122783,
"grad_norm": 0.2038756065211056,
"learning_rate": 5.234135439131267e-06,
"loss": 0.3891,
"num_tokens": 2145011937.0,
"step": 2805
},
{
"epoch": 3.8335607094133697,
"grad_norm": 0.20340134857081124,
"learning_rate": 5.2162296537596785e-06,
"loss": 0.3868,
"num_tokens": 2148857263.0,
"step": 2810
},
{
"epoch": 3.840381991814461,
"grad_norm": 0.19451669508345293,
"learning_rate": 5.199032755168853e-06,
"loss": 0.3906,
"num_tokens": 2152616579.0,
"step": 2815
},
{
"epoch": 3.8472032742155524,
"grad_norm": 0.19947015975252247,
"learning_rate": 5.1825452904230384e-06,
"loss": 0.3878,
"num_tokens": 2156280215.0,
"step": 2820
},
{
"epoch": 3.854024556616644,
"grad_norm": 0.19242022707343331,
"learning_rate": 5.166767784018122e-06,
"loss": 0.3908,
"num_tokens": 2160088451.0,
"step": 2825
},
{
"epoch": 3.8608458390177356,
"grad_norm": 0.20797387240019166,
"learning_rate": 5.151700737864934e-06,
"loss": 0.3894,
"num_tokens": 2163915194.0,
"step": 2830
},
{
"epoch": 3.867667121418827,
"grad_norm": 0.2017238446572756,
"learning_rate": 5.137344631273288e-06,
"loss": 0.3969,
"num_tokens": 2167621168.0,
"step": 2835
},
{
"epoch": 3.8744884038199183,
"grad_norm": 0.20071816929517722,
"learning_rate": 5.123699920936733e-06,
"loss": 0.3951,
"num_tokens": 2171481655.0,
"step": 2840
},
{
"epoch": 3.8813096862210097,
"grad_norm": 0.19141590926200375,
"learning_rate": 5.110767040918028e-06,
"loss": 0.3956,
"num_tokens": 2175244240.0,
"step": 2845
},
{
"epoch": 3.888130968622101,
"grad_norm": 0.1998247337143825,
"learning_rate": 5.0985464026353306e-06,
"loss": 0.3921,
"num_tokens": 2178959600.0,
"step": 2850
},
{
"epoch": 3.8949522510231924,
"grad_norm": 0.1866653423880014,
"learning_rate": 5.0870383948491004e-06,
"loss": 0.3894,
"num_tokens": 2182711952.0,
"step": 2855
},
{
"epoch": 3.901773533424284,
"grad_norm": 0.19710468274556872,
"learning_rate": 5.07624338364975e-06,
"loss": 0.3885,
"num_tokens": 2186594283.0,
"step": 2860
},
{
"epoch": 3.908594815825375,
"grad_norm": 0.21264165496615145,
"learning_rate": 5.066161712445985e-06,
"loss": 0.3865,
"num_tokens": 2190266795.0,
"step": 2865
},
{
"epoch": 3.9154160982264665,
"grad_norm": 0.2101090061168048,
"learning_rate": 5.0567937019538814e-06,
"loss": 0.397,
"num_tokens": 2194018553.0,
"step": 2870
},
{
"epoch": 3.922237380627558,
"grad_norm": 0.20800078607789743,
"learning_rate": 5.0481396501866925e-06,
"loss": 0.3948,
"num_tokens": 2197673799.0,
"step": 2875
},
{
"epoch": 3.9290586630286493,
"grad_norm": 0.19793740700955814,
"learning_rate": 5.040199832445351e-06,
"loss": 0.3923,
"num_tokens": 2201714258.0,
"step": 2880
},
{
"epoch": 3.9358799454297406,
"grad_norm": 0.20386936192836522,
"learning_rate": 5.032974501309735e-06,
"loss": 0.3862,
"num_tokens": 2205348220.0,
"step": 2885
},
{
"epoch": 3.942701227830832,
"grad_norm": 0.2091053406878887,
"learning_rate": 5.026463886630607e-06,
"loss": 0.4021,
"num_tokens": 2209250198.0,
"step": 2890
},
{
"epoch": 3.9495225102319234,
"grad_norm": 0.19787040691237082,
"learning_rate": 5.020668195522323e-06,
"loss": 0.3926,
"num_tokens": 2213251612.0,
"step": 2895
},
{
"epoch": 3.956343792633015,
"grad_norm": 0.20667483997814673,
"learning_rate": 5.015587612356232e-06,
"loss": 0.3832,
"num_tokens": 2217075610.0,
"step": 2900
},
{
"epoch": 3.9631650750341065,
"grad_norm": 0.1986326433013966,
"learning_rate": 5.011222298754814e-06,
"loss": 0.3889,
"num_tokens": 2220901736.0,
"step": 2905
},
{
"epoch": 3.969986357435198,
"grad_norm": 0.218645478760147,
"learning_rate": 5.007572393586543e-06,
"loss": 0.3784,
"num_tokens": 2224698938.0,
"step": 2910
},
{
"epoch": 3.9768076398362893,
"grad_norm": 0.20263059309127532,
"learning_rate": 5.004638012961454e-06,
"loss": 0.3852,
"num_tokens": 2228505486.0,
"step": 2915
},
{
"epoch": 3.9836289222373806,
"grad_norm": 0.21296036316974268,
"learning_rate": 5.002419250227476e-06,
"loss": 0.3879,
"num_tokens": 2232055237.0,
"step": 2920
},
{
"epoch": 3.990450204638472,
"grad_norm": 0.20148472020199598,
"learning_rate": 5.000916175967434e-06,
"loss": 0.3939,
"num_tokens": 2235867011.0,
"step": 2925
},
{
"epoch": 3.9972714870395634,
"grad_norm": 0.21024070606119777,
"learning_rate": 5.000128837996827e-06,
"loss": 0.3915,
"num_tokens": 2239685763.0,
"step": 2930
},
{
"epoch": 4.0,
"step": 2932,
"total_flos": 4583117197672448.0,
"train_loss": 0.0,
"train_runtime": 0.0145,
"train_samples_per_second": 25827825.108,
"train_steps_per_second": 201975.78
}
],
"logging_steps": 5,
"max_steps": 2932,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4583117197672448.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}