Taywon's picture
Upload folder using huggingface_hub
a4ba09b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 1570,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 2.05683453977108,
"epoch": 0.064,
"grad_norm": 1.5962693691253662,
"learning_rate": 3.7500000000000005e-06,
"loss": 2.056,
"mean_token_accuracy": 0.527055786550045,
"num_tokens": 78959.0,
"step": 10
},
{
"entropy": 2.2151891469955443,
"epoch": 0.128,
"grad_norm": 1.022516131401062,
"learning_rate": 7.916666666666667e-06,
"loss": 2.0766,
"mean_token_accuracy": 0.5204883277416229,
"num_tokens": 158094.0,
"step": 20
},
{
"entropy": 2.376429131627083,
"epoch": 0.192,
"grad_norm": 0.8220515251159668,
"learning_rate": 1.2083333333333333e-05,
"loss": 1.8902,
"mean_token_accuracy": 0.5391427092254162,
"num_tokens": 239163.0,
"step": 30
},
{
"entropy": 2.285146689414978,
"epoch": 0.256,
"grad_norm": 0.7022648453712463,
"learning_rate": 1.6250000000000002e-05,
"loss": 1.7478,
"mean_token_accuracy": 0.56041978597641,
"num_tokens": 317818.0,
"step": 40
},
{
"entropy": 2.3282038152217863,
"epoch": 0.32,
"grad_norm": 0.5584391951560974,
"learning_rate": 1.9999978697023387e-05,
"loss": 1.7687,
"mean_token_accuracy": 0.5601607479155064,
"num_tokens": 396146.0,
"step": 50
},
{
"entropy": 2.2709642231464384,
"epoch": 0.384,
"grad_norm": 0.5373395085334778,
"learning_rate": 1.999742244965125e-05,
"loss": 1.6913,
"mean_token_accuracy": 0.5693033933639526,
"num_tokens": 474291.0,
"step": 60
},
{
"entropy": 2.2445768117904663,
"epoch": 0.448,
"grad_norm": 0.4558122754096985,
"learning_rate": 1.9990606854864625e-05,
"loss": 1.679,
"mean_token_accuracy": 0.5720810443162918,
"num_tokens": 554739.0,
"step": 70
},
{
"entropy": 2.2270330280065536,
"epoch": 0.512,
"grad_norm": 0.5535369515419006,
"learning_rate": 1.997953481641056e-05,
"loss": 1.6522,
"mean_token_accuracy": 0.574026207625866,
"num_tokens": 633658.0,
"step": 80
},
{
"entropy": 2.2367560386657717,
"epoch": 0.576,
"grad_norm": 0.5366299152374268,
"learning_rate": 1.9964211051470778e-05,
"loss": 1.6955,
"mean_token_accuracy": 0.5699351653456688,
"num_tokens": 712400.0,
"step": 90
},
{
"entropy": 2.21894571185112,
"epoch": 0.64,
"grad_norm": 0.4690150022506714,
"learning_rate": 1.994464208865191e-05,
"loss": 1.7048,
"mean_token_accuracy": 0.5701304003596306,
"num_tokens": 792630.0,
"step": 100
},
{
"entropy": 2.235249537229538,
"epoch": 0.704,
"grad_norm": 0.5834165811538696,
"learning_rate": 1.9920836265204047e-05,
"loss": 1.7032,
"mean_token_accuracy": 0.5705543920397759,
"num_tokens": 872045.0,
"step": 110
},
{
"entropy": 2.2257163137197495,
"epoch": 0.768,
"grad_norm": 0.5584805011749268,
"learning_rate": 1.989280372346868e-05,
"loss": 1.666,
"mean_token_accuracy": 0.5684764981269836,
"num_tokens": 952057.0,
"step": 120
},
{
"entropy": 2.2563431203365325,
"epoch": 0.832,
"grad_norm": 0.5170231461524963,
"learning_rate": 1.986055640655763e-05,
"loss": 1.7134,
"mean_token_accuracy": 0.570289532840252,
"num_tokens": 1029200.0,
"step": 130
},
{
"entropy": 2.2378907680511473,
"epoch": 0.896,
"grad_norm": 0.5027748942375183,
"learning_rate": 1.9824108053264726e-05,
"loss": 1.6719,
"mean_token_accuracy": 0.5730531394481659,
"num_tokens": 1105844.0,
"step": 140
},
{
"entropy": 2.1966699600219726,
"epoch": 0.96,
"grad_norm": 0.5884814262390137,
"learning_rate": 1.9783474192212484e-05,
"loss": 1.6327,
"mean_token_accuracy": 0.5813805550336838,
"num_tokens": 1182935.0,
"step": 150
},
{
"entropy": 2.20564815804765,
"epoch": 1.0192,
"grad_norm": 0.570175290107727,
"learning_rate": 1.9738672135236218e-05,
"loss": 1.6118,
"mean_token_accuracy": 0.582583570802534,
"num_tokens": 1254363.0,
"step": 160
},
{
"entropy": 2.1847074955701826,
"epoch": 1.0832,
"grad_norm": 0.5836730003356934,
"learning_rate": 1.968972097000843e-05,
"loss": 1.6172,
"mean_token_accuracy": 0.5812226444482803,
"num_tokens": 1330281.0,
"step": 170
},
{
"entropy": 2.1814055383205413,
"epoch": 1.1472,
"grad_norm": 0.5746439695358276,
"learning_rate": 1.96366415519066e-05,
"loss": 1.6192,
"mean_token_accuracy": 0.5789176046848297,
"num_tokens": 1409407.0,
"step": 180
},
{
"entropy": 2.2038993716239927,
"epoch": 1.2112,
"grad_norm": 0.5652104616165161,
"learning_rate": 1.957945649512788e-05,
"loss": 1.6166,
"mean_token_accuracy": 0.5809548154473305,
"num_tokens": 1489034.0,
"step": 190
},
{
"entropy": 2.173789343237877,
"epoch": 1.2752,
"grad_norm": 0.6653291583061218,
"learning_rate": 1.951819016305442e-05,
"loss": 1.62,
"mean_token_accuracy": 0.5827470317482948,
"num_tokens": 1568549.0,
"step": 200
},
{
"entropy": 2.1907752990722655,
"epoch": 1.3392,
"grad_norm": 0.7024573087692261,
"learning_rate": 1.9452868657873513e-05,
"loss": 1.6397,
"mean_token_accuracy": 0.5796025812625885,
"num_tokens": 1647404.0,
"step": 210
},
{
"entropy": 2.189376249909401,
"epoch": 1.4032,
"grad_norm": 0.5727422833442688,
"learning_rate": 1.9383519809456862e-05,
"loss": 1.6349,
"mean_token_accuracy": 0.5815459445118905,
"num_tokens": 1728421.0,
"step": 220
},
{
"entropy": 2.209022229909897,
"epoch": 1.4672,
"grad_norm": 0.6421232223510742,
"learning_rate": 1.931017316350384e-05,
"loss": 1.6425,
"mean_token_accuracy": 0.5790404245257378,
"num_tokens": 1806891.0,
"step": 230
},
{
"entropy": 2.2337595343589784,
"epoch": 1.5312000000000001,
"grad_norm": 0.6296209692955017,
"learning_rate": 1.9232859968953702e-05,
"loss": 1.624,
"mean_token_accuracy": 0.5814317353069782,
"num_tokens": 1883100.0,
"step": 240
},
{
"entropy": 2.205833575129509,
"epoch": 1.5952,
"grad_norm": 0.6371021866798401,
"learning_rate": 1.9151613164672136e-05,
"loss": 1.6284,
"mean_token_accuracy": 0.5819905593991279,
"num_tokens": 1961317.0,
"step": 250
},
{
"entropy": 2.205822005867958,
"epoch": 1.6592,
"grad_norm": 0.6950616836547852,
"learning_rate": 1.9066467365417844e-05,
"loss": 1.6374,
"mean_token_accuracy": 0.5760326236486435,
"num_tokens": 2042881.0,
"step": 260
},
{
"entropy": 2.2163637399673464,
"epoch": 1.7231999999999998,
"grad_norm": 0.7801616191864014,
"learning_rate": 1.8977458847095117e-05,
"loss": 1.663,
"mean_token_accuracy": 0.5744953289628029,
"num_tokens": 2121403.0,
"step": 270
},
{
"entropy": 2.199243775010109,
"epoch": 1.7872,
"grad_norm": 0.6671239733695984,
"learning_rate": 1.888462553129867e-05,
"loss": 1.6456,
"mean_token_accuracy": 0.579181258380413,
"num_tokens": 2200908.0,
"step": 280
},
{
"entropy": 2.214826595783234,
"epoch": 1.8512,
"grad_norm": 0.7415009140968323,
"learning_rate": 1.878800696915737e-05,
"loss": 1.6113,
"mean_token_accuracy": 0.5840038731694221,
"num_tokens": 2278414.0,
"step": 290
},
{
"entropy": 2.187604659795761,
"epoch": 1.9152,
"grad_norm": 0.662319540977478,
"learning_rate": 1.868764432448369e-05,
"loss": 1.6182,
"mean_token_accuracy": 0.580166706442833,
"num_tokens": 2355826.0,
"step": 300
},
{
"entropy": 2.2184703826904295,
"epoch": 1.9792,
"grad_norm": 0.7123025059700012,
"learning_rate": 1.8583580356236065e-05,
"loss": 1.655,
"mean_token_accuracy": 0.5762834578752518,
"num_tokens": 2434933.0,
"step": 310
},
{
"entropy": 2.1887036239778674,
"epoch": 2.0384,
"grad_norm": 0.6846157312393188,
"learning_rate": 1.8475859400301708e-05,
"loss": 1.5935,
"mean_token_accuracy": 0.5881956976813238,
"num_tokens": 2507166.0,
"step": 320
},
{
"entropy": 2.102977079153061,
"epoch": 2.1024,
"grad_norm": 0.7967628240585327,
"learning_rate": 1.8364527350607527e-05,
"loss": 1.5405,
"mean_token_accuracy": 0.5946892097592353,
"num_tokens": 2584298.0,
"step": 330
},
{
"entropy": 2.118516767024994,
"epoch": 2.1664,
"grad_norm": 0.7417224645614624,
"learning_rate": 1.824963163956726e-05,
"loss": 1.5727,
"mean_token_accuracy": 0.5870080485939979,
"num_tokens": 2663601.0,
"step": 340
},
{
"entropy": 2.104418155550957,
"epoch": 2.2304,
"grad_norm": 0.7956721782684326,
"learning_rate": 1.8131221217873175e-05,
"loss": 1.5575,
"mean_token_accuracy": 0.5936456203460694,
"num_tokens": 2744783.0,
"step": 350
},
{
"entropy": 2.129578319191933,
"epoch": 2.2944,
"grad_norm": 0.769292950630188,
"learning_rate": 1.8009346533640877e-05,
"loss": 1.5878,
"mean_token_accuracy": 0.5841517195105552,
"num_tokens": 2823023.0,
"step": 360
},
{
"entropy": 2.097687366604805,
"epoch": 2.3584,
"grad_norm": 0.9341740608215332,
"learning_rate": 1.7884059510916167e-05,
"loss": 1.5346,
"mean_token_accuracy": 0.599460557103157,
"num_tokens": 2899598.0,
"step": 370
},
{
"entropy": 2.151599031686783,
"epoch": 2.4224,
"grad_norm": 0.8752340078353882,
"learning_rate": 1.7755413527553087e-05,
"loss": 1.5984,
"mean_token_accuracy": 0.585393351316452,
"num_tokens": 2978519.0,
"step": 380
},
{
"entropy": 2.1223404884338377,
"epoch": 2.4864,
"grad_norm": 1.0296390056610107,
"learning_rate": 1.7623463392472574e-05,
"loss": 1.5232,
"mean_token_accuracy": 0.595654422044754,
"num_tokens": 3055327.0,
"step": 390
},
{
"entropy": 2.16276493370533,
"epoch": 2.5504,
"grad_norm": 0.9905762672424316,
"learning_rate": 1.748826532231142e-05,
"loss": 1.6049,
"mean_token_accuracy": 0.5822189599275589,
"num_tokens": 3135348.0,
"step": 400
},
{
"entropy": 2.127479985356331,
"epoch": 2.6144,
"grad_norm": 0.851375162601471,
"learning_rate": 1.7349876917471474e-05,
"loss": 1.5842,
"mean_token_accuracy": 0.5855211839079857,
"num_tokens": 3213122.0,
"step": 410
},
{
"entropy": 2.167752879858017,
"epoch": 2.6784,
"grad_norm": 0.975143313407898,
"learning_rate": 1.7208357137579318e-05,
"loss": 1.5918,
"mean_token_accuracy": 0.5839722648262977,
"num_tokens": 3289583.0,
"step": 420
},
{
"entropy": 2.127084198594093,
"epoch": 2.7424,
"grad_norm": 0.8077936768531799,
"learning_rate": 1.7063766276366814e-05,
"loss": 1.5916,
"mean_token_accuracy": 0.5900941833853721,
"num_tokens": 3369740.0,
"step": 430
},
{
"entropy": 2.1315969794988634,
"epoch": 2.8064,
"grad_norm": 0.9403624534606934,
"learning_rate": 1.6916165935983323e-05,
"loss": 1.5713,
"mean_token_accuracy": 0.5892721861600876,
"num_tokens": 3448328.0,
"step": 440
},
{
"entropy": 2.130605939030647,
"epoch": 2.8704,
"grad_norm": 0.8252040147781372,
"learning_rate": 1.676561900075041e-05,
"loss": 1.6003,
"mean_token_accuracy": 0.5845118075609207,
"num_tokens": 3529853.0,
"step": 450
},
{
"entropy": 2.112012493610382,
"epoch": 2.9344,
"grad_norm": 0.9267668724060059,
"learning_rate": 1.6612189610370336e-05,
"loss": 1.5796,
"mean_token_accuracy": 0.5887707889080047,
"num_tokens": 3610922.0,
"step": 460
},
{
"entropy": 2.100590059161186,
"epoch": 2.9984,
"grad_norm": 0.8996879458427429,
"learning_rate": 1.6455943132599698e-05,
"loss": 1.5483,
"mean_token_accuracy": 0.5934251204133034,
"num_tokens": 3688391.0,
"step": 470
},
{
"entropy": 2.1115864160898568,
"epoch": 3.0576,
"grad_norm": 1.097270131111145,
"learning_rate": 1.6296946135399835e-05,
"loss": 1.5506,
"mean_token_accuracy": 0.592829834770512,
"num_tokens": 3758747.0,
"step": 480
},
{
"entropy": 2.0610430628061294,
"epoch": 3.1216,
"grad_norm": 1.176645278930664,
"learning_rate": 1.613526635857591e-05,
"loss": 1.4461,
"mean_token_accuracy": 0.6111307457089424,
"num_tokens": 3834689.0,
"step": 490
},
{
"entropy": 2.0154007196426393,
"epoch": 3.1856,
"grad_norm": 1.1834276914596558,
"learning_rate": 1.5970972684916754e-05,
"loss": 1.4852,
"mean_token_accuracy": 0.6026980608701706,
"num_tokens": 3916450.0,
"step": 500
},
{
"entropy": 2.0441433399915696,
"epoch": 3.2496,
"grad_norm": 1.159286379814148,
"learning_rate": 1.5804135110847708e-05,
"loss": 1.4978,
"mean_token_accuracy": 0.6042912915349007,
"num_tokens": 3998511.0,
"step": 510
},
{
"entropy": 2.0493109285831452,
"epoch": 3.3136,
"grad_norm": 1.2141708135604858,
"learning_rate": 1.5634824716609037e-05,
"loss": 1.5018,
"mean_token_accuracy": 0.5995921581983567,
"num_tokens": 4077676.0,
"step": 520
},
{
"entropy": 2.0533218771219253,
"epoch": 3.3776,
"grad_norm": 1.1630637645721436,
"learning_rate": 1.5463113635972577e-05,
"loss": 1.499,
"mean_token_accuracy": 0.6046154126524925,
"num_tokens": 4155264.0,
"step": 530
},
{
"entropy": 2.0600034058094026,
"epoch": 3.4416,
"grad_norm": 1.2523504495620728,
"learning_rate": 1.528907502550954e-05,
"loss": 1.521,
"mean_token_accuracy": 0.6000443026423454,
"num_tokens": 4233655.0,
"step": 540
},
{
"entropy": 2.0414596855640412,
"epoch": 3.5056000000000003,
"grad_norm": 1.3990252017974854,
"learning_rate": 1.5112783033422547e-05,
"loss": 1.4899,
"mean_token_accuracy": 0.6026965886354446,
"num_tokens": 4311644.0,
"step": 550
},
{
"entropy": 2.061043033003807,
"epoch": 3.5696,
"grad_norm": 1.1884260177612305,
"learning_rate": 1.4934312767955193e-05,
"loss": 1.5143,
"mean_token_accuracy": 0.5981319859623909,
"num_tokens": 4390933.0,
"step": 560
},
{
"entropy": 2.034099668264389,
"epoch": 3.6336,
"grad_norm": 1.2996599674224854,
"learning_rate": 1.4753740265392595e-05,
"loss": 1.4953,
"mean_token_accuracy": 0.6029247522354126,
"num_tokens": 4470462.0,
"step": 570
},
{
"entropy": 2.0379767954349517,
"epoch": 3.6976,
"grad_norm": 1.2936193943023682,
"learning_rate": 1.4571142457666536e-05,
"loss": 1.4965,
"mean_token_accuracy": 0.6041712030768395,
"num_tokens": 4549236.0,
"step": 580
},
{
"entropy": 2.040063351392746,
"epoch": 3.7616,
"grad_norm": 1.5094560384750366,
"learning_rate": 1.4386597139579041e-05,
"loss": 1.4979,
"mean_token_accuracy": 0.6051288455724716,
"num_tokens": 4628758.0,
"step": 590
},
{
"entropy": 1.9998936265707017,
"epoch": 3.8256,
"grad_norm": 1.3166426420211792,
"learning_rate": 1.4200182935658327e-05,
"loss": 1.459,
"mean_token_accuracy": 0.6084850415587425,
"num_tokens": 4708526.0,
"step": 600
},
{
"entropy": 2.0041965901851655,
"epoch": 3.8895999999999997,
"grad_norm": 1.2710400819778442,
"learning_rate": 1.4011979266661235e-05,
"loss": 1.4831,
"mean_token_accuracy": 0.6057328775525093,
"num_tokens": 4788733.0,
"step": 610
},
{
"entropy": 2.0265558779239656,
"epoch": 3.9536,
"grad_norm": 1.4318969249725342,
"learning_rate": 1.3822066315736477e-05,
"loss": 1.4966,
"mean_token_accuracy": 0.5994595810770988,
"num_tokens": 4866451.0,
"step": 620
},
{
"entropy": 2.0692459924800977,
"epoch": 4.0128,
"grad_norm": 1.2546013593673706,
"learning_rate": 1.363052499426302e-05,
"loss": 1.503,
"mean_token_accuracy": 0.6039850309088424,
"num_tokens": 4936715.0,
"step": 630
},
{
"entropy": 1.9788923293352128,
"epoch": 4.0768,
"grad_norm": 1.416927456855774,
"learning_rate": 1.3437436907378225e-05,
"loss": 1.4248,
"mean_token_accuracy": 0.6142558038234711,
"num_tokens": 5016713.0,
"step": 640
},
{
"entropy": 1.9646029412746429,
"epoch": 4.1408,
"grad_norm": 1.5146726369857788,
"learning_rate": 1.3242884319210463e-05,
"loss": 1.3875,
"mean_token_accuracy": 0.624424883723259,
"num_tokens": 5096513.0,
"step": 650
},
{
"entropy": 1.93471617102623,
"epoch": 4.2048,
"grad_norm": 1.5090768337249756,
"learning_rate": 1.3046950117830888e-05,
"loss": 1.3884,
"mean_token_accuracy": 0.6222448632121086,
"num_tokens": 5177075.0,
"step": 660
},
{
"entropy": 2.002266028523445,
"epoch": 4.2688,
"grad_norm": 1.74358332157135,
"learning_rate": 1.2849717779939439e-05,
"loss": 1.4062,
"mean_token_accuracy": 0.6180147424340248,
"num_tokens": 5252902.0,
"step": 670
},
{
"entropy": 1.9397415190935134,
"epoch": 4.3328,
"grad_norm": 1.774728775024414,
"learning_rate": 1.2651271335300063e-05,
"loss": 1.3933,
"mean_token_accuracy": 0.626343595981598,
"num_tokens": 5331448.0,
"step": 680
},
{
"entropy": 1.9571841150522231,
"epoch": 4.3968,
"grad_norm": 1.80965256690979,
"learning_rate": 1.2451695330940268e-05,
"loss": 1.4205,
"mean_token_accuracy": 0.6187710732221603,
"num_tokens": 5410857.0,
"step": 690
},
{
"entropy": 1.9691186994314194,
"epoch": 4.4608,
"grad_norm": 1.5400609970092773,
"learning_rate": 1.2251074795130339e-05,
"loss": 1.4123,
"mean_token_accuracy": 0.614769059419632,
"num_tokens": 5488867.0,
"step": 700
},
{
"entropy": 1.9635825514793397,
"epoch": 4.5248,
"grad_norm": 1.467608094215393,
"learning_rate": 1.2049495201157489e-05,
"loss": 1.4228,
"mean_token_accuracy": 0.6202724784612655,
"num_tokens": 5567515.0,
"step": 710
},
{
"entropy": 1.9384470194578172,
"epoch": 4.5888,
"grad_norm": 1.652387022972107,
"learning_rate": 1.1847042430910451e-05,
"loss": 1.4273,
"mean_token_accuracy": 0.6190450325608253,
"num_tokens": 5648858.0,
"step": 720
},
{
"entropy": 1.9911590039730072,
"epoch": 4.6528,
"grad_norm": 1.7492380142211914,
"learning_rate": 1.1643802738289955e-05,
"loss": 1.4776,
"mean_token_accuracy": 0.6073927089571953,
"num_tokens": 5725459.0,
"step": 730
},
{
"entropy": 1.9724233269691467,
"epoch": 4.7168,
"grad_norm": 1.709669828414917,
"learning_rate": 1.1439862712460721e-05,
"loss": 1.4217,
"mean_token_accuracy": 0.6184087961912155,
"num_tokens": 5801601.0,
"step": 740
},
{
"entropy": 1.9725236982107162,
"epoch": 4.7808,
"grad_norm": 1.7469470500946045,
"learning_rate": 1.1235309240960621e-05,
"loss": 1.405,
"mean_token_accuracy": 0.6196158319711685,
"num_tokens": 5881107.0,
"step": 750
},
{
"entropy": 1.9484833419322967,
"epoch": 4.8448,
"grad_norm": 1.532373309135437,
"learning_rate": 1.1030229472682719e-05,
"loss": 1.4155,
"mean_token_accuracy": 0.611663281917572,
"num_tokens": 5960375.0,
"step": 760
},
{
"entropy": 1.9964754343032838,
"epoch": 4.9088,
"grad_norm": 1.7157669067382812,
"learning_rate": 1.0824710780745954e-05,
"loss": 1.4295,
"mean_token_accuracy": 0.6131752103567123,
"num_tokens": 6038267.0,
"step": 770
},
{
"entropy": 1.9598666340112687,
"epoch": 4.9728,
"grad_norm": 1.9844586849212646,
"learning_rate": 1.06188407252703e-05,
"loss": 1.397,
"mean_token_accuracy": 0.6226776748895645,
"num_tokens": 6114749.0,
"step": 780
},
{
"entropy": 1.9227982276194804,
"epoch": 5.032,
"grad_norm": 1.8960447311401367,
"learning_rate": 1.0412707016072254e-05,
"loss": 1.3649,
"mean_token_accuracy": 0.6269845414806057,
"num_tokens": 6190567.0,
"step": 790
},
{
"entropy": 1.9008578658103943,
"epoch": 5.096,
"grad_norm": 2.1205599308013916,
"learning_rate": 1.0206397475296548e-05,
"loss": 1.3582,
"mean_token_accuracy": 0.6292989999055862,
"num_tokens": 6269285.0,
"step": 800
},
{
"entropy": 1.9224162876605988,
"epoch": 5.16,
"grad_norm": 2.0454013347625732,
"learning_rate": 1e-05,
"loss": 1.3349,
"mean_token_accuracy": 0.6315066292881966,
"num_tokens": 6345352.0,
"step": 810
},
{
"entropy": 1.9340467154979706,
"epoch": 5.224,
"grad_norm": 2.2607693672180176,
"learning_rate": 9.793602524703456e-06,
"loss": 1.359,
"mean_token_accuracy": 0.6322078078985214,
"num_tokens": 6422524.0,
"step": 820
},
{
"entropy": 1.9296668291091919,
"epoch": 5.288,
"grad_norm": 2.1245901584625244,
"learning_rate": 9.58729298392775e-06,
"loss": 1.3672,
"mean_token_accuracy": 0.6282135233283043,
"num_tokens": 6500128.0,
"step": 830
},
{
"entropy": 1.9272812247276305,
"epoch": 5.352,
"grad_norm": 1.965820550918579,
"learning_rate": 9.381159274729704e-06,
"loss": 1.3786,
"mean_token_accuracy": 0.6249860525131226,
"num_tokens": 6578766.0,
"step": 840
},
{
"entropy": 1.904970219731331,
"epoch": 5.416,
"grad_norm": 1.9188759326934814,
"learning_rate": 9.175289219254051e-06,
"loss": 1.3418,
"mean_token_accuracy": 0.6325456693768501,
"num_tokens": 6658732.0,
"step": 850
},
{
"entropy": 1.8833305448293687,
"epoch": 5.48,
"grad_norm": 1.9675428867340088,
"learning_rate": 8.969770527317283e-06,
"loss": 1.3274,
"mean_token_accuracy": 0.6377805054187775,
"num_tokens": 6738683.0,
"step": 860
},
{
"entropy": 1.8806802958250046,
"epoch": 5.5440000000000005,
"grad_norm": 1.8849304914474487,
"learning_rate": 8.764690759039382e-06,
"loss": 1.3109,
"mean_token_accuracy": 0.636364534497261,
"num_tokens": 6817786.0,
"step": 870
},
{
"entropy": 1.8846195042133331,
"epoch": 5.608,
"grad_norm": 2.050208330154419,
"learning_rate": 8.56013728753928e-06,
"loss": 1.3449,
"mean_token_accuracy": 0.6316975012421608,
"num_tokens": 6896222.0,
"step": 880
},
{
"entropy": 1.88524529337883,
"epoch": 5.672,
"grad_norm": 2.1371288299560547,
"learning_rate": 8.356197261710048e-06,
"loss": 1.346,
"mean_token_accuracy": 0.633928644657135,
"num_tokens": 6976885.0,
"step": 890
},
{
"entropy": 1.9162244260311128,
"epoch": 5.736,
"grad_norm": 1.9879032373428345,
"learning_rate": 8.152957569089552e-06,
"loss": 1.3486,
"mean_token_accuracy": 0.6311523199081421,
"num_tokens": 7053473.0,
"step": 900
},
{
"entropy": 1.89161317050457,
"epoch": 5.8,
"grad_norm": 2.2934179306030273,
"learning_rate": 7.950504798842513e-06,
"loss": 1.3699,
"mean_token_accuracy": 0.6269390240311623,
"num_tokens": 7133137.0,
"step": 910
},
{
"entropy": 1.888116827607155,
"epoch": 5.864,
"grad_norm": 1.769087791442871,
"learning_rate": 7.748925204869667e-06,
"loss": 1.3756,
"mean_token_accuracy": 0.6285945609211921,
"num_tokens": 7213693.0,
"step": 920
},
{
"entropy": 1.89390210211277,
"epoch": 5.928,
"grad_norm": 2.2577364444732666,
"learning_rate": 7.548304669059735e-06,
"loss": 1.3396,
"mean_token_accuracy": 0.6290415957570076,
"num_tokens": 7291999.0,
"step": 930
},
{
"entropy": 1.8755547761917115,
"epoch": 5.992,
"grad_norm": 2.080371618270874,
"learning_rate": 7.348728664699939e-06,
"loss": 1.3305,
"mean_token_accuracy": 0.6322756335139275,
"num_tokens": 7370138.0,
"step": 940
},
{
"entropy": 1.8396991845723745,
"epoch": 6.0512,
"grad_norm": 2.222177028656006,
"learning_rate": 7.150282220060564e-06,
"loss": 1.2782,
"mean_token_accuracy": 0.6437820018948736,
"num_tokens": 7444764.0,
"step": 950
},
{
"entropy": 1.864711531996727,
"epoch": 6.1152,
"grad_norm": 2.236663579940796,
"learning_rate": 6.9530498821691165e-06,
"loss": 1.342,
"mean_token_accuracy": 0.6400286257266998,
"num_tokens": 7523012.0,
"step": 960
},
{
"entropy": 1.8462383985519408,
"epoch": 6.1792,
"grad_norm": 2.438649892807007,
"learning_rate": 6.757115680789539e-06,
"loss": 1.2769,
"mean_token_accuracy": 0.6437345445156097,
"num_tokens": 7602451.0,
"step": 970
},
{
"entropy": 1.8425735771656035,
"epoch": 6.2432,
"grad_norm": 2.306880235671997,
"learning_rate": 6.562563092621776e-06,
"loss": 1.309,
"mean_token_accuracy": 0.6463457986712455,
"num_tokens": 7681972.0,
"step": 980
},
{
"entropy": 1.8139673799276352,
"epoch": 6.3072,
"grad_norm": 2.286114454269409,
"learning_rate": 6.369475005736984e-06,
"loss": 1.2748,
"mean_token_accuracy": 0.6487143859267235,
"num_tokens": 7762845.0,
"step": 990
},
{
"entropy": 1.8660429507493972,
"epoch": 6.3712,
"grad_norm": 2.421706199645996,
"learning_rate": 6.177933684263524e-06,
"loss": 1.2964,
"mean_token_accuracy": 0.6455973491072655,
"num_tokens": 7839552.0,
"step": 1000
},
{
"entropy": 1.8517659038305283,
"epoch": 6.4352,
"grad_norm": 2.3891334533691406,
"learning_rate": 5.988020733338767e-06,
"loss": 1.2893,
"mean_token_accuracy": 0.6442387655377388,
"num_tokens": 7915996.0,
"step": 1010
},
{
"entropy": 1.8547363132238388,
"epoch": 6.4992,
"grad_norm": 2.26686429977417,
"learning_rate": 5.7998170643416795e-06,
"loss": 1.2973,
"mean_token_accuracy": 0.6435917019844055,
"num_tokens": 7995119.0,
"step": 1020
},
{
"entropy": 1.8365773737430573,
"epoch": 6.5632,
"grad_norm": 2.1454896926879883,
"learning_rate": 5.613402860420962e-06,
"loss": 1.2744,
"mean_token_accuracy": 0.6410152271389962,
"num_tokens": 8075306.0,
"step": 1030
},
{
"entropy": 1.8936803489923477,
"epoch": 6.6272,
"grad_norm": 2.5226423740386963,
"learning_rate": 5.428857542333465e-06,
"loss": 1.3225,
"mean_token_accuracy": 0.6396260514855385,
"num_tokens": 8152449.0,
"step": 1040
},
{
"entropy": 1.8555004209280015,
"epoch": 6.6912,
"grad_norm": 2.216014862060547,
"learning_rate": 5.246259734607411e-06,
"loss": 1.299,
"mean_token_accuracy": 0.641279113292694,
"num_tokens": 8231904.0,
"step": 1050
},
{
"entropy": 1.8588679373264312,
"epoch": 6.7552,
"grad_norm": 2.4265236854553223,
"learning_rate": 5.065687232044811e-06,
"loss": 1.3026,
"mean_token_accuracy": 0.6363563358783721,
"num_tokens": 8310755.0,
"step": 1060
},
{
"entropy": 1.8318012267351151,
"epoch": 6.8192,
"grad_norm": 2.2089412212371826,
"learning_rate": 4.887216966577458e-06,
"loss": 1.2583,
"mean_token_accuracy": 0.6502064153552055,
"num_tokens": 8390161.0,
"step": 1070
},
{
"entropy": 1.8765722244977951,
"epoch": 6.8832,
"grad_norm": 2.3233554363250732,
"learning_rate": 4.710924974490463e-06,
"loss": 1.3223,
"mean_token_accuracy": 0.6393219083547592,
"num_tokens": 8469078.0,
"step": 1080
},
{
"entropy": 1.8413788318634032,
"epoch": 6.9472000000000005,
"grad_norm": 2.321904420852661,
"learning_rate": 4.536886364027428e-06,
"loss": 1.272,
"mean_token_accuracy": 0.647525629401207,
"num_tokens": 8547873.0,
"step": 1090
},
{
"entropy": 1.8666607818088017,
"epoch": 7.0064,
"grad_norm": 2.1004791259765625,
"learning_rate": 4.365175283390968e-06,
"loss": 1.2721,
"mean_token_accuracy": 0.6479364424138456,
"num_tokens": 8619109.0,
"step": 1100
},
{
"entropy": 1.8333647519350051,
"epoch": 7.0704,
"grad_norm": 2.9210190773010254,
"learning_rate": 4.195864889152295e-06,
"loss": 1.1833,
"mean_token_accuracy": 0.6699477419257164,
"num_tokens": 8692475.0,
"step": 1110
},
{
"entropy": 1.8425445258617401,
"epoch": 7.1344,
"grad_norm": 2.3149521350860596,
"learning_rate": 4.029027315083251e-06,
"loss": 1.2707,
"mean_token_accuracy": 0.650185227394104,
"num_tokens": 8770456.0,
"step": 1120
},
{
"entropy": 1.8121359765529632,
"epoch": 7.1984,
"grad_norm": 2.6502795219421387,
"learning_rate": 3.864733641424093e-06,
"loss": 1.2383,
"mean_token_accuracy": 0.6547705471515656,
"num_tokens": 8851214.0,
"step": 1130
},
{
"entropy": 1.802490884065628,
"epoch": 7.2624,
"grad_norm": 2.227534770965576,
"learning_rate": 3.703053864600169e-06,
"loss": 1.2603,
"mean_token_accuracy": 0.6489648431539535,
"num_tokens": 8932363.0,
"step": 1140
},
{
"entropy": 1.8214709132909774,
"epoch": 7.3264,
"grad_norm": 2.5923874378204346,
"learning_rate": 3.544056867400306e-06,
"loss": 1.248,
"mean_token_accuracy": 0.651621387898922,
"num_tokens": 9011734.0,
"step": 1150
},
{
"entropy": 1.826240959763527,
"epoch": 7.3904,
"grad_norm": 2.67551589012146,
"learning_rate": 3.3878103896296677e-06,
"loss": 1.2488,
"mean_token_accuracy": 0.6530374586582184,
"num_tokens": 9090277.0,
"step": 1160
},
{
"entropy": 1.837952870130539,
"epoch": 7.4544,
"grad_norm": 2.2191765308380127,
"learning_rate": 3.2343809992495945e-06,
"loss": 1.2704,
"mean_token_accuracy": 0.6503957703709602,
"num_tokens": 9168093.0,
"step": 1170
},
{
"entropy": 1.8135560542345046,
"epoch": 7.5184,
"grad_norm": 2.5211071968078613,
"learning_rate": 3.083834064016682e-06,
"loss": 1.2212,
"mean_token_accuracy": 0.6587097644805908,
"num_tokens": 9247777.0,
"step": 1180
},
{
"entropy": 1.8237973660230637,
"epoch": 7.5824,
"grad_norm": 2.6236841678619385,
"learning_rate": 2.9362337236331884e-06,
"loss": 1.2604,
"mean_token_accuracy": 0.6501624628901481,
"num_tokens": 9325367.0,
"step": 1190
},
{
"entropy": 1.836614164710045,
"epoch": 7.6464,
"grad_norm": 2.726731777191162,
"learning_rate": 2.791642862420686e-06,
"loss": 1.2554,
"mean_token_accuracy": 0.6520631939172745,
"num_tokens": 9403641.0,
"step": 1200
},
{
"entropy": 1.8044064462184906,
"epoch": 7.7104,
"grad_norm": 2.4943737983703613,
"learning_rate": 2.6501230825285294e-06,
"loss": 1.2519,
"mean_token_accuracy": 0.6524736672639847,
"num_tokens": 9484075.0,
"step": 1210
},
{
"entropy": 1.8258908241987228,
"epoch": 7.7744,
"grad_norm": 2.4426612854003906,
"learning_rate": 2.5117346776885843e-06,
"loss": 1.251,
"mean_token_accuracy": 0.6484281331300735,
"num_tokens": 9561148.0,
"step": 1220
},
{
"entropy": 1.8062447488307953,
"epoch": 7.8384,
"grad_norm": 2.465646266937256,
"learning_rate": 2.3765366075274287e-06,
"loss": 1.2662,
"mean_token_accuracy": 0.6492940753698349,
"num_tokens": 9642108.0,
"step": 1230
},
{
"entropy": 1.8293108910322189,
"epoch": 7.9024,
"grad_norm": 2.4230668544769287,
"learning_rate": 2.2445864724469146e-06,
"loss": 1.2625,
"mean_token_accuracy": 0.6592240884900094,
"num_tokens": 9719660.0,
"step": 1240
},
{
"entropy": 1.837513843178749,
"epoch": 7.9664,
"grad_norm": 2.7502171993255615,
"learning_rate": 2.1159404890838365e-06,
"loss": 1.2677,
"mean_token_accuracy": 0.6493206784129143,
"num_tokens": 9797593.0,
"step": 1250
},
{
"entropy": 1.8162316245001715,
"epoch": 8.0256,
"grad_norm": 2.5199058055877686,
"learning_rate": 1.990653466359125e-06,
"loss": 1.2293,
"mean_token_accuracy": 0.656300467413825,
"num_tokens": 9871177.0,
"step": 1260
},
{
"entropy": 1.780314788222313,
"epoch": 8.0896,
"grad_norm": 2.5237162113189697,
"learning_rate": 1.8687787821268255e-06,
"loss": 1.1791,
"mean_token_accuracy": 0.6675050809979439,
"num_tokens": 9949391.0,
"step": 1270
},
{
"entropy": 1.779639583826065,
"epoch": 8.1536,
"grad_norm": 2.4559428691864014,
"learning_rate": 1.7503683604327426e-06,
"loss": 1.2177,
"mean_token_accuracy": 0.6600575730204582,
"num_tokens": 10030182.0,
"step": 1280
},
{
"entropy": 1.7865025967359542,
"epoch": 8.2176,
"grad_norm": 2.9508230686187744,
"learning_rate": 1.6354726493924745e-06,
"loss": 1.1937,
"mean_token_accuracy": 0.6630557537078857,
"num_tokens": 10107960.0,
"step": 1290
},
{
"entropy": 1.8122670024633407,
"epoch": 8.2816,
"grad_norm": 2.6917898654937744,
"learning_rate": 1.5241405996982928e-06,
"loss": 1.2319,
"mean_token_accuracy": 0.6598842918872834,
"num_tokens": 10185524.0,
"step": 1300
},
{
"entropy": 1.806730917096138,
"epoch": 8.3456,
"grad_norm": 2.7887086868286133,
"learning_rate": 1.4164196437639355e-06,
"loss": 1.25,
"mean_token_accuracy": 0.6578737393021583,
"num_tokens": 10265123.0,
"step": 1310
},
{
"entropy": 1.8156007081270218,
"epoch": 8.4096,
"grad_norm": 2.9965310096740723,
"learning_rate": 1.3123556755163114e-06,
"loss": 1.234,
"mean_token_accuracy": 0.6579165816307068,
"num_tokens": 10342205.0,
"step": 1320
},
{
"entropy": 1.8044028550386428,
"epoch": 8.4736,
"grad_norm": 2.9466843605041504,
"learning_rate": 1.2119930308426264e-06,
"loss": 1.2423,
"mean_token_accuracy": 0.6527451828122139,
"num_tokens": 10420603.0,
"step": 1330
},
{
"entropy": 1.8250535994768142,
"epoch": 8.5376,
"grad_norm": 2.9452784061431885,
"learning_rate": 1.1153744687013313e-06,
"loss": 1.258,
"mean_token_accuracy": 0.6589037463068962,
"num_tokens": 10499049.0,
"step": 1340
},
{
"entropy": 1.7990799486637115,
"epoch": 8.6016,
"grad_norm": 2.6469309329986572,
"learning_rate": 1.0225411529048857e-06,
"loss": 1.2415,
"mean_token_accuracy": 0.6555879130959511,
"num_tokens": 10578178.0,
"step": 1350
},
{
"entropy": 1.7638877242803574,
"epoch": 8.6656,
"grad_norm": 2.77990460395813,
"learning_rate": 9.33532634582156e-07,
"loss": 1.2143,
"mean_token_accuracy": 0.6589834168553352,
"num_tokens": 10659018.0,
"step": 1360
},
{
"entropy": 1.8123771637678145,
"epoch": 8.7296,
"grad_norm": 3.1158993244171143,
"learning_rate": 8.483868353278657e-07,
"loss": 1.2358,
"mean_token_accuracy": 0.6561313390731811,
"num_tokens": 10736582.0,
"step": 1370
},
{
"entropy": 1.8054670304059983,
"epoch": 8.7936,
"grad_norm": 2.915422201156616,
"learning_rate": 7.671400310462984e-07,
"loss": 1.2089,
"mean_token_accuracy": 0.6610309720039368,
"num_tokens": 10814534.0,
"step": 1380
},
{
"entropy": 1.800497230887413,
"epoch": 8.8576,
"grad_norm": 2.7816338539123535,
"learning_rate": 6.898268364961591e-07,
"loss": 1.2227,
"mean_token_accuracy": 0.6584793984889984,
"num_tokens": 10893484.0,
"step": 1390
},
{
"entropy": 1.7850348353385925,
"epoch": 8.9216,
"grad_norm": 2.569054126739502,
"learning_rate": 6.164801905431394e-07,
"loss": 1.2242,
"mean_token_accuracy": 0.6574000924825668,
"num_tokens": 10973818.0,
"step": 1400
},
{
"entropy": 1.795585972070694,
"epoch": 8.9856,
"grad_norm": 2.6529977321624756,
"learning_rate": 5.471313421264879e-07,
"loss": 1.2127,
"mean_token_accuracy": 0.6600923746824264,
"num_tokens": 11051396.0,
"step": 1410
},
{
"entropy": 1.8172799606580992,
"epoch": 9.0448,
"grad_norm": 2.719399929046631,
"learning_rate": 4.818098369455793e-07,
"loss": 1.2758,
"mean_token_accuracy": 0.6563853702029666,
"num_tokens": 11124338.0,
"step": 1420
},
{
"entropy": 1.7982712090015411,
"epoch": 9.1088,
"grad_norm": 2.516369581222534,
"learning_rate": 4.20543504872124e-07,
"loss": 1.2054,
"mean_token_accuracy": 0.6637166649103164,
"num_tokens": 11202233.0,
"step": 1430
},
{
"entropy": 1.795827680826187,
"epoch": 9.1728,
"grad_norm": 3.1275811195373535,
"learning_rate": 3.633584480934016e-07,
"loss": 1.1907,
"mean_token_accuracy": 0.6667785882949829,
"num_tokens": 11279587.0,
"step": 1440
},
{
"entropy": 1.7738620430231093,
"epoch": 9.2368,
"grad_norm": 2.6204917430877686,
"learning_rate": 3.1027902999157146e-07,
"loss": 1.2156,
"mean_token_accuracy": 0.6609065368771553,
"num_tokens": 11360243.0,
"step": 1450
},
{
"entropy": 1.7866268098354339,
"epoch": 9.3008,
"grad_norm": 2.763274908065796,
"learning_rate": 2.61327864763784e-07,
"loss": 1.2109,
"mean_token_accuracy": 0.6604277700185776,
"num_tokens": 11440689.0,
"step": 1460
},
{
"entropy": 1.7874858051538467,
"epoch": 9.3648,
"grad_norm": 2.725693464279175,
"learning_rate": 2.1652580778751875e-07,
"loss": 1.2379,
"mean_token_accuracy": 0.6640482068061828,
"num_tokens": 11520425.0,
"step": 1470
},
{
"entropy": 1.7999387830495834,
"epoch": 9.4288,
"grad_norm": 2.849959135055542,
"learning_rate": 1.758919467352771e-07,
"loss": 1.2453,
"mean_token_accuracy": 0.652675162255764,
"num_tokens": 11600907.0,
"step": 1480
},
{
"entropy": 1.7698762983083725,
"epoch": 9.4928,
"grad_norm": 2.384965419769287,
"learning_rate": 1.3944359344237214e-07,
"loss": 1.2038,
"mean_token_accuracy": 0.6633729308843612,
"num_tokens": 11680986.0,
"step": 1490
},
{
"entropy": 1.7817810475826263,
"epoch": 9.556799999999999,
"grad_norm": 2.6171748638153076,
"learning_rate": 1.0719627653131948e-07,
"loss": 1.2052,
"mean_token_accuracy": 0.662623830139637,
"num_tokens": 11759645.0,
"step": 1500
},
{
"entropy": 1.7850700795650483,
"epoch": 9.6208,
"grad_norm": 2.759584426879883,
"learning_rate": 7.916373479595507e-08,
"loss": 1.2011,
"mean_token_accuracy": 0.6652053311467171,
"num_tokens": 11837753.0,
"step": 1510
},
{
"entropy": 1.8089916795492171,
"epoch": 9.6848,
"grad_norm": 2.8050880432128906,
"learning_rate": 5.535791134809176e-08,
"loss": 1.218,
"mean_token_accuracy": 0.6629775419831276,
"num_tokens": 11915480.0,
"step": 1520
},
{
"entropy": 1.7926248282194137,
"epoch": 9.7488,
"grad_norm": 2.947237491607666,
"learning_rate": 3.57889485292251e-08,
"loss": 1.2402,
"mean_token_accuracy": 0.6560651332139968,
"num_tokens": 11994204.0,
"step": 1530
},
{
"entropy": 1.788163235783577,
"epoch": 9.8128,
"grad_norm": 3.0515170097351074,
"learning_rate": 2.046518358944094e-08,
"loss": 1.2018,
"mean_token_accuracy": 0.66530032902956,
"num_tokens": 12070332.0,
"step": 1540
},
{
"entropy": 1.798910641670227,
"epoch": 9.8768,
"grad_norm": 2.6602368354797363,
"learning_rate": 9.393145135377924e-09,
"loss": 1.2375,
"mean_token_accuracy": 0.6554615125060081,
"num_tokens": 12149645.0,
"step": 1550
},
{
"entropy": 1.794683536887169,
"epoch": 9.9408,
"grad_norm": 2.509953022003174,
"learning_rate": 2.5775503487501795e-09,
"loss": 1.2262,
"mean_token_accuracy": 0.6570453852415085,
"num_tokens": 12229123.0,
"step": 1560
},
{
"entropy": 1.7768772969374786,
"epoch": 10.0,
"grad_norm": 6.54733419418335,
"learning_rate": 2.1302976616066616e-11,
"loss": 1.1615,
"mean_token_accuracy": 0.6717051477045626,
"num_tokens": 12300470.0,
"step": 1570
}
],
"logging_steps": 10,
"max_steps": 1570,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1844416691327468e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}