Mistral-DNA-v1-138M-bacteria / trainer_state.json
{
"best_metric": 6.110002040863037,
"best_model_checkpoint": "./results/models/checkpoint-242575",
"epoch": 34.0,
"eval_steps": 500,
"global_step": 266050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"learning_rate": 0.001997444089456869,
"loss": 6.3873,
"step": 500
},
{
"epoch": 0.13,
"learning_rate": 0.001994888178913738,
"loss": 6.2714,
"step": 1000
},
{
"epoch": 0.19,
"learning_rate": 0.0019923322683706073,
"loss": 6.3526,
"step": 1500
},
{
"epoch": 0.26,
"learning_rate": 0.001989776357827476,
"loss": 6.3696,
"step": 2000
},
{
"epoch": 0.32,
"learning_rate": 0.001987220447284345,
"loss": 6.3501,
"step": 2500
},
{
"epoch": 0.38,
"learning_rate": 0.001984664536741214,
"loss": 6.3472,
"step": 3000
},
{
"epoch": 0.45,
"learning_rate": 0.001982108626198083,
"loss": 6.347,
"step": 3500
},
{
"epoch": 0.51,
"learning_rate": 0.0019795527156549523,
"loss": 6.3555,
"step": 4000
},
{
"epoch": 0.58,
"learning_rate": 0.0019769968051118214,
"loss": 6.3432,
"step": 4500
},
{
"epoch": 0.64,
"learning_rate": 0.00197444089456869,
"loss": 6.3439,
"step": 5000
},
{
"epoch": 0.7,
"learning_rate": 0.001971884984025559,
"loss": 6.3312,
"step": 5500
},
{
"epoch": 0.77,
"learning_rate": 0.001969329073482428,
"loss": 6.3279,
"step": 6000
},
{
"epoch": 0.83,
"learning_rate": 0.0019667731629392973,
"loss": 6.3177,
"step": 6500
},
{
"epoch": 0.89,
"learning_rate": 0.001964217252396166,
"loss": 6.3152,
"step": 7000
},
{
"epoch": 0.96,
"learning_rate": 0.001961661341853035,
"loss": 6.3086,
"step": 7500
},
{
"epoch": 1.0,
"eval_loss": 6.252776145935059,
"eval_runtime": 6.9192,
"eval_samples_per_second": 36.276,
"eval_steps_per_second": 1.156,
"step": 7825
},
{
"epoch": 1.02,
"learning_rate": 0.001959105431309904,
"loss": 6.3102,
"step": 8000
},
{
"epoch": 1.09,
"learning_rate": 0.001956549520766773,
"loss": 6.2978,
"step": 8500
},
{
"epoch": 1.15,
"learning_rate": 0.0019539936102236422,
"loss": 6.2904,
"step": 9000
},
{
"epoch": 1.21,
"learning_rate": 0.001951437699680511,
"loss": 6.2953,
"step": 9500
},
{
"epoch": 1.28,
"learning_rate": 0.0019488817891373802,
"loss": 6.2988,
"step": 10000
},
{
"epoch": 1.34,
"learning_rate": 0.0019463258785942493,
"loss": 6.2859,
"step": 10500
},
{
"epoch": 1.41,
"learning_rate": 0.0019437699680511184,
"loss": 6.2906,
"step": 11000
},
{
"epoch": 1.47,
"learning_rate": 0.0019412140575079872,
"loss": 6.2936,
"step": 11500
},
{
"epoch": 1.53,
"learning_rate": 0.0019386581469648563,
"loss": 6.288,
"step": 12000
},
{
"epoch": 1.6,
"learning_rate": 0.0019361022364217254,
"loss": 6.2888,
"step": 12500
},
{
"epoch": 1.66,
"learning_rate": 0.0019335463258785943,
"loss": 6.2866,
"step": 13000
},
{
"epoch": 1.73,
"learning_rate": 0.0019309904153354633,
"loss": 6.2776,
"step": 13500
},
{
"epoch": 1.79,
"learning_rate": 0.0019284345047923324,
"loss": 6.2706,
"step": 14000
},
{
"epoch": 1.85,
"learning_rate": 0.0019258785942492015,
"loss": 6.2705,
"step": 14500
},
{
"epoch": 1.92,
"learning_rate": 0.0019233226837060702,
"loss": 6.2807,
"step": 15000
},
{
"epoch": 1.98,
"learning_rate": 0.0019207667731629392,
"loss": 6.278,
"step": 15500
},
{
"epoch": 2.0,
"eval_loss": 6.216161251068115,
"eval_runtime": 6.9294,
"eval_samples_per_second": 36.223,
"eval_steps_per_second": 1.155,
"step": 15650
},
{
"epoch": 2.04,
"learning_rate": 0.0019182108626198083,
"loss": 6.2688,
"step": 16000
},
{
"epoch": 2.11,
"learning_rate": 0.0019156549520766772,
"loss": 6.2732,
"step": 16500
},
{
"epoch": 2.17,
"learning_rate": 0.0019130990415335463,
"loss": 6.2603,
"step": 17000
},
{
"epoch": 2.24,
"learning_rate": 0.0019105431309904154,
"loss": 6.2656,
"step": 17500
},
{
"epoch": 2.3,
"learning_rate": 0.0019079872204472844,
"loss": 6.2551,
"step": 18000
},
{
"epoch": 2.36,
"learning_rate": 0.0019054313099041533,
"loss": 6.259,
"step": 18500
},
{
"epoch": 2.43,
"learning_rate": 0.0019028753993610224,
"loss": 6.2607,
"step": 19000
},
{
"epoch": 2.49,
"learning_rate": 0.0019003194888178915,
"loss": 6.2634,
"step": 19500
},
{
"epoch": 2.56,
"learning_rate": 0.0018977635782747603,
"loss": 6.26,
"step": 20000
},
{
"epoch": 2.62,
"learning_rate": 0.0018952076677316294,
"loss": 6.2502,
"step": 20500
},
{
"epoch": 2.68,
"learning_rate": 0.0018926517571884985,
"loss": 6.2453,
"step": 21000
},
{
"epoch": 2.75,
"learning_rate": 0.0018900958466453676,
"loss": 6.2529,
"step": 21500
},
{
"epoch": 2.81,
"learning_rate": 0.0018875399361022365,
"loss": 6.2493,
"step": 22000
},
{
"epoch": 2.88,
"learning_rate": 0.0018849840255591055,
"loss": 6.2534,
"step": 22500
},
{
"epoch": 2.94,
"learning_rate": 0.0018824281150159746,
"loss": 6.2442,
"step": 23000
},
{
"epoch": 3.0,
"eval_loss": 6.19460391998291,
"eval_runtime": 6.7538,
"eval_samples_per_second": 37.164,
"eval_steps_per_second": 1.185,
"step": 23475
},
{
"epoch": 3.0,
"learning_rate": 0.0018798722044728435,
"loss": 6.2498,
"step": 23500
},
{
"epoch": 3.07,
"learning_rate": 0.0018773162939297126,
"loss": 6.2428,
"step": 24000
},
{
"epoch": 3.13,
"learning_rate": 0.0018747603833865817,
"loss": 6.2447,
"step": 24500
},
{
"epoch": 3.19,
"learning_rate": 0.0018722044728434505,
"loss": 6.24,
"step": 25000
},
{
"epoch": 3.26,
"learning_rate": 0.0018696485623003194,
"loss": 6.2476,
"step": 25500
},
{
"epoch": 3.32,
"learning_rate": 0.0018670926517571885,
"loss": 6.2486,
"step": 26000
},
{
"epoch": 3.39,
"learning_rate": 0.0018645367412140576,
"loss": 6.2428,
"step": 26500
},
{
"epoch": 3.45,
"learning_rate": 0.0018619808306709264,
"loss": 6.2475,
"step": 27000
},
{
"epoch": 3.51,
"learning_rate": 0.0018594249201277955,
"loss": 6.2481,
"step": 27500
},
{
"epoch": 3.58,
"learning_rate": 0.0018568690095846646,
"loss": 6.2381,
"step": 28000
},
{
"epoch": 3.64,
"learning_rate": 0.0018543130990415334,
"loss": 6.2402,
"step": 28500
},
{
"epoch": 3.71,
"learning_rate": 0.0018517571884984025,
"loss": 6.2389,
"step": 29000
},
{
"epoch": 3.77,
"learning_rate": 0.0018492012779552716,
"loss": 6.2414,
"step": 29500
},
{
"epoch": 3.83,
"learning_rate": 0.0018466453674121407,
"loss": 6.2373,
"step": 30000
},
{
"epoch": 3.9,
"learning_rate": 0.0018440894568690096,
"loss": 6.2408,
"step": 30500
},
{
"epoch": 3.96,
"learning_rate": 0.0018415335463258786,
"loss": 6.2299,
"step": 31000
},
{
"epoch": 4.0,
"eval_loss": 6.177552700042725,
"eval_runtime": 6.7861,
"eval_samples_per_second": 36.987,
"eval_steps_per_second": 1.179,
"step": 31300
},
{
"epoch": 4.03,
"learning_rate": 0.0018389776357827477,
"loss": 6.2279,
"step": 31500
},
{
"epoch": 4.09,
"learning_rate": 0.0018364217252396166,
"loss": 6.2341,
"step": 32000
},
{
"epoch": 4.15,
"learning_rate": 0.0018338658146964857,
"loss": 6.2291,
"step": 32500
},
{
"epoch": 4.22,
"learning_rate": 0.0018313099041533548,
"loss": 6.2349,
"step": 33000
},
{
"epoch": 4.28,
"learning_rate": 0.0018287539936102238,
"loss": 6.231,
"step": 33500
},
{
"epoch": 4.35,
"learning_rate": 0.0018261980830670927,
"loss": 6.2295,
"step": 34000
},
{
"epoch": 4.41,
"learning_rate": 0.0018236421725239618,
"loss": 6.2344,
"step": 34500
},
{
"epoch": 4.47,
"learning_rate": 0.0018210862619808307,
"loss": 6.2289,
"step": 35000
},
{
"epoch": 4.54,
"learning_rate": 0.0018185303514376995,
"loss": 6.2309,
"step": 35500
},
{
"epoch": 4.6,
"learning_rate": 0.0018159744408945686,
"loss": 6.2255,
"step": 36000
},
{
"epoch": 4.66,
"learning_rate": 0.0018134185303514377,
"loss": 6.2281,
"step": 36500
},
{
"epoch": 4.73,
"learning_rate": 0.0018108626198083068,
"loss": 6.2269,
"step": 37000
},
{
"epoch": 4.79,
"learning_rate": 0.0018083067092651756,
"loss": 6.2235,
"step": 37500
},
{
"epoch": 4.86,
"learning_rate": 0.0018057507987220447,
"loss": 6.2252,
"step": 38000
},
{
"epoch": 4.92,
"learning_rate": 0.0018031948881789138,
"loss": 6.2243,
"step": 38500
},
{
"epoch": 4.98,
"learning_rate": 0.0018006389776357827,
"loss": 6.2243,
"step": 39000
},
{
"epoch": 5.0,
"eval_loss": 6.16984748840332,
"eval_runtime": 6.9467,
"eval_samples_per_second": 36.132,
"eval_steps_per_second": 1.152,
"step": 39125
},
{
"epoch": 5.05,
"learning_rate": 0.0017980830670926518,
"loss": 6.2197,
"step": 39500
},
{
"epoch": 5.11,
"learning_rate": 0.0017955271565495208,
"loss": 6.2246,
"step": 40000
},
{
"epoch": 5.18,
"learning_rate": 0.00179297124600639,
"loss": 6.2219,
"step": 40500
},
{
"epoch": 5.24,
"learning_rate": 0.0017904153354632588,
"loss": 6.2246,
"step": 41000
},
{
"epoch": 5.3,
"learning_rate": 0.0017878594249201279,
"loss": 6.2203,
"step": 41500
},
{
"epoch": 5.37,
"learning_rate": 0.001785303514376997,
"loss": 6.2305,
"step": 42000
},
{
"epoch": 5.43,
"learning_rate": 0.0017827476038338658,
"loss": 6.225,
"step": 42500
},
{
"epoch": 5.5,
"learning_rate": 0.001780191693290735,
"loss": 6.2216,
"step": 43000
},
{
"epoch": 5.56,
"learning_rate": 0.001777635782747604,
"loss": 6.2242,
"step": 43500
},
{
"epoch": 5.62,
"learning_rate": 0.001775079872204473,
"loss": 6.224,
"step": 44000
},
{
"epoch": 5.69,
"learning_rate": 0.001772523961661342,
"loss": 6.2168,
"step": 44500
},
{
"epoch": 5.75,
"learning_rate": 0.0017699680511182108,
"loss": 6.2249,
"step": 45000
},
{
"epoch": 5.81,
"learning_rate": 0.00176741214057508,
"loss": 6.2202,
"step": 45500
},
{
"epoch": 5.88,
"learning_rate": 0.0017648562300319488,
"loss": 6.2227,
"step": 46000
},
{
"epoch": 5.94,
"learning_rate": 0.0017623003194888178,
"loss": 6.2162,
"step": 46500
},
{
"epoch": 6.0,
"eval_loss": 6.162996768951416,
"eval_runtime": 6.7939,
"eval_samples_per_second": 36.945,
"eval_steps_per_second": 1.178,
"step": 46950
},
{
"epoch": 6.01,
"learning_rate": 0.001759744408945687,
"loss": 6.2237,
"step": 47000
},
{
"epoch": 6.07,
"learning_rate": 0.0017571884984025558,
"loss": 6.2159,
"step": 47500
},
{
"epoch": 6.13,
"learning_rate": 0.0017546325878594249,
"loss": 6.2176,
"step": 48000
},
{
"epoch": 6.2,
"learning_rate": 0.001752076677316294,
"loss": 6.2175,
"step": 48500
},
{
"epoch": 6.26,
"learning_rate": 0.001749520766773163,
"loss": 6.2171,
"step": 49000
},
{
"epoch": 6.33,
"learning_rate": 0.001746964856230032,
"loss": 6.2105,
"step": 49500
},
{
"epoch": 6.39,
"learning_rate": 0.001744408945686901,
"loss": 6.221,
"step": 50000
},
{
"epoch": 6.45,
"learning_rate": 0.00174185303514377,
"loss": 6.2141,
"step": 50500
},
{
"epoch": 6.52,
"learning_rate": 0.001739297124600639,
"loss": 6.213,
"step": 51000
},
{
"epoch": 6.58,
"learning_rate": 0.001736741214057508,
"loss": 6.2182,
"step": 51500
},
{
"epoch": 6.65,
"learning_rate": 0.0017341853035143771,
"loss": 6.2085,
"step": 52000
},
{
"epoch": 6.71,
"learning_rate": 0.0017316293929712462,
"loss": 6.2142,
"step": 52500
},
{
"epoch": 6.77,
"learning_rate": 0.001729073482428115,
"loss": 6.2274,
"step": 53000
},
{
"epoch": 6.84,
"learning_rate": 0.0017265175718849841,
"loss": 6.2124,
"step": 53500
},
{
"epoch": 6.9,
"learning_rate": 0.0017239616613418532,
"loss": 6.2105,
"step": 54000
},
{
"epoch": 6.96,
"learning_rate": 0.0017214057507987219,
"loss": 6.2079,
"step": 54500
},
{
"epoch": 7.0,
"eval_loss": 6.15675163269043,
"eval_runtime": 6.7761,
"eval_samples_per_second": 37.042,
"eval_steps_per_second": 1.181,
"step": 54775
},
{
"epoch": 7.03,
"learning_rate": 0.001718849840255591,
"loss": 6.2153,
"step": 55000
},
{
"epoch": 7.09,
"learning_rate": 0.00171629392971246,
"loss": 6.2105,
"step": 55500
},
{
"epoch": 7.16,
"learning_rate": 0.0017137380191693291,
"loss": 6.2111,
"step": 56000
},
{
"epoch": 7.22,
"learning_rate": 0.001711182108626198,
"loss": 6.2167,
"step": 56500
},
{
"epoch": 7.28,
"learning_rate": 0.001708626198083067,
"loss": 6.216,
"step": 57000
},
{
"epoch": 7.35,
"learning_rate": 0.0017060702875399362,
"loss": 6.2144,
"step": 57500
},
{
"epoch": 7.41,
"learning_rate": 0.001703514376996805,
"loss": 6.2141,
"step": 58000
},
{
"epoch": 7.48,
"learning_rate": 0.001700958466453674,
"loss": 6.2139,
"step": 58500
},
{
"epoch": 7.54,
"learning_rate": 0.0016984025559105432,
"loss": 6.2117,
"step": 59000
},
{
"epoch": 7.6,
"learning_rate": 0.0016958466453674123,
"loss": 6.2122,
"step": 59500
},
{
"epoch": 7.67,
"learning_rate": 0.0016932907348242811,
"loss": 6.2096,
"step": 60000
},
{
"epoch": 7.73,
"learning_rate": 0.0016907348242811502,
"loss": 6.2055,
"step": 60500
},
{
"epoch": 7.8,
"learning_rate": 0.0016881789137380193,
"loss": 6.2088,
"step": 61000
},
{
"epoch": 7.86,
"learning_rate": 0.0016856230031948882,
"loss": 6.2105,
"step": 61500
},
{
"epoch": 7.92,
"learning_rate": 0.0016830670926517573,
"loss": 6.2161,
"step": 62000
},
{
"epoch": 7.99,
"learning_rate": 0.0016805111821086263,
"loss": 6.2105,
"step": 62500
},
{
"epoch": 8.0,
"eval_loss": 6.1555399894714355,
"eval_runtime": 6.9246,
"eval_samples_per_second": 36.248,
"eval_steps_per_second": 1.155,
"step": 62600
},
{
"epoch": 8.05,
"learning_rate": 0.0016779552715654954,
"loss": 6.2111,
"step": 63000
},
{
"epoch": 8.12,
"learning_rate": 0.0016753993610223643,
"loss": 6.2095,
"step": 63500
},
{
"epoch": 8.18,
"learning_rate": 0.0016728434504792334,
"loss": 6.2064,
"step": 64000
},
{
"epoch": 8.24,
"learning_rate": 0.0016702875399361022,
"loss": 6.2147,
"step": 64500
},
{
"epoch": 8.31,
"learning_rate": 0.001667731629392971,
"loss": 6.1994,
"step": 65000
},
{
"epoch": 8.37,
"learning_rate": 0.0016651757188498402,
"loss": 6.2094,
"step": 65500
},
{
"epoch": 8.43,
"learning_rate": 0.0016626198083067093,
"loss": 6.2075,
"step": 66000
},
{
"epoch": 8.5,
"learning_rate": 0.0016600638977635781,
"loss": 6.2085,
"step": 66500
},
{
"epoch": 8.56,
"learning_rate": 0.0016575079872204472,
"loss": 6.2004,
"step": 67000
},
{
"epoch": 8.63,
"learning_rate": 0.0016549520766773163,
"loss": 6.2112,
"step": 67500
},
{
"epoch": 8.69,
"learning_rate": 0.0016523961661341854,
"loss": 6.2051,
"step": 68000
},
{
"epoch": 8.75,
"learning_rate": 0.0016498402555910543,
"loss": 6.2024,
"step": 68500
},
{
"epoch": 8.82,
"learning_rate": 0.0016472843450479233,
"loss": 6.1996,
"step": 69000
},
{
"epoch": 8.88,
"learning_rate": 0.0016447284345047924,
"loss": 6.2066,
"step": 69500
},
{
"epoch": 8.95,
"learning_rate": 0.0016421725239616613,
"loss": 6.2013,
"step": 70000
},
{
"epoch": 9.0,
"eval_loss": 6.147340297698975,
"eval_runtime": 6.7973,
"eval_samples_per_second": 36.927,
"eval_steps_per_second": 1.177,
"step": 70425
},
{
"epoch": 9.01,
"learning_rate": 0.0016396166134185304,
"loss": 6.2085,
"step": 70500
},
{
"epoch": 9.07,
"learning_rate": 0.0016370607028753995,
"loss": 6.1978,
"step": 71000
},
{
"epoch": 9.14,
"learning_rate": 0.0016345047923322685,
"loss": 6.2017,
"step": 71500
},
{
"epoch": 9.2,
"learning_rate": 0.0016319488817891374,
"loss": 6.2047,
"step": 72000
},
{
"epoch": 9.27,
"learning_rate": 0.0016293929712460065,
"loss": 6.1998,
"step": 72500
},
{
"epoch": 9.33,
"learning_rate": 0.0016268370607028756,
"loss": 6.2027,
"step": 73000
},
{
"epoch": 9.39,
"learning_rate": 0.0016242811501597444,
"loss": 6.2024,
"step": 73500
},
{
"epoch": 9.46,
"learning_rate": 0.0016217252396166135,
"loss": 6.1936,
"step": 74000
},
{
"epoch": 9.52,
"learning_rate": 0.0016191693290734824,
"loss": 6.2042,
"step": 74500
},
{
"epoch": 9.58,
"learning_rate": 0.0016166134185303515,
"loss": 6.2021,
"step": 75000
},
{
"epoch": 9.65,
"learning_rate": 0.0016140575079872203,
"loss": 6.2054,
"step": 75500
},
{
"epoch": 9.71,
"learning_rate": 0.0016115015974440894,
"loss": 6.2051,
"step": 76000
},
{
"epoch": 9.78,
"learning_rate": 0.0016089456869009585,
"loss": 6.1963,
"step": 76500
},
{
"epoch": 9.84,
"learning_rate": 0.0016063897763578274,
"loss": 6.2039,
"step": 77000
},
{
"epoch": 9.9,
"learning_rate": 0.0016038338658146965,
"loss": 6.1993,
"step": 77500
},
{
"epoch": 9.97,
"learning_rate": 0.0016012779552715655,
"loss": 6.199,
"step": 78000
},
{
"epoch": 10.0,
"eval_loss": 6.143795967102051,
"eval_runtime": 6.7856,
"eval_samples_per_second": 36.99,
"eval_steps_per_second": 1.179,
"step": 78250
},
{
"epoch": 10.03,
"learning_rate": 0.0015987220447284346,
"loss": 6.1969,
"step": 78500
},
{
"epoch": 10.1,
"learning_rate": 0.0015961661341853035,
"loss": 6.2008,
"step": 79000
},
{
"epoch": 10.16,
"learning_rate": 0.0015936102236421726,
"loss": 6.1974,
"step": 79500
},
{
"epoch": 10.22,
"learning_rate": 0.0015910543130990417,
"loss": 6.1966,
"step": 80000
},
{
"epoch": 10.29,
"learning_rate": 0.0015884984025559105,
"loss": 6.199,
"step": 80500
},
{
"epoch": 10.35,
"learning_rate": 0.0015859424920127796,
"loss": 6.2016,
"step": 81000
},
{
"epoch": 10.42,
"learning_rate": 0.0015833865814696487,
"loss": 6.1986,
"step": 81500
},
{
"epoch": 10.48,
"learning_rate": 0.0015808306709265178,
"loss": 6.2013,
"step": 82000
},
{
"epoch": 10.54,
"learning_rate": 0.0015782747603833866,
"loss": 6.1922,
"step": 82500
},
{
"epoch": 10.61,
"learning_rate": 0.0015757188498402557,
"loss": 6.1999,
"step": 83000
},
{
"epoch": 10.67,
"learning_rate": 0.0015731629392971248,
"loss": 6.1989,
"step": 83500
},
{
"epoch": 10.73,
"learning_rate": 0.0015706070287539937,
"loss": 6.1963,
"step": 84000
},
{
"epoch": 10.8,
"learning_rate": 0.0015680511182108625,
"loss": 6.1966,
"step": 84500
},
{
"epoch": 10.86,
"learning_rate": 0.0015654952076677316,
"loss": 6.1955,
"step": 85000
},
{
"epoch": 10.93,
"learning_rate": 0.0015629392971246005,
"loss": 6.1943,
"step": 85500
},
{
"epoch": 10.99,
"learning_rate": 0.0015603833865814696,
"loss": 6.1953,
"step": 86000
},
{
"epoch": 11.0,
"eval_loss": 6.140895366668701,
"eval_runtime": 6.7824,
"eval_samples_per_second": 37.007,
"eval_steps_per_second": 1.18,
"step": 86075
},
{
"epoch": 11.05,
"learning_rate": 0.0015578274760383386,
"loss": 6.1972,
"step": 86500
},
{
"epoch": 11.12,
"learning_rate": 0.0015552715654952077,
"loss": 6.1896,
"step": 87000
},
{
"epoch": 11.18,
"learning_rate": 0.0015527156549520766,
"loss": 6.1901,
"step": 87500
},
{
"epoch": 11.25,
"learning_rate": 0.0015501597444089457,
"loss": 6.1917,
"step": 88000
},
{
"epoch": 11.31,
"learning_rate": 0.0015476038338658148,
"loss": 6.194,
"step": 88500
},
{
"epoch": 11.37,
"learning_rate": 0.0015450479233226836,
"loss": 6.1934,
"step": 89000
},
{
"epoch": 11.44,
"learning_rate": 0.0015424920127795527,
"loss": 6.1912,
"step": 89500
},
{
"epoch": 11.5,
"learning_rate": 0.0015399361022364218,
"loss": 6.1981,
"step": 90000
},
{
"epoch": 11.57,
"learning_rate": 0.0015373801916932909,
"loss": 6.1942,
"step": 90500
},
{
"epoch": 11.63,
"learning_rate": 0.0015348242811501597,
"loss": 6.1965,
"step": 91000
},
{
"epoch": 11.69,
"learning_rate": 0.0015322683706070288,
"loss": 6.1958,
"step": 91500
},
{
"epoch": 11.76,
"learning_rate": 0.001529712460063898,
"loss": 6.1981,
"step": 92000
},
{
"epoch": 11.82,
"learning_rate": 0.0015271565495207668,
"loss": 6.1972,
"step": 92500
},
{
"epoch": 11.88,
"learning_rate": 0.0015246006389776359,
"loss": 6.1959,
"step": 93000
},
{
"epoch": 11.95,
"learning_rate": 0.001522044728434505,
"loss": 6.1958,
"step": 93500
},
{
"epoch": 12.0,
"eval_loss": 6.139165878295898,
"eval_runtime": 6.8854,
"eval_samples_per_second": 36.454,
"eval_steps_per_second": 1.162,
"step": 93900
},
{
"epoch": 12.01,
"learning_rate": 0.001519488817891374,
"loss": 6.1901,
"step": 94000
},
{
"epoch": 12.08,
"learning_rate": 0.0015169329073482427,
"loss": 6.1945,
"step": 94500
},
{
"epoch": 12.14,
"learning_rate": 0.0015143769968051118,
"loss": 6.1949,
"step": 95000
},
{
"epoch": 12.2,
"learning_rate": 0.0015118210862619808,
"loss": 6.1881,
"step": 95500
},
{
"epoch": 12.27,
"learning_rate": 0.0015092651757188497,
"loss": 6.189,
"step": 96000
},
{
"epoch": 12.33,
"learning_rate": 0.0015067092651757188,
"loss": 6.1923,
"step": 96500
},
{
"epoch": 12.4,
"learning_rate": 0.0015041533546325879,
"loss": 6.1962,
"step": 97000
},
{
"epoch": 12.46,
"learning_rate": 0.001501597444089457,
"loss": 6.1911,
"step": 97500
},
{
"epoch": 12.52,
"learning_rate": 0.0014990415335463258,
"loss": 6.1948,
"step": 98000
},
{
"epoch": 12.59,
"learning_rate": 0.001496485623003195,
"loss": 6.197,
"step": 98500
},
{
"epoch": 12.65,
"learning_rate": 0.001493929712460064,
"loss": 6.1898,
"step": 99000
},
{
"epoch": 12.72,
"learning_rate": 0.0014913738019169329,
"loss": 6.1924,
"step": 99500
},
{
"epoch": 12.78,
"learning_rate": 0.001488817891373802,
"loss": 6.1898,
"step": 100000
},
{
"epoch": 12.84,
"learning_rate": 0.001486261980830671,
"loss": 6.1934,
"step": 100500
},
{
"epoch": 12.91,
"learning_rate": 0.0014837060702875401,
"loss": 6.1944,
"step": 101000
},
{
"epoch": 12.97,
"learning_rate": 0.001481150159744409,
"loss": 6.1935,
"step": 101500
},
{
"epoch": 13.0,
"eval_loss": 6.138128280639648,
"eval_runtime": 6.8154,
"eval_samples_per_second": 36.829,
"eval_steps_per_second": 1.174,
"step": 101725
},
{
"epoch": 13.04,
"learning_rate": 0.001478594249201278,
"loss": 6.1931,
"step": 102000
},
{
"epoch": 13.1,
"learning_rate": 0.0014760383386581471,
"loss": 6.1947,
"step": 102500
},
{
"epoch": 13.16,
"learning_rate": 0.001473482428115016,
"loss": 6.1916,
"step": 103000
},
{
"epoch": 13.23,
"learning_rate": 0.001470926517571885,
"loss": 6.1923,
"step": 103500
},
{
"epoch": 13.29,
"learning_rate": 0.0014683706070287542,
"loss": 6.189,
"step": 104000
},
{
"epoch": 13.35,
"learning_rate": 0.0014658146964856228,
"loss": 6.185,
"step": 104500
},
{
"epoch": 13.42,
"learning_rate": 0.001463258785942492,
"loss": 6.1979,
"step": 105000
},
{
"epoch": 13.48,
"learning_rate": 0.001460702875399361,
"loss": 6.1919,
"step": 105500
},
{
"epoch": 13.55,
"learning_rate": 0.00145814696485623,
"loss": 6.1907,
"step": 106000
},
{
"epoch": 13.61,
"learning_rate": 0.001455591054313099,
"loss": 6.1877,
"step": 106500
},
{
"epoch": 13.67,
"learning_rate": 0.001453035143769968,
"loss": 6.1884,
"step": 107000
},
{
"epoch": 13.74,
"learning_rate": 0.0014504792332268371,
"loss": 6.1904,
"step": 107500
},
{
"epoch": 13.8,
"learning_rate": 0.001447923322683706,
"loss": 6.1909,
"step": 108000
},
{
"epoch": 13.87,
"learning_rate": 0.001445367412140575,
"loss": 6.1957,
"step": 108500
},
{
"epoch": 13.93,
"learning_rate": 0.0014428115015974441,
"loss": 6.1914,
"step": 109000
},
{
"epoch": 13.99,
"learning_rate": 0.0014402555910543132,
"loss": 6.1865,
"step": 109500
},
{
"epoch": 14.0,
"eval_loss": 6.134657859802246,
"eval_runtime": 7.8402,
"eval_samples_per_second": 32.015,
"eval_steps_per_second": 1.02,
"step": 109550
},
{
"epoch": 14.06,
"learning_rate": 0.001437699680511182,
"loss": 6.189,
"step": 110000
},
{
"epoch": 14.12,
"learning_rate": 0.0014351437699680512,
"loss": 6.1883,
"step": 110500
},
{
"epoch": 14.19,
"learning_rate": 0.0014325878594249203,
"loss": 6.1902,
"step": 111000
},
{
"epoch": 14.25,
"learning_rate": 0.0014300319488817891,
"loss": 6.191,
"step": 111500
},
{
"epoch": 14.31,
"learning_rate": 0.0014274760383386582,
"loss": 6.1934,
"step": 112000
},
{
"epoch": 14.38,
"learning_rate": 0.0014249201277955273,
"loss": 6.1841,
"step": 112500
},
{
"epoch": 14.44,
"learning_rate": 0.0014223642172523964,
"loss": 6.1861,
"step": 113000
},
{
"epoch": 14.5,
"learning_rate": 0.0014198083067092652,
"loss": 6.1883,
"step": 113500
},
{
"epoch": 14.57,
"learning_rate": 0.001417252396166134,
"loss": 6.1884,
"step": 114000
},
{
"epoch": 14.63,
"learning_rate": 0.0014146964856230032,
"loss": 6.1797,
"step": 114500
},
{
"epoch": 14.7,
"learning_rate": 0.001412140575079872,
"loss": 6.1894,
"step": 115000
},
{
"epoch": 14.76,
"learning_rate": 0.0014095846645367411,
"loss": 6.187,
"step": 115500
},
{
"epoch": 14.82,
"learning_rate": 0.0014070287539936102,
"loss": 6.1863,
"step": 116000
},
{
"epoch": 14.89,
"learning_rate": 0.0014044728434504793,
"loss": 6.1849,
"step": 116500
},
{
"epoch": 14.95,
"learning_rate": 0.0014019169329073482,
"loss": 6.1896,
"step": 117000
},
{
"epoch": 15.0,
"eval_loss": 6.130258083343506,
"eval_runtime": 6.8222,
"eval_samples_per_second": 36.792,
"eval_steps_per_second": 1.173,
"step": 117375
},
{
"epoch": 15.02,
"learning_rate": 0.0013993610223642173,
"loss": 6.1876,
"step": 117500
},
{
"epoch": 15.08,
"learning_rate": 0.0013968051118210863,
"loss": 6.1846,
"step": 118000
},
{
"epoch": 15.14,
"learning_rate": 0.0013942492012779552,
"loss": 6.1883,
"step": 118500
},
{
"epoch": 15.21,
"learning_rate": 0.0013916932907348243,
"loss": 6.1785,
"step": 119000
},
{
"epoch": 15.27,
"learning_rate": 0.0013891373801916934,
"loss": 6.1827,
"step": 119500
},
{
"epoch": 15.34,
"learning_rate": 0.0013865814696485625,
"loss": 6.1879,
"step": 120000
},
{
"epoch": 15.4,
"learning_rate": 0.0013840255591054313,
"loss": 6.1734,
"step": 120500
},
{
"epoch": 15.46,
"learning_rate": 0.0013814696485623004,
"loss": 6.1852,
"step": 121000
},
{
"epoch": 15.53,
"learning_rate": 0.0013789137380191695,
"loss": 6.1903,
"step": 121500
},
{
"epoch": 15.59,
"learning_rate": 0.0013763578274760384,
"loss": 6.1877,
"step": 122000
},
{
"epoch": 15.65,
"learning_rate": 0.0013738019169329074,
"loss": 6.1779,
"step": 122500
},
{
"epoch": 15.72,
"learning_rate": 0.0013712460063897765,
"loss": 6.185,
"step": 123000
},
{
"epoch": 15.78,
"learning_rate": 0.0013686900958466456,
"loss": 6.1835,
"step": 123500
},
{
"epoch": 15.85,
"learning_rate": 0.0013661341853035143,
"loss": 6.1792,
"step": 124000
},
{
"epoch": 15.91,
"learning_rate": 0.0013635782747603833,
"loss": 6.182,
"step": 124500
},
{
"epoch": 15.97,
"learning_rate": 0.0013610223642172524,
"loss": 6.1884,
"step": 125000
},
{
"epoch": 16.0,
"eval_loss": 6.127689361572266,
"eval_runtime": 6.7878,
"eval_samples_per_second": 36.978,
"eval_steps_per_second": 1.179,
"step": 125200
},
{
"epoch": 16.04,
"learning_rate": 0.0013584664536741213,
"loss": 6.1881,
"step": 125500
},
{
"epoch": 16.1,
"learning_rate": 0.0013559105431309904,
"loss": 6.1847,
"step": 126000
},
{
"epoch": 16.17,
"learning_rate": 0.0013533546325878595,
"loss": 6.1822,
"step": 126500
},
{
"epoch": 16.23,
"learning_rate": 0.0013507987220447283,
"loss": 6.1777,
"step": 127000
},
{
"epoch": 16.29,
"learning_rate": 0.0013482428115015974,
"loss": 6.1886,
"step": 127500
},
{
"epoch": 16.36,
"learning_rate": 0.0013456869009584665,
"loss": 6.1844,
"step": 128000
},
{
"epoch": 16.42,
"learning_rate": 0.0013431309904153356,
"loss": 6.1775,
"step": 128500
},
{
"epoch": 16.49,
"learning_rate": 0.0013405750798722044,
"loss": 6.1829,
"step": 129000
},
{
"epoch": 16.55,
"learning_rate": 0.0013380191693290735,
"loss": 6.1766,
"step": 129500
},
{
"epoch": 16.61,
"learning_rate": 0.0013354632587859426,
"loss": 6.1847,
"step": 130000
},
{
"epoch": 16.68,
"learning_rate": 0.0013329073482428115,
"loss": 6.1804,
"step": 130500
},
{
"epoch": 16.74,
"learning_rate": 0.0013303514376996806,
"loss": 6.1774,
"step": 131000
},
{
"epoch": 16.81,
"learning_rate": 0.0013277955271565496,
"loss": 6.1767,
"step": 131500
},
{
"epoch": 16.87,
"learning_rate": 0.0013252396166134187,
"loss": 6.178,
"step": 132000
},
{
"epoch": 16.93,
"learning_rate": 0.0013226837060702876,
"loss": 6.1806,
"step": 132500
},
{
"epoch": 17.0,
"learning_rate": 0.0013201277955271567,
"loss": 6.1809,
"step": 133000
},
{
"epoch": 17.0,
"eval_loss": 6.124339580535889,
"eval_runtime": 6.8363,
"eval_samples_per_second": 36.716,
"eval_steps_per_second": 1.17,
"step": 133025
},
{
"epoch": 17.06,
"learning_rate": 0.0013175718849840258,
"loss": 6.1733,
"step": 133500
},
{
"epoch": 17.12,
"learning_rate": 0.0013150159744408944,
"loss": 6.1825,
"step": 134000
},
{
"epoch": 17.19,
"learning_rate": 0.0013124600638977635,
"loss": 6.1752,
"step": 134500
},
{
"epoch": 17.25,
"learning_rate": 0.0013099041533546326,
"loss": 6.1785,
"step": 135000
},
{
"epoch": 17.32,
"learning_rate": 0.0013073482428115017,
"loss": 6.1761,
"step": 135500
},
{
"epoch": 17.38,
"learning_rate": 0.0013047923322683705,
"loss": 6.1833,
"step": 136000
},
{
"epoch": 17.44,
"learning_rate": 0.0013022364217252396,
"loss": 6.1727,
"step": 136500
},
{
"epoch": 17.51,
"learning_rate": 0.0012996805111821087,
"loss": 6.1744,
"step": 137000
},
{
"epoch": 17.57,
"learning_rate": 0.0012971246006389776,
"loss": 6.1766,
"step": 137500
},
{
"epoch": 17.64,
"learning_rate": 0.0012945686900958466,
"loss": 6.1754,
"step": 138000
},
{
"epoch": 17.7,
"learning_rate": 0.0012920127795527157,
"loss": 6.1822,
"step": 138500
},
{
"epoch": 17.76,
"learning_rate": 0.0012894568690095848,
"loss": 6.1855,
"step": 139000
},
{
"epoch": 17.83,
"learning_rate": 0.0012869009584664537,
"loss": 6.1797,
"step": 139500
},
{
"epoch": 17.89,
"learning_rate": 0.0012843450479233227,
"loss": 6.1796,
"step": 140000
},
{
"epoch": 17.96,
"learning_rate": 0.0012817891373801918,
"loss": 6.1814,
"step": 140500
},
{
"epoch": 18.0,
"eval_loss": 6.123664379119873,
"eval_runtime": 6.845,
"eval_samples_per_second": 36.669,
"eval_steps_per_second": 1.169,
"step": 140850
},
{
"epoch": 18.02,
"learning_rate": 0.0012792332268370607,
"loss": 6.1872,
"step": 141000
},
{
"epoch": 18.08,
"learning_rate": 0.0012766773162939298,
"loss": 6.1834,
"step": 141500
},
{
"epoch": 18.15,
"learning_rate": 0.0012741214057507989,
"loss": 6.1793,
"step": 142000
},
{
"epoch": 18.21,
"learning_rate": 0.001271565495207668,
"loss": 6.1731,
"step": 142500
},
{
"epoch": 18.27,
"learning_rate": 0.0012690095846645368,
"loss": 6.183,
"step": 143000
},
{
"epoch": 18.34,
"learning_rate": 0.001266453674121406,
"loss": 6.1745,
"step": 143500
},
{
"epoch": 18.4,
"learning_rate": 0.0012638977635782748,
"loss": 6.1846,
"step": 144000
},
{
"epoch": 18.47,
"learning_rate": 0.0012613418530351436,
"loss": 6.1767,
"step": 144500
},
{
"epoch": 18.53,
"learning_rate": 0.0012587859424920127,
"loss": 6.1747,
"step": 145000
},
{
"epoch": 18.59,
"learning_rate": 0.0012562300319488818,
"loss": 6.1804,
"step": 145500
},
{
"epoch": 18.66,
"learning_rate": 0.0012536741214057507,
"loss": 6.1729,
"step": 146000
},
{
"epoch": 18.72,
"learning_rate": 0.0012511182108626197,
"loss": 6.1791,
"step": 146500
},
{
"epoch": 18.79,
"learning_rate": 0.0012485623003194888,
"loss": 6.173,
"step": 147000
},
{
"epoch": 18.85,
"learning_rate": 0.001246006389776358,
"loss": 6.1732,
"step": 147500
},
{
"epoch": 18.91,
"learning_rate": 0.0012434504792332268,
"loss": 6.1771,
"step": 148000
},
{
"epoch": 18.98,
"learning_rate": 0.0012408945686900959,
"loss": 6.1754,
"step": 148500
},
{
"epoch": 19.0,
"eval_loss": 6.119473934173584,
"eval_runtime": 6.9262,
"eval_samples_per_second": 36.239,
"eval_steps_per_second": 1.155,
"step": 148675
},
{
"epoch": 19.04,
"learning_rate": 0.001238338658146965,
"loss": 6.1773,
"step": 149000
},
{
"epoch": 19.11,
"learning_rate": 0.0012357827476038338,
"loss": 6.176,
"step": 149500
},
{
"epoch": 19.17,
"learning_rate": 0.001233226837060703,
"loss": 6.1746,
"step": 150000
},
{
"epoch": 19.23,
"learning_rate": 0.001230670926517572,
"loss": 6.1773,
"step": 150500
},
{
"epoch": 19.3,
"learning_rate": 0.001228115015974441,
"loss": 6.1771,
"step": 151000
},
{
"epoch": 19.36,
"learning_rate": 0.00122555910543131,
"loss": 6.1736,
"step": 151500
},
{
"epoch": 19.42,
"learning_rate": 0.001223003194888179,
"loss": 6.1724,
"step": 152000
},
{
"epoch": 19.49,
"learning_rate": 0.001220447284345048,
"loss": 6.1809,
"step": 152500
},
{
"epoch": 19.55,
"learning_rate": 0.001217891373801917,
"loss": 6.1769,
"step": 153000
},
{
"epoch": 19.62,
"learning_rate": 0.001215335463258786,
"loss": 6.1712,
"step": 153500
},
{
"epoch": 19.68,
"learning_rate": 0.001212779552715655,
"loss": 6.1779,
"step": 154000
},
{
"epoch": 19.74,
"learning_rate": 0.001210223642172524,
"loss": 6.1758,
"step": 154500
},
{
"epoch": 19.81,
"learning_rate": 0.0012076677316293929,
"loss": 6.1681,
"step": 155000
},
{
"epoch": 19.87,
"learning_rate": 0.001205111821086262,
"loss": 6.1728,
"step": 155500
},
{
"epoch": 19.94,
"learning_rate": 0.001202555910543131,
"loss": 6.1737,
"step": 156000
},
{
"epoch": 20.0,
"learning_rate": 0.0012,
"loss": 6.1769,
"step": 156500
},
{
"epoch": 20.0,
"eval_loss": 6.11764669418335,
"eval_runtime": 6.8248,
"eval_samples_per_second": 36.778,
"eval_steps_per_second": 1.172,
"step": 156500
},
{
"epoch": 20.06,
"learning_rate": 0.001197444089456869,
"loss": 6.1785,
"step": 157000
},
{
"epoch": 20.13,
"learning_rate": 0.001194888178913738,
"loss": 6.1772,
"step": 157500
},
{
"epoch": 20.19,
"learning_rate": 0.0011923322683706071,
"loss": 6.1648,
"step": 158000
},
{
"epoch": 20.26,
"learning_rate": 0.001189776357827476,
"loss": 6.1715,
"step": 158500
},
{
"epoch": 20.32,
"learning_rate": 0.001187220447284345,
"loss": 6.1745,
"step": 159000
},
{
"epoch": 20.38,
"learning_rate": 0.0011846645367412142,
"loss": 6.1769,
"step": 159500
},
{
"epoch": 20.45,
"learning_rate": 0.001182108626198083,
"loss": 6.1783,
"step": 160000
},
{
"epoch": 20.51,
"learning_rate": 0.0011795527156549521,
"loss": 6.1693,
"step": 160500
},
{
"epoch": 20.58,
"learning_rate": 0.0011769968051118212,
"loss": 6.1726,
"step": 161000
},
{
"epoch": 20.64,
"learning_rate": 0.0011744408945686903,
"loss": 6.1676,
"step": 161500
},
{
"epoch": 20.7,
"learning_rate": 0.0011718849840255592,
"loss": 6.1669,
"step": 162000
},
{
"epoch": 20.77,
"learning_rate": 0.0011693290734824282,
"loss": 6.1636,
"step": 162500
},
{
"epoch": 20.83,
"learning_rate": 0.0011667731629392973,
"loss": 6.1791,
"step": 163000
},
{
"epoch": 20.89,
"learning_rate": 0.001164217252396166,
"loss": 6.1757,
"step": 163500
},
{
"epoch": 20.96,
"learning_rate": 0.001161661341853035,
"loss": 6.1716,
"step": 164000
},
{
"epoch": 21.0,
"eval_loss": 6.116322994232178,
"eval_runtime": 6.8185,
"eval_samples_per_second": 36.812,
"eval_steps_per_second": 1.173,
"step": 164325
},
{
"epoch": 21.02,
"learning_rate": 0.0011591054313099041,
"loss": 6.1742,
"step": 164500
},
{
"epoch": 21.09,
"learning_rate": 0.001156549520766773,
"loss": 6.1689,
"step": 165000
},
{
"epoch": 21.15,
"learning_rate": 0.001153993610223642,
"loss": 6.162,
"step": 165500
},
{
"epoch": 21.21,
"learning_rate": 0.0011514376996805112,
"loss": 6.1729,
"step": 166000
},
{
"epoch": 21.28,
"learning_rate": 0.0011488817891373803,
"loss": 6.1756,
"step": 166500
},
{
"epoch": 21.34,
"learning_rate": 0.0011463258785942491,
"loss": 6.1694,
"step": 167000
},
{
"epoch": 21.41,
"learning_rate": 0.0011437699680511182,
"loss": 6.1755,
"step": 167500
},
{
"epoch": 21.47,
"learning_rate": 0.0011412140575079873,
"loss": 6.1735,
"step": 168000
},
{
"epoch": 21.53,
"learning_rate": 0.0011386581469648562,
"loss": 6.1676,
"step": 168500
},
{
"epoch": 21.6,
"learning_rate": 0.0011361022364217252,
"loss": 6.1716,
"step": 169000
},
{
"epoch": 21.66,
"learning_rate": 0.0011335463258785943,
"loss": 6.1737,
"step": 169500
},
{
"epoch": 21.73,
"learning_rate": 0.0011309904153354634,
"loss": 6.1743,
"step": 170000
},
{
"epoch": 21.79,
"learning_rate": 0.0011284345047923323,
"loss": 6.1749,
"step": 170500
},
{
"epoch": 21.85,
"learning_rate": 0.0011258785942492014,
"loss": 6.1724,
"step": 171000
},
{
"epoch": 21.92,
"learning_rate": 0.0011233226837060704,
"loss": 6.1676,
"step": 171500
},
{
"epoch": 21.98,
"learning_rate": 0.0011207667731629393,
"loss": 6.1823,
"step": 172000
},
{
"epoch": 22.0,
"eval_loss": 6.115005016326904,
"eval_runtime": 57.523,
"eval_samples_per_second": 4.363,
"eval_steps_per_second": 0.139,
"step": 172150
},
{
"epoch": 22.04,
"learning_rate": 0.0011182108626198084,
"loss": 6.1729,
"step": 172500
},
{
"epoch": 22.11,
"learning_rate": 0.0011156549520766775,
"loss": 6.1746,
"step": 173000
},
{
"epoch": 22.17,
"learning_rate": 0.0011130990415335463,
"loss": 6.1732,
"step": 173500
},
{
"epoch": 22.24,
"learning_rate": 0.0011105431309904152,
"loss": 6.1712,
"step": 174000
},
{
"epoch": 22.3,
"learning_rate": 0.0011079872204472843,
"loss": 6.1677,
"step": 174500
},
{
"epoch": 22.36,
"learning_rate": 0.0011054313099041534,
"loss": 6.1683,
"step": 175000
},
{
"epoch": 22.43,
"learning_rate": 0.0011028753993610222,
"loss": 6.1741,
"step": 175500
},
{
"epoch": 22.49,
"learning_rate": 0.0011003194888178913,
"loss": 6.1663,
"step": 176000
},
{
"epoch": 22.56,
"learning_rate": 0.0010977635782747604,
"loss": 6.1681,
"step": 176500
},
{
"epoch": 22.62,
"learning_rate": 0.0010952076677316295,
"loss": 6.174,
"step": 177000
},
{
"epoch": 22.68,
"learning_rate": 0.0010926517571884984,
"loss": 6.1674,
"step": 177500
},
{
"epoch": 22.75,
"learning_rate": 0.0010900958466453674,
"loss": 6.1701,
"step": 178000
},
{
"epoch": 22.81,
"learning_rate": 0.0010875399361022365,
"loss": 6.1724,
"step": 178500
},
{
"epoch": 22.88,
"learning_rate": 0.0010849840255591054,
"loss": 6.1712,
"step": 179000
},
{
"epoch": 22.94,
"learning_rate": 0.0010824281150159745,
"loss": 6.1679,
"step": 179500
},
{
"epoch": 23.0,
"eval_loss": 6.114222049713135,
"eval_runtime": 6.9135,
"eval_samples_per_second": 36.306,
"eval_steps_per_second": 1.157,
"step": 179975
},
{
"epoch": 23.0,
"learning_rate": 0.0010798722044728436,
"loss": 6.1666,
"step": 180000
},
{
"epoch": 23.07,
"learning_rate": 0.0010773162939297126,
"loss": 6.1702,
"step": 180500
},
{
"epoch": 23.13,
"learning_rate": 0.0010747603833865815,
"loss": 6.166,
"step": 181000
},
{
"epoch": 23.19,
"learning_rate": 0.0010722044728434506,
"loss": 6.1651,
"step": 181500
},
{
"epoch": 23.26,
"learning_rate": 0.0010696485623003197,
"loss": 6.1721,
"step": 182000
},
{
"epoch": 23.32,
"learning_rate": 0.0010670926517571885,
"loss": 6.1721,
"step": 182500
},
{
"epoch": 23.39,
"learning_rate": 0.0010645367412140576,
"loss": 6.168,
"step": 183000
},
{
"epoch": 23.45,
"learning_rate": 0.0010619808306709265,
"loss": 6.1732,
"step": 183500
},
{
"epoch": 23.51,
"learning_rate": 0.0010594249201277954,
"loss": 6.171,
"step": 184000
},
{
"epoch": 23.58,
"learning_rate": 0.0010568690095846644,
"loss": 6.1697,
"step": 184500
},
{
"epoch": 23.64,
"learning_rate": 0.0010543130990415335,
"loss": 6.1667,
"step": 185000
},
{
"epoch": 23.71,
"learning_rate": 0.0010517571884984026,
"loss": 6.1762,
"step": 185500
},
{
"epoch": 23.77,
"learning_rate": 0.0010492012779552715,
"loss": 6.1661,
"step": 186000
},
{
"epoch": 23.83,
"learning_rate": 0.0010466453674121406,
"loss": 6.1741,
"step": 186500
},
{
"epoch": 23.9,
"learning_rate": 0.0010440894568690096,
"loss": 6.166,
"step": 187000
},
{
"epoch": 23.96,
"learning_rate": 0.0010415335463258785,
"loss": 6.1697,
"step": 187500
},
{
"epoch": 24.0,
"eval_loss": 6.112667083740234,
"eval_runtime": 6.8444,
"eval_samples_per_second": 36.672,
"eval_steps_per_second": 1.169,
"step": 187800
},
{
"epoch": 24.03,
"learning_rate": 0.0010389776357827476,
"loss": 6.1674,
"step": 188000
},
{
"epoch": 24.09,
"learning_rate": 0.0010364217252396167,
"loss": 6.169,
"step": 188500
},
{
"epoch": 24.15,
"learning_rate": 0.0010338658146964858,
"loss": 6.1648,
"step": 189000
},
{
"epoch": 24.22,
"learning_rate": 0.0010313099041533546,
"loss": 6.1711,
"step": 189500
},
{
"epoch": 24.28,
"learning_rate": 0.0010287539936102237,
"loss": 6.1621,
"step": 190000
},
{
"epoch": 24.35,
"learning_rate": 0.0010261980830670928,
"loss": 6.1666,
"step": 190500
},
{
"epoch": 24.41,
"learning_rate": 0.0010236421725239617,
"loss": 6.1685,
"step": 191000
},
{
"epoch": 24.47,
"learning_rate": 0.0010210862619808307,
"loss": 6.171,
"step": 191500
},
{
"epoch": 24.54,
"learning_rate": 0.0010185303514376998,
"loss": 6.1777,
"step": 192000
},
{
"epoch": 24.6,
"learning_rate": 0.001015974440894569,
"loss": 6.1717,
"step": 192500
},
{
"epoch": 24.66,
"learning_rate": 0.0010134185303514378,
"loss": 6.1702,
"step": 193000
},
{
"epoch": 24.73,
"learning_rate": 0.0010108626198083066,
"loss": 6.1728,
"step": 193500
},
{
"epoch": 24.79,
"learning_rate": 0.0010083067092651757,
"loss": 6.1671,
"step": 194000
},
{
"epoch": 24.86,
"learning_rate": 0.0010057507987220446,
"loss": 6.16,
"step": 194500
},
{
"epoch": 24.92,
"learning_rate": 0.0010031948881789137,
"loss": 6.1631,
"step": 195000
},
{
"epoch": 24.98,
"learning_rate": 0.0010006389776357828,
"loss": 6.1719,
"step": 195500
},
{
"epoch": 25.0,
"eval_loss": 6.112457275390625,
"eval_runtime": 6.8286,
"eval_samples_per_second": 36.757,
"eval_steps_per_second": 1.172,
"step": 195625
},
{
"epoch": 25.05,
"learning_rate": 0.0009980830670926518,
"loss": 6.1638,
"step": 196000
},
{
"epoch": 25.11,
"learning_rate": 0.0009955271565495207,
"loss": 6.1723,
"step": 196500
},
{
"epoch": 25.18,
"learning_rate": 0.0009929712460063898,
"loss": 6.1753,
"step": 197000
},
{
"epoch": 25.24,
"learning_rate": 0.0009904153354632589,
"loss": 6.1636,
"step": 197500
},
{
"epoch": 25.3,
"learning_rate": 0.0009878594249201277,
"loss": 6.1734,
"step": 198000
},
{
"epoch": 25.37,
"learning_rate": 0.0009853035143769968,
"loss": 6.1705,
"step": 198500
},
{
"epoch": 25.43,
"learning_rate": 0.000982747603833866,
"loss": 6.1615,
"step": 199000
},
{
"epoch": 25.5,
"learning_rate": 0.000980191693290735,
"loss": 6.1636,
"step": 199500
},
{
"epoch": 25.56,
"learning_rate": 0.0009776357827476038,
"loss": 6.1756,
"step": 200000
},
{
"epoch": 25.62,
"learning_rate": 0.0009750798722044729,
"loss": 6.1726,
"step": 200500
},
{
"epoch": 25.69,
"learning_rate": 0.0009725239616613418,
"loss": 6.1758,
"step": 201000
},
{
"epoch": 25.75,
"learning_rate": 0.0009699680511182109,
"loss": 6.1662,
"step": 201500
},
{
"epoch": 25.81,
"learning_rate": 0.0009674121405750799,
"loss": 6.1656,
"step": 202000
},
{
"epoch": 25.88,
"learning_rate": 0.0009648562300319489,
"loss": 6.1568,
"step": 202500
},
{
"epoch": 25.94,
"learning_rate": 0.0009623003194888179,
"loss": 6.1678,
"step": 203000
},
{
"epoch": 26.0,
"eval_loss": 6.112171649932861,
"eval_runtime": 6.8367,
"eval_samples_per_second": 36.714,
"eval_steps_per_second": 1.17,
"step": 203450
},
{
"epoch": 26.01,
"learning_rate": 0.0009597444089456869,
"loss": 6.1698,
"step": 203500
},
{
"epoch": 26.07,
"learning_rate": 0.000957188498402556,
"loss": 6.158,
"step": 204000
},
{
"epoch": 26.13,
"learning_rate": 0.000954632587859425,
"loss": 6.1723,
"step": 204500
},
{
"epoch": 26.2,
"learning_rate": 0.000952076677316294,
"loss": 6.1665,
"step": 205000
},
{
"epoch": 26.26,
"learning_rate": 0.000949520766773163,
"loss": 6.1657,
"step": 205500
},
{
"epoch": 26.33,
"learning_rate": 0.0009469648562300319,
"loss": 6.1692,
"step": 206000
},
{
"epoch": 26.39,
"learning_rate": 0.000944408945686901,
"loss": 6.1713,
"step": 206500
},
{
"epoch": 26.45,
"learning_rate": 0.0009418530351437699,
"loss": 6.1642,
"step": 207000
},
{
"epoch": 26.52,
"learning_rate": 0.000939297124600639,
"loss": 6.1679,
"step": 207500
},
{
"epoch": 26.58,
"learning_rate": 0.000936741214057508,
"loss": 6.1662,
"step": 208000
},
{
"epoch": 26.65,
"learning_rate": 0.0009341853035143771,
"loss": 6.1701,
"step": 208500
},
{
"epoch": 26.71,
"learning_rate": 0.000931629392971246,
"loss": 6.1702,
"step": 209000
},
{
"epoch": 26.77,
"learning_rate": 0.000929073482428115,
"loss": 6.1672,
"step": 209500
},
{
"epoch": 26.84,
"learning_rate": 0.0009265175718849841,
"loss": 6.1697,
"step": 210000
},
{
"epoch": 26.9,
"learning_rate": 0.000923961661341853,
"loss": 6.1657,
"step": 210500
},
{
"epoch": 26.96,
"learning_rate": 0.000921405750798722,
"loss": 6.1664,
"step": 211000
},
{
"epoch": 27.0,
"eval_loss": 6.112409591674805,
"eval_runtime": 6.9228,
"eval_samples_per_second": 36.257,
"eval_steps_per_second": 1.156,
"step": 211275
},
{
"epoch": 27.03,
"learning_rate": 0.000918849840255591,
"loss": 6.164,
"step": 211500
},
{
"epoch": 27.09,
"learning_rate": 0.0009162939297124601,
"loss": 6.1717,
"step": 212000
},
{
"epoch": 27.16,
"learning_rate": 0.0009137380191693291,
"loss": 6.1646,
"step": 212500
},
{
"epoch": 27.22,
"learning_rate": 0.0009111821086261981,
"loss": 6.1641,
"step": 213000
},
{
"epoch": 27.28,
"learning_rate": 0.0009086261980830671,
"loss": 6.1634,
"step": 213500
},
{
"epoch": 27.35,
"learning_rate": 0.0009060702875399361,
"loss": 6.17,
"step": 214000
},
{
"epoch": 27.41,
"learning_rate": 0.0009035143769968052,
"loss": 6.1703,
"step": 214500
},
{
"epoch": 27.48,
"learning_rate": 0.0009009584664536742,
"loss": 6.1721,
"step": 215000
},
{
"epoch": 27.54,
"learning_rate": 0.000898402555910543,
"loss": 6.1701,
"step": 215500
},
{
"epoch": 27.6,
"learning_rate": 0.0008958466453674121,
"loss": 6.1649,
"step": 216000
},
{
"epoch": 27.67,
"learning_rate": 0.0008932907348242811,
"loss": 6.1727,
"step": 216500
},
{
"epoch": 27.73,
"learning_rate": 0.0008907348242811502,
"loss": 6.1688,
"step": 217000
},
{
"epoch": 27.8,
"learning_rate": 0.0008881789137380192,
"loss": 6.166,
"step": 217500
},
{
"epoch": 27.86,
"learning_rate": 0.0008856230031948882,
"loss": 6.1683,
"step": 218000
},
{
"epoch": 27.92,
"learning_rate": 0.0008830670926517572,
"loss": 6.1647,
"step": 218500
},
{
"epoch": 27.99,
"learning_rate": 0.0008805111821086262,
"loss": 6.1643,
"step": 219000
},
{
"epoch": 28.0,
"eval_loss": 6.111097812652588,
"eval_runtime": 6.8343,
"eval_samples_per_second": 36.727,
"eval_steps_per_second": 1.171,
"step": 219100
},
{
"epoch": 28.05,
"learning_rate": 0.0008779552715654953,
"loss": 6.1691,
"step": 219500
},
{
"epoch": 28.12,
"learning_rate": 0.0008753993610223643,
"loss": 6.1653,
"step": 220000
},
{
"epoch": 28.18,
"learning_rate": 0.0008728434504792332,
"loss": 6.1633,
"step": 220500
},
{
"epoch": 28.24,
"learning_rate": 0.0008702875399361022,
"loss": 6.1705,
"step": 221000
},
{
"epoch": 28.31,
"learning_rate": 0.0008677316293929713,
"loss": 6.1695,
"step": 221500
},
{
"epoch": 28.37,
"learning_rate": 0.0008651757188498403,
"loss": 6.168,
"step": 222000
},
{
"epoch": 28.43,
"learning_rate": 0.0008626198083067092,
"loss": 6.1652,
"step": 222500
},
{
"epoch": 28.5,
"learning_rate": 0.0008600638977635783,
"loss": 6.1695,
"step": 223000
},
{
"epoch": 28.56,
"learning_rate": 0.0008575079872204473,
"loss": 6.1675,
"step": 223500
},
{
"epoch": 28.63,
"learning_rate": 0.0008549520766773164,
"loss": 6.1634,
"step": 224000
},
{
"epoch": 28.69,
"learning_rate": 0.0008523961661341853,
"loss": 6.1728,
"step": 224500
},
{
"epoch": 28.75,
"learning_rate": 0.0008498402555910543,
"loss": 6.1689,
"step": 225000
},
{
"epoch": 28.82,
"learning_rate": 0.0008472843450479233,
"loss": 6.1664,
"step": 225500
},
{
"epoch": 28.88,
"learning_rate": 0.0008447284345047923,
"loss": 6.1696,
"step": 226000
},
{
"epoch": 28.95,
"learning_rate": 0.0008421725239616614,
"loss": 6.1629,
"step": 226500
},
{
"epoch": 29.0,
"eval_loss": 6.1112260818481445,
"eval_runtime": 6.8361,
"eval_samples_per_second": 36.717,
"eval_steps_per_second": 1.17,
"step": 226925
},
{
"epoch": 29.01,
"learning_rate": 0.0008396166134185303,
"loss": 6.1625,
"step": 227000
},
{
"epoch": 29.07,
"learning_rate": 0.0008370607028753994,
"loss": 6.167,
"step": 227500
},
{
"epoch": 29.14,
"learning_rate": 0.0008345047923322684,
"loss": 6.1627,
"step": 228000
},
{
"epoch": 29.2,
"learning_rate": 0.0008319488817891374,
"loss": 6.1653,
"step": 228500
},
{
"epoch": 29.27,
"learning_rate": 0.0008293929712460064,
"loss": 6.1711,
"step": 229000
},
{
"epoch": 29.33,
"learning_rate": 0.0008268370607028754,
"loss": 6.1661,
"step": 229500
},
{
"epoch": 29.39,
"learning_rate": 0.0008242811501597445,
"loss": 6.1655,
"step": 230000
},
{
"epoch": 29.46,
"learning_rate": 0.0008217252396166134,
"loss": 6.1738,
"step": 230500
},
{
"epoch": 29.52,
"learning_rate": 0.0008191693290734825,
"loss": 6.1657,
"step": 231000
},
{
"epoch": 29.58,
"learning_rate": 0.0008166134185303514,
"loss": 6.1686,
"step": 231500
},
{
"epoch": 29.65,
"learning_rate": 0.0008140575079872204,
"loss": 6.1636,
"step": 232000
},
{
"epoch": 29.71,
"learning_rate": 0.0008115015974440895,
"loss": 6.1645,
"step": 232500
},
{
"epoch": 29.78,
"learning_rate": 0.0008089456869009585,
"loss": 6.1653,
"step": 233000
},
{
"epoch": 29.84,
"learning_rate": 0.0008063897763578275,
"loss": 6.1602,
"step": 233500
},
{
"epoch": 29.9,
"learning_rate": 0.0008038338658146965,
"loss": 6.1667,
"step": 234000
},
{
"epoch": 29.97,
"learning_rate": 0.0008012779552715655,
"loss": 6.1712,
"step": 234500
},
{
"epoch": 30.0,
"eval_loss": 6.1112165451049805,
"eval_runtime": 6.8132,
"eval_samples_per_second": 36.84,
"eval_steps_per_second": 1.174,
"step": 234750
},
{
"epoch": 30.03,
"learning_rate": 0.0007987220447284346,
"loss": 6.1752,
"step": 235000
},
{
"epoch": 30.1,
"learning_rate": 0.0007961661341853034,
"loss": 6.1726,
"step": 235500
},
{
"epoch": 30.16,
"learning_rate": 0.0007936102236421725,
"loss": 6.1653,
"step": 236000
},
{
"epoch": 30.22,
"learning_rate": 0.0007910543130990415,
"loss": 6.1677,
"step": 236500
},
{
"epoch": 30.29,
"learning_rate": 0.0007884984025559106,
"loss": 6.1666,
"step": 237000
},
{
"epoch": 30.35,
"learning_rate": 0.0007859424920127796,
"loss": 6.1628,
"step": 237500
},
{
"epoch": 30.42,
"learning_rate": 0.0007833865814696485,
"loss": 6.1717,
"step": 238000
},
{
"epoch": 30.48,
"learning_rate": 0.0007808306709265176,
"loss": 6.1636,
"step": 238500
},
{
"epoch": 30.54,
"learning_rate": 0.0007782747603833866,
"loss": 6.1666,
"step": 239000
},
{
"epoch": 30.61,
"learning_rate": 0.0007757188498402557,
"loss": 6.1612,
"step": 239500
},
{
"epoch": 30.67,
"learning_rate": 0.0007731629392971247,
"loss": 6.1623,
"step": 240000
},
{
"epoch": 30.73,
"learning_rate": 0.0007706070287539936,
"loss": 6.1657,
"step": 240500
},
{
"epoch": 30.8,
"learning_rate": 0.0007680511182108626,
"loss": 6.1634,
"step": 241000
},
{
"epoch": 30.86,
"learning_rate": 0.0007654952076677316,
"loss": 6.1704,
"step": 241500
},
{
"epoch": 30.93,
"learning_rate": 0.0007629392971246007,
"loss": 6.1684,
"step": 242000
},
{
"epoch": 30.99,
"learning_rate": 0.0007603833865814696,
"loss": 6.1588,
"step": 242500
},
{
"epoch": 31.0,
"eval_loss": 6.110002040863037,
"eval_runtime": 6.9156,
"eval_samples_per_second": 36.295,
"eval_steps_per_second": 1.157,
"step": 242575
},
{
"epoch": 31.05,
"learning_rate": 0.0007578274760383387,
"loss": 6.167,
"step": 243000
},
{
"epoch": 31.12,
"learning_rate": 0.0007552715654952077,
"loss": 6.1609,
"step": 243500
},
{
"epoch": 31.18,
"learning_rate": 0.0007527156549520767,
"loss": 6.1638,
"step": 244000
},
{
"epoch": 31.25,
"learning_rate": 0.0007501597444089458,
"loss": 6.1655,
"step": 244500
},
{
"epoch": 31.31,
"learning_rate": 0.0007476038338658147,
"loss": 6.1675,
"step": 245000
},
{
"epoch": 31.37,
"learning_rate": 0.0007450479233226837,
"loss": 6.1714,
"step": 245500
},
{
"epoch": 31.44,
"learning_rate": 0.0007424920127795527,
"loss": 6.1628,
"step": 246000
},
{
"epoch": 31.5,
"learning_rate": 0.0007399361022364218,
"loss": 6.1674,
"step": 246500
},
{
"epoch": 31.57,
"learning_rate": 0.0007373801916932907,
"loss": 6.1589,
"step": 247000
},
{
"epoch": 31.63,
"learning_rate": 0.0007348242811501597,
"loss": 6.1727,
"step": 247500
},
{
"epoch": 31.69,
"learning_rate": 0.0007322683706070288,
"loss": 6.1688,
"step": 248000
},
{
"epoch": 31.76,
"learning_rate": 0.0007297124600638978,
"loss": 6.1678,
"step": 248500
},
{
"epoch": 31.82,
"learning_rate": 0.0007271565495207669,
"loss": 6.162,
"step": 249000
},
{
"epoch": 31.88,
"learning_rate": 0.0007246006389776358,
"loss": 6.1671,
"step": 249500
},
{
"epoch": 31.95,
"learning_rate": 0.0007220447284345049,
"loss": 6.1686,
"step": 250000
},
{
"epoch": 32.0,
"eval_loss": 6.110647201538086,
"eval_runtime": 6.86,
"eval_samples_per_second": 36.589,
"eval_steps_per_second": 1.166,
"step": 250400
},
{
"epoch": 32.01,
"learning_rate": 0.0007194888178913738,
"loss": 6.1651,
"step": 250500
},
{
"epoch": 32.08,
"learning_rate": 0.0007169329073482428,
"loss": 6.1736,
"step": 251000
},
{
"epoch": 32.14,
"learning_rate": 0.0007143769968051118,
"loss": 6.1687,
"step": 251500
},
{
"epoch": 32.2,
"learning_rate": 0.0007118210862619808,
"loss": 6.1646,
"step": 252000
},
{
"epoch": 32.27,
"learning_rate": 0.0007092651757188499,
"loss": 6.1676,
"step": 252500
},
{
"epoch": 32.33,
"learning_rate": 0.0007067092651757189,
"loss": 6.1536,
"step": 253000
},
{
"epoch": 32.4,
"learning_rate": 0.0007041533546325878,
"loss": 6.1628,
"step": 253500
},
{
"epoch": 32.46,
"learning_rate": 0.0007015974440894569,
"loss": 6.1662,
"step": 254000
},
{
"epoch": 32.52,
"learning_rate": 0.0006990415335463259,
"loss": 6.171,
"step": 254500
},
{
"epoch": 32.59,
"learning_rate": 0.000696485623003195,
"loss": 6.1691,
"step": 255000
},
{
"epoch": 32.65,
"learning_rate": 0.0006939297124600638,
"loss": 6.1668,
"step": 255500
},
{
"epoch": 32.72,
"learning_rate": 0.0006913738019169329,
"loss": 6.1738,
"step": 256000
},
{
"epoch": 32.78,
"learning_rate": 0.0006888178913738019,
"loss": 6.1652,
"step": 256500
},
{
"epoch": 32.84,
"learning_rate": 0.0006862619808306709,
"loss": 6.164,
"step": 257000
},
{
"epoch": 32.91,
"learning_rate": 0.00068370607028754,
"loss": 6.1592,
"step": 257500
},
{
"epoch": 32.97,
"learning_rate": 0.0006811501597444089,
"loss": 6.167,
"step": 258000
},
{
"epoch": 33.0,
"eval_loss": 6.110122203826904,
"eval_runtime": 6.8168,
"eval_samples_per_second": 36.821,
"eval_steps_per_second": 1.174,
"step": 258225
},
{
"epoch": 33.04,
"learning_rate": 0.000678594249201278,
"loss": 6.1653,
"step": 258500
},
{
"epoch": 33.1,
"learning_rate": 0.000676038338658147,
"loss": 6.1616,
"step": 259000
},
{
"epoch": 33.16,
"learning_rate": 0.0006734824281150161,
"loss": 6.1651,
"step": 259500
},
{
"epoch": 33.23,
"learning_rate": 0.0006709265175718851,
"loss": 6.1698,
"step": 260000
},
{
"epoch": 33.29,
"learning_rate": 0.0006683706070287539,
"loss": 6.1654,
"step": 260500
},
{
"epoch": 33.35,
"learning_rate": 0.000665814696485623,
"loss": 6.1634,
"step": 261000
},
{
"epoch": 33.42,
"learning_rate": 0.000663258785942492,
"loss": 6.1647,
"step": 261500
},
{
"epoch": 33.48,
"learning_rate": 0.0006607028753993611,
"loss": 6.1584,
"step": 262000
},
{
"epoch": 33.55,
"learning_rate": 0.00065814696485623,
"loss": 6.1657,
"step": 262500
},
{
"epoch": 33.61,
"learning_rate": 0.000655591054313099,
"loss": 6.1676,
"step": 263000
},
{
"epoch": 33.67,
"learning_rate": 0.0006530351437699681,
"loss": 6.1595,
"step": 263500
},
{
"epoch": 33.74,
"learning_rate": 0.0006504792332268371,
"loss": 6.1643,
"step": 264000
},
{
"epoch": 33.8,
"learning_rate": 0.0006479233226837062,
"loss": 6.1737,
"step": 264500
},
{
"epoch": 33.87,
"learning_rate": 0.000645367412140575,
"loss": 6.1674,
"step": 265000
},
{
"epoch": 33.93,
"learning_rate": 0.0006428115015974441,
"loss": 6.1719,
"step": 265500
},
{
"epoch": 33.99,
"learning_rate": 0.0006402555910543131,
"loss": 6.1695,
"step": 266000
},
{
"epoch": 34.0,
"eval_loss": 6.110330104827881,
"eval_runtime": 6.7957,
"eval_samples_per_second": 36.935,
"eval_steps_per_second": 1.177,
"step": 266050
}
],
"logging_steps": 500,
"max_steps": 391250,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 4.575802027758605e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}
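
The log_history above can be inspected programmatically. Below is a minimal Python sketch, assuming a local copy of this trainer_state.json in the standard Hugging Face Trainer format (the file path is illustrative, not part of the original file), that separates the per-step training losses from the per-epoch eval entries and prints the eval_loss curve alongside the recorded best checkpoint.

    # Minimal sketch; assumes this trainer_state.json has been downloaded locally.
    import json

    with open("trainer_state.json") as f:  # illustrative path
        state = json.load(f)

    # Training-loss entries carry "loss"; per-epoch evaluation entries carry "eval_loss".
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    print("best eval_loss:", state["best_metric"])
    print("best checkpoint:", state["best_model_checkpoint"])
    for e in eval_logs:
        print(f"epoch {e['epoch']:>5}: eval_loss = {e['eval_loss']:.4f}")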