{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7733994606041957,
"eval_steps": 500,
"global_step": 21364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011222328814234257,
"grad_norm": 37.096622467041016,
"learning_rate": 1.0157273918741808e-06,
"loss": 8.8686,
"step": 31
},
{
"epoch": 0.0022444657628468514,
"grad_norm": 13.880346298217773,
"learning_rate": 2.0314547837483616e-06,
"loss": 7.6419,
"step": 62
},
{
"epoch": 0.0033666986442702773,
"grad_norm": 16.09684944152832,
"learning_rate": 3.0471821756225426e-06,
"loss": 6.4382,
"step": 93
},
{
"epoch": 0.004488931525693703,
"grad_norm": 19.170230865478516,
"learning_rate": 4.062909567496723e-06,
"loss": 5.3399,
"step": 124
},
{
"epoch": 0.005611164407117128,
"grad_norm": 24.654130935668945,
"learning_rate": 5.078636959370905e-06,
"loss": 4.7646,
"step": 155
},
{
"epoch": 0.006733397288540555,
"grad_norm": 24.712974548339844,
"learning_rate": 6.094364351245085e-06,
"loss": 4.4667,
"step": 186
},
{
"epoch": 0.00785563016996398,
"grad_norm": 17.238990783691406,
"learning_rate": 7.110091743119267e-06,
"loss": 4.2168,
"step": 217
},
{
"epoch": 0.008977863051387406,
"grad_norm": 20.40213394165039,
"learning_rate": 8.125819134993446e-06,
"loss": 4.0355,
"step": 248
},
{
"epoch": 0.010100095932810832,
"grad_norm": 15.052313804626465,
"learning_rate": 9.141546526867629e-06,
"loss": 3.8458,
"step": 279
},
{
"epoch": 0.011222328814234257,
"grad_norm": 18.802026748657227,
"learning_rate": 1.015727391874181e-05,
"loss": 3.6688,
"step": 310
},
{
"epoch": 0.012344561695657683,
"grad_norm": 16.62171745300293,
"learning_rate": 1.117300131061599e-05,
"loss": 3.52,
"step": 341
},
{
"epoch": 0.01346679457708111,
"grad_norm": 16.29236602783203,
"learning_rate": 1.218872870249017e-05,
"loss": 3.402,
"step": 372
},
{
"epoch": 0.014589027458504534,
"grad_norm": 11.65068531036377,
"learning_rate": 1.3204456094364351e-05,
"loss": 3.2829,
"step": 403
},
{
"epoch": 0.01571126033992796,
"grad_norm": 10.617654800415039,
"learning_rate": 1.4220183486238533e-05,
"loss": 3.2008,
"step": 434
},
{
"epoch": 0.016833493221351387,
"grad_norm": 10.611294746398926,
"learning_rate": 1.5235910878112714e-05,
"loss": 3.1249,
"step": 465
},
{
"epoch": 0.01795572610277481,
"grad_norm": 9.946114540100098,
"learning_rate": 1.6251638269986893e-05,
"loss": 3.0503,
"step": 496
},
{
"epoch": 0.019077958984198236,
"grad_norm": 10.92148494720459,
"learning_rate": 1.7267365661861077e-05,
"loss": 2.9903,
"step": 527
},
{
"epoch": 0.020200191865621664,
"grad_norm": 8.329671859741211,
"learning_rate": 1.8283093053735257e-05,
"loss": 2.9261,
"step": 558
},
{
"epoch": 0.02132242474704509,
"grad_norm": 7.897571086883545,
"learning_rate": 1.9298820445609438e-05,
"loss": 2.889,
"step": 589
},
{
"epoch": 0.022444657628468513,
"grad_norm": 7.548309326171875,
"learning_rate": 2.031454783748362e-05,
"loss": 2.7945,
"step": 620
},
{
"epoch": 0.02356689050989194,
"grad_norm": 8.54383659362793,
"learning_rate": 2.13302752293578e-05,
"loss": 2.7538,
"step": 651
},
{
"epoch": 0.024689123391315366,
"grad_norm": 7.025435924530029,
"learning_rate": 2.234600262123198e-05,
"loss": 2.7075,
"step": 682
},
{
"epoch": 0.02581135627273879,
"grad_norm": 7.59956169128418,
"learning_rate": 2.336173001310616e-05,
"loss": 2.6625,
"step": 713
},
{
"epoch": 0.02693358915416222,
"grad_norm": 6.982921123504639,
"learning_rate": 2.437745740498034e-05,
"loss": 2.6248,
"step": 744
},
{
"epoch": 0.028055822035585643,
"grad_norm": 6.033556938171387,
"learning_rate": 2.5393184796854525e-05,
"loss": 2.5724,
"step": 775
},
{
"epoch": 0.029178054917009068,
"grad_norm": 6.674008846282959,
"learning_rate": 2.6408912188728702e-05,
"loss": 2.5292,
"step": 806
},
{
"epoch": 0.030300287798432492,
"grad_norm": 6.499022006988525,
"learning_rate": 2.7424639580602886e-05,
"loss": 2.496,
"step": 837
},
{
"epoch": 0.03142252067985592,
"grad_norm": 6.163687229156494,
"learning_rate": 2.8440366972477066e-05,
"loss": 2.4541,
"step": 868
},
{
"epoch": 0.032544753561279345,
"grad_norm": 5.20266580581665,
"learning_rate": 2.9456094364351244e-05,
"loss": 2.449,
"step": 899
},
{
"epoch": 0.03366698644270277,
"grad_norm": 5.6633830070495605,
"learning_rate": 3.0471821756225428e-05,
"loss": 2.4085,
"step": 930
},
{
"epoch": 0.034789219324126194,
"grad_norm": 6.414912700653076,
"learning_rate": 3.148754914809961e-05,
"loss": 2.3791,
"step": 961
},
{
"epoch": 0.03591145220554962,
"grad_norm": 4.983119964599609,
"learning_rate": 3.2503276539973785e-05,
"loss": 2.3505,
"step": 992
},
{
"epoch": 0.03703368508697305,
"grad_norm": 5.280698299407959,
"learning_rate": 3.351900393184797e-05,
"loss": 2.3191,
"step": 1023
},
{
"epoch": 0.03815591796839647,
"grad_norm": 5.565277099609375,
"learning_rate": 3.453473132372215e-05,
"loss": 2.2957,
"step": 1054
},
{
"epoch": 0.0392781508498199,
"grad_norm": 5.02451753616333,
"learning_rate": 3.555045871559633e-05,
"loss": 2.2618,
"step": 1085
},
{
"epoch": 0.04040038373124333,
"grad_norm": 4.424225807189941,
"learning_rate": 3.6566186107470514e-05,
"loss": 2.2512,
"step": 1116
},
{
"epoch": 0.04152261661266675,
"grad_norm": 6.270051002502441,
"learning_rate": 3.7581913499344695e-05,
"loss": 2.2354,
"step": 1147
},
{
"epoch": 0.04264484949409018,
"grad_norm": 14.256332397460938,
"learning_rate": 3.8597640891218876e-05,
"loss": 2.2364,
"step": 1178
},
{
"epoch": 0.043767082375513605,
"grad_norm": 4.837010383605957,
"learning_rate": 3.9613368283093056e-05,
"loss": 2.2346,
"step": 1209
},
{
"epoch": 0.044889315256937026,
"grad_norm": 3.9555633068084717,
"learning_rate": 4.062909567496724e-05,
"loss": 2.2003,
"step": 1240
},
{
"epoch": 0.046011548138360454,
"grad_norm": 4.136904716491699,
"learning_rate": 4.164482306684142e-05,
"loss": 2.2056,
"step": 1271
},
{
"epoch": 0.04713378101978388,
"grad_norm": 4.25378942489624,
"learning_rate": 4.26605504587156e-05,
"loss": 2.1395,
"step": 1302
},
{
"epoch": 0.048256013901207304,
"grad_norm": 3.6108360290527344,
"learning_rate": 4.367627785058978e-05,
"loss": 2.1296,
"step": 1333
},
{
"epoch": 0.04937824678263073,
"grad_norm": 3.66212797164917,
"learning_rate": 4.469200524246396e-05,
"loss": 2.1316,
"step": 1364
},
{
"epoch": 0.05050047966405416,
"grad_norm": 3.5523183345794678,
"learning_rate": 4.570773263433814e-05,
"loss": 2.1381,
"step": 1395
},
{
"epoch": 0.05162271254547758,
"grad_norm": 3.710803747177124,
"learning_rate": 4.672346002621232e-05,
"loss": 2.1296,
"step": 1426
},
{
"epoch": 0.05274494542690101,
"grad_norm": 3.346266031265259,
"learning_rate": 4.77391874180865e-05,
"loss": 2.0755,
"step": 1457
},
{
"epoch": 0.05386717830832444,
"grad_norm": 3.264901876449585,
"learning_rate": 4.875491480996068e-05,
"loss": 2.0902,
"step": 1488
},
{
"epoch": 0.05498941118974786,
"grad_norm": 3.031913995742798,
"learning_rate": 4.977064220183487e-05,
"loss": 2.1002,
"step": 1519
},
{
"epoch": 0.056111644071171286,
"grad_norm": 3.3827006816864014,
"learning_rate": 4.9999915451558777e-05,
"loss": 2.111,
"step": 1550
},
{
"epoch": 0.057233876952594714,
"grad_norm": 3.5572054386138916,
"learning_rate": 4.999955597496219e-05,
"loss": 2.0809,
"step": 1581
},
{
"epoch": 0.058356109834018136,
"grad_norm": 3.2875311374664307,
"learning_rate": 4.9998914381774255e-05,
"loss": 2.0562,
"step": 1612
},
{
"epoch": 0.059478342715441564,
"grad_norm": 2.903362274169922,
"learning_rate": 4.999799067923527e-05,
"loss": 2.0598,
"step": 1643
},
{
"epoch": 0.060600575596864985,
"grad_norm": 2.980804681777954,
"learning_rate": 4.999678487776908e-05,
"loss": 2.0458,
"step": 1674
},
{
"epoch": 0.06172280847828841,
"grad_norm": 2.880610466003418,
"learning_rate": 4.9995296990983006e-05,
"loss": 2.0433,
"step": 1705
},
{
"epoch": 0.06284504135971183,
"grad_norm": 2.7269234657287598,
"learning_rate": 4.999352703566763e-05,
"loss": 2.0189,
"step": 1736
},
{
"epoch": 0.06396727424113527,
"grad_norm": 2.808084487915039,
"learning_rate": 4.999147503179668e-05,
"loss": 2.0083,
"step": 1767
},
{
"epoch": 0.06508950712255869,
"grad_norm": 2.925065279006958,
"learning_rate": 4.998914100252672e-05,
"loss": 2.001,
"step": 1798
},
{
"epoch": 0.06621174000398211,
"grad_norm": 2.996300458908081,
"learning_rate": 4.998652497419696e-05,
"loss": 1.9877,
"step": 1829
},
{
"epoch": 0.06733397288540555,
"grad_norm": 2.6028084754943848,
"learning_rate": 4.9983626976328927e-05,
"loss": 1.9778,
"step": 1860
},
{
"epoch": 0.06845620576682897,
"grad_norm": 2.4577603340148926,
"learning_rate": 4.998044704162613e-05,
"loss": 1.9998,
"step": 1891
},
{
"epoch": 0.06957843864825239,
"grad_norm": 2.4269509315490723,
"learning_rate": 4.9976985205973705e-05,
"loss": 1.9813,
"step": 1922
},
{
"epoch": 0.07070067152967582,
"grad_norm": 2.6069250106811523,
"learning_rate": 4.997324150843799e-05,
"loss": 1.9781,
"step": 1953
},
{
"epoch": 0.07182290441109924,
"grad_norm": 2.5287699699401855,
"learning_rate": 4.99692159912661e-05,
"loss": 1.9684,
"step": 1984
},
{
"epoch": 0.07294513729252267,
"grad_norm": 2.6519899368286133,
"learning_rate": 4.996490869988546e-05,
"loss": 1.9821,
"step": 2015
},
{
"epoch": 0.0740673701739461,
"grad_norm": 2.525928497314453,
"learning_rate": 4.996031968290326e-05,
"loss": 1.9512,
"step": 2046
},
{
"epoch": 0.07518960305536952,
"grad_norm": 2.4517486095428467,
"learning_rate": 4.995544899210594e-05,
"loss": 1.9283,
"step": 2077
},
{
"epoch": 0.07631183593679294,
"grad_norm": 2.7807457447052,
"learning_rate": 4.9950296682458583e-05,
"loss": 1.9448,
"step": 2108
},
{
"epoch": 0.07743406881821638,
"grad_norm": 2.4739558696746826,
"learning_rate": 4.994486281210429e-05,
"loss": 1.946,
"step": 2139
},
{
"epoch": 0.0785563016996398,
"grad_norm": 2.6515214443206787,
"learning_rate": 4.9939147442363566e-05,
"loss": 1.9474,
"step": 2170
},
{
"epoch": 0.07967853458106322,
"grad_norm": 2.8361852169036865,
"learning_rate": 4.9933150637733574e-05,
"loss": 1.9463,
"step": 2201
},
{
"epoch": 0.08080076746248666,
"grad_norm": 2.332261323928833,
"learning_rate": 4.992687246588743e-05,
"loss": 1.9607,
"step": 2232
},
{
"epoch": 0.08192300034391008,
"grad_norm": 2.3486499786376953,
"learning_rate": 4.992031299767347e-05,
"loss": 1.9248,
"step": 2263
},
{
"epoch": 0.0830452332253335,
"grad_norm": 3.125208616256714,
"learning_rate": 4.9913472307114386e-05,
"loss": 1.9088,
"step": 2294
},
{
"epoch": 0.08416746610675693,
"grad_norm": 2.2809853553771973,
"learning_rate": 4.9906350471406446e-05,
"loss": 1.9199,
"step": 2325
},
{
"epoch": 0.08528969898818035,
"grad_norm": 2.567641258239746,
"learning_rate": 4.989894757091861e-05,
"loss": 1.9054,
"step": 2356
},
{
"epoch": 0.08641193186960378,
"grad_norm": 2.2755303382873535,
"learning_rate": 4.989126368919158e-05,
"loss": 1.903,
"step": 2387
},
{
"epoch": 0.08753416475102721,
"grad_norm": 2.147775888442993,
"learning_rate": 4.988329891293693e-05,
"loss": 1.8993,
"step": 2418
},
{
"epoch": 0.08865639763245063,
"grad_norm": 2.2279839515686035,
"learning_rate": 4.987505333203608e-05,
"loss": 1.905,
"step": 2449
},
{
"epoch": 0.08977863051387405,
"grad_norm": 2.317538022994995,
"learning_rate": 4.9866527039539276e-05,
"loss": 1.8776,
"step": 2480
},
{
"epoch": 0.09090086339529749,
"grad_norm": 2.296868324279785,
"learning_rate": 4.9857720131664594e-05,
"loss": 1.8714,
"step": 2511
},
{
"epoch": 0.09202309627672091,
"grad_norm": 2.282538890838623,
"learning_rate": 4.9848632707796773e-05,
"loss": 1.8765,
"step": 2542
},
{
"epoch": 0.09314532915814433,
"grad_norm": 2.1396827697753906,
"learning_rate": 4.9839264870486155e-05,
"loss": 1.8827,
"step": 2573
},
{
"epoch": 0.09426756203956776,
"grad_norm": 2.1897048950195312,
"learning_rate": 4.9829616725447526e-05,
"loss": 1.8655,
"step": 2604
},
{
"epoch": 0.09538979492099119,
"grad_norm": 2.1385130882263184,
"learning_rate": 4.981968838155888e-05,
"loss": 1.8768,
"step": 2635
},
{
"epoch": 0.09651202780241461,
"grad_norm": 2.264171600341797,
"learning_rate": 4.980947995086024e-05,
"loss": 1.8734,
"step": 2666
},
{
"epoch": 0.09763426068383804,
"grad_norm": 2.089871883392334,
"learning_rate": 4.979899154855234e-05,
"loss": 1.8516,
"step": 2697
},
{
"epoch": 0.09875649356526146,
"grad_norm": 2.092179298400879,
"learning_rate": 4.9788223292995386e-05,
"loss": 1.8729,
"step": 2728
},
{
"epoch": 0.09987872644668488,
"grad_norm": 2.3216769695281982,
"learning_rate": 4.977717530570768e-05,
"loss": 1.8673,
"step": 2759
},
{
"epoch": 0.10100095932810832,
"grad_norm": 2.104457139968872,
"learning_rate": 4.976584771136425e-05,
"loss": 1.8734,
"step": 2790
},
{
"epoch": 0.10212319220953174,
"grad_norm": 2.236363649368286,
"learning_rate": 4.975424063779547e-05,
"loss": 1.8316,
"step": 2821
},
{
"epoch": 0.10324542509095516,
"grad_norm": 2.264967203140259,
"learning_rate": 4.974235421598557e-05,
"loss": 1.8614,
"step": 2852
},
{
"epoch": 0.1043676579723786,
"grad_norm": 2.1815454959869385,
"learning_rate": 4.973018858007122e-05,
"loss": 1.8365,
"step": 2883
},
{
"epoch": 0.10548989085380202,
"grad_norm": 2.049677848815918,
"learning_rate": 4.9717743867339963e-05,
"loss": 1.8454,
"step": 2914
},
{
"epoch": 0.10661212373522544,
"grad_norm": 1.9844895601272583,
"learning_rate": 4.9705020218228695e-05,
"loss": 1.8419,
"step": 2945
},
{
"epoch": 0.10773435661664887,
"grad_norm": 2.052708387374878,
"learning_rate": 4.969201777632205e-05,
"loss": 1.8509,
"step": 2976
},
{
"epoch": 0.1088565894980723,
"grad_norm": 2.014535665512085,
"learning_rate": 4.9678736688350846e-05,
"loss": 1.8129,
"step": 3007
},
{
"epoch": 0.10997882237949572,
"grad_norm": 1.9768311977386475,
"learning_rate": 4.966517710419033e-05,
"loss": 1.8375,
"step": 3038
},
{
"epoch": 0.11110105526091915,
"grad_norm": 2.046293258666992,
"learning_rate": 4.965133917685858e-05,
"loss": 1.8132,
"step": 3069
},
{
"epoch": 0.11222328814234257,
"grad_norm": 2.104555368423462,
"learning_rate": 4.9637223062514714e-05,
"loss": 1.8147,
"step": 3100
},
{
"epoch": 0.113345521023766,
"grad_norm": 2.04533052444458,
"learning_rate": 4.962282892045718e-05,
"loss": 1.8591,
"step": 3131
},
{
"epoch": 0.11446775390518943,
"grad_norm": 1.967282772064209,
"learning_rate": 4.9608156913121904e-05,
"loss": 1.7966,
"step": 3162
},
{
"epoch": 0.11558998678661285,
"grad_norm": 2.092106342315674,
"learning_rate": 4.959320720608049e-05,
"loss": 1.8301,
"step": 3193
},
{
"epoch": 0.11671221966803627,
"grad_norm": 2.0512046813964844,
"learning_rate": 4.9577979968038354e-05,
"loss": 1.8211,
"step": 3224
},
{
"epoch": 0.11783445254945969,
"grad_norm": 1.9260915517807007,
"learning_rate": 4.956247537083282e-05,
"loss": 1.7989,
"step": 3255
},
{
"epoch": 0.11895668543088313,
"grad_norm": 2.0938026905059814,
"learning_rate": 4.9546693589431145e-05,
"loss": 1.8336,
"step": 3286
},
{
"epoch": 0.12007891831230655,
"grad_norm": 1.9972988367080688,
"learning_rate": 4.9530634801928595e-05,
"loss": 1.8147,
"step": 3317
},
{
"epoch": 0.12120115119372997,
"grad_norm": 1.9120224714279175,
"learning_rate": 4.9514299189546395e-05,
"loss": 1.8028,
"step": 3348
},
{
"epoch": 0.1223233840751534,
"grad_norm": 1.959033727645874,
"learning_rate": 4.949768693662973e-05,
"loss": 1.8281,
"step": 3379
},
{
"epoch": 0.12344561695657683,
"grad_norm": 1.9182357788085938,
"learning_rate": 4.948079823064559e-05,
"loss": 1.8165,
"step": 3410
},
{
"epoch": 0.12456784983800025,
"grad_norm": 1.9079999923706055,
"learning_rate": 4.946363326218074e-05,
"loss": 1.7916,
"step": 3441
},
{
"epoch": 0.12569008271942367,
"grad_norm": 1.916276216506958,
"learning_rate": 4.9446192224939525e-05,
"loss": 1.8086,
"step": 3472
},
{
"epoch": 0.1268123156008471,
"grad_norm": 1.903389811515808,
"learning_rate": 4.942847531574167e-05,
"loss": 1.8116,
"step": 3503
},
{
"epoch": 0.12793454848227054,
"grad_norm": 2.064885139465332,
"learning_rate": 4.941048273452008e-05,
"loss": 1.8144,
"step": 3534
},
{
"epoch": 0.12905678136369395,
"grad_norm": 2.1314241886138916,
"learning_rate": 4.9392214684318605e-05,
"loss": 1.7943,
"step": 3565
},
{
"epoch": 0.13017901424511738,
"grad_norm": 2.0061681270599365,
"learning_rate": 4.93736713712897e-05,
"loss": 1.794,
"step": 3596
},
{
"epoch": 0.13130124712654082,
"grad_norm": 1.9408286809921265,
"learning_rate": 4.9354853004692124e-05,
"loss": 1.7882,
"step": 3627
},
{
"epoch": 0.13242348000796422,
"grad_norm": 1.8884766101837158,
"learning_rate": 4.93357597968886e-05,
"loss": 1.7846,
"step": 3658
},
{
"epoch": 0.13354571288938766,
"grad_norm": 1.9393378496170044,
"learning_rate": 4.931639196334338e-05,
"loss": 1.7923,
"step": 3689
},
{
"epoch": 0.1346679457708111,
"grad_norm": 1.8815410137176514,
"learning_rate": 4.9296749722619826e-05,
"loss": 1.7939,
"step": 3720
},
{
"epoch": 0.1357901786522345,
"grad_norm": 1.8603038787841797,
"learning_rate": 4.9276833296377966e-05,
"loss": 1.7589,
"step": 3751
},
{
"epoch": 0.13691241153365794,
"grad_norm": 1.775247573852539,
"learning_rate": 4.925664290937196e-05,
"loss": 1.7897,
"step": 3782
},
{
"epoch": 0.13803464441508137,
"grad_norm": 1.8576780557632446,
"learning_rate": 4.9236178789447576e-05,
"loss": 1.7908,
"step": 3813
},
{
"epoch": 0.13915687729650478,
"grad_norm": 1.800264596939087,
"learning_rate": 4.921544116753962e-05,
"loss": 1.7736,
"step": 3844
},
{
"epoch": 0.1402791101779282,
"grad_norm": 1.9730401039123535,
"learning_rate": 4.919443027766935e-05,
"loss": 1.7639,
"step": 3875
},
{
"epoch": 0.14140134305935165,
"grad_norm": 1.8654968738555908,
"learning_rate": 4.91731463569418e-05,
"loss": 1.7477,
"step": 3906
},
{
"epoch": 0.14252357594077505,
"grad_norm": 1.8131386041641235,
"learning_rate": 4.915158964554312e-05,
"loss": 1.7887,
"step": 3937
},
{
"epoch": 0.1436458088221985,
"grad_norm": 1.8576264381408691,
"learning_rate": 4.912976038673786e-05,
"loss": 1.7779,
"step": 3968
},
{
"epoch": 0.14476804170362192,
"grad_norm": 1.8940199613571167,
"learning_rate": 4.9107658826866254e-05,
"loss": 1.7653,
"step": 3999
},
{
"epoch": 0.14589027458504533,
"grad_norm": 1.7727802991867065,
"learning_rate": 4.908528521534139e-05,
"loss": 1.7809,
"step": 4030
},
{
"epoch": 0.14701250746646877,
"grad_norm": 1.7416553497314453,
"learning_rate": 4.906263980464644e-05,
"loss": 1.7605,
"step": 4061
},
{
"epoch": 0.1481347403478922,
"grad_norm": 1.82987642288208,
"learning_rate": 4.903972285033178e-05,
"loss": 1.7554,
"step": 4092
},
{
"epoch": 0.1492569732293156,
"grad_norm": 1.916339635848999,
"learning_rate": 4.901653461101213e-05,
"loss": 1.7872,
"step": 4123
},
{
"epoch": 0.15037920611073904,
"grad_norm": 1.8903008699417114,
"learning_rate": 4.8993075348363626e-05,
"loss": 1.782,
"step": 4154
},
{
"epoch": 0.15150143899216248,
"grad_norm": 1.9334847927093506,
"learning_rate": 4.896934532712084e-05,
"loss": 1.7565,
"step": 4185
},
{
"epoch": 0.1526236718735859,
"grad_norm": 1.7778478860855103,
"learning_rate": 4.8945344815073846e-05,
"loss": 1.7613,
"step": 4216
},
{
"epoch": 0.15374590475500932,
"grad_norm": 1.7348295450210571,
"learning_rate": 4.892107408306516e-05,
"loss": 1.7512,
"step": 4247
},
{
"epoch": 0.15486813763643276,
"grad_norm": 1.7189710140228271,
"learning_rate": 4.889653340498669e-05,
"loss": 1.741,
"step": 4278
},
{
"epoch": 0.15599037051785616,
"grad_norm": 1.8557075262069702,
"learning_rate": 4.8871723057776664e-05,
"loss": 1.7471,
"step": 4309
},
{
"epoch": 0.1571126033992796,
"grad_norm": 1.7188880443572998,
"learning_rate": 4.8846643321416476e-05,
"loss": 1.7492,
"step": 4340
},
{
"epoch": 0.15823483628070303,
"grad_norm": 1.6712063550949097,
"learning_rate": 4.882129447892753e-05,
"loss": 1.7434,
"step": 4371
},
{
"epoch": 0.15935706916212644,
"grad_norm": 1.7652437686920166,
"learning_rate": 4.8795676816368076e-05,
"loss": 1.7422,
"step": 4402
},
{
"epoch": 0.16047930204354988,
"grad_norm": 1.7910144329071045,
"learning_rate": 4.876979062282995e-05,
"loss": 1.7635,
"step": 4433
},
{
"epoch": 0.1616015349249733,
"grad_norm": 1.9248684644699097,
"learning_rate": 4.8743636190435325e-05,
"loss": 1.7401,
"step": 4464
},
{
"epoch": 0.16272376780639672,
"grad_norm": 1.828202486038208,
"learning_rate": 4.871721381433344e-05,
"loss": 1.7419,
"step": 4495
},
{
"epoch": 0.16384600068782015,
"grad_norm": 1.7170790433883667,
"learning_rate": 4.869052379269719e-05,
"loss": 1.7562,
"step": 4526
},
{
"epoch": 0.1649682335692436,
"grad_norm": 1.753203272819519,
"learning_rate": 4.866356642671985e-05,
"loss": 1.7569,
"step": 4557
},
{
"epoch": 0.166090466450667,
"grad_norm": 1.7906442880630493,
"learning_rate": 4.8636342020611634e-05,
"loss": 1.7376,
"step": 4588
},
{
"epoch": 0.16721269933209043,
"grad_norm": 1.7113378047943115,
"learning_rate": 4.860885088159626e-05,
"loss": 1.7386,
"step": 4619
},
{
"epoch": 0.16833493221351387,
"grad_norm": 1.7997937202453613,
"learning_rate": 4.858109331990751e-05,
"loss": 1.7531,
"step": 4650
},
{
"epoch": 0.16945716509493727,
"grad_norm": 1.76421320438385,
"learning_rate": 4.855306964878567e-05,
"loss": 1.7402,
"step": 4681
},
{
"epoch": 0.1705793979763607,
"grad_norm": 1.7803616523742676,
"learning_rate": 4.8524780184474084e-05,
"loss": 1.7345,
"step": 4712
},
{
"epoch": 0.17170163085778414,
"grad_norm": 1.7763142585754395,
"learning_rate": 4.8496225246215496e-05,
"loss": 1.7469,
"step": 4743
},
{
"epoch": 0.17282386373920755,
"grad_norm": 1.728219747543335,
"learning_rate": 4.8467405156248505e-05,
"loss": 1.7182,
"step": 4774
},
{
"epoch": 0.17394609662063099,
"grad_norm": 1.7837860584259033,
"learning_rate": 4.843832023980392e-05,
"loss": 1.739,
"step": 4805
},
{
"epoch": 0.17506832950205442,
"grad_norm": 1.7005128860473633,
"learning_rate": 4.840897082510106e-05,
"loss": 1.7377,
"step": 4836
},
{
"epoch": 0.17619056238347783,
"grad_norm": 1.6570392847061157,
"learning_rate": 4.8379357243344084e-05,
"loss": 1.712,
"step": 4867
},
{
"epoch": 0.17731279526490126,
"grad_norm": 1.6575350761413574,
"learning_rate": 4.8349479828718236e-05,
"loss": 1.7147,
"step": 4898
},
{
"epoch": 0.1784350281463247,
"grad_norm": 1.8768808841705322,
"learning_rate": 4.8319338918386075e-05,
"loss": 1.7312,
"step": 4929
},
{
"epoch": 0.1795572610277481,
"grad_norm": 1.7145389318466187,
"learning_rate": 4.828893485248369e-05,
"loss": 1.7221,
"step": 4960
},
{
"epoch": 0.18067949390917154,
"grad_norm": 1.834173560142517,
"learning_rate": 4.825826797411682e-05,
"loss": 1.7322,
"step": 4991
},
{
"epoch": 0.18180172679059498,
"grad_norm": 1.7125933170318604,
"learning_rate": 4.822733862935702e-05,
"loss": 1.7156,
"step": 5022
},
{
"epoch": 0.18292395967201838,
"grad_norm": 1.7470024824142456,
"learning_rate": 4.819614716723775e-05,
"loss": 1.7176,
"step": 5053
},
{
"epoch": 0.18404619255344182,
"grad_norm": 1.7042289972305298,
"learning_rate": 4.8164693939750425e-05,
"loss": 1.7192,
"step": 5084
},
{
"epoch": 0.18516842543486525,
"grad_norm": 1.6803418397903442,
"learning_rate": 4.813297930184042e-05,
"loss": 1.7197,
"step": 5115
},
{
"epoch": 0.18629065831628866,
"grad_norm": 1.7296956777572632,
"learning_rate": 4.810100361140314e-05,
"loss": 1.72,
"step": 5146
},
{
"epoch": 0.1874128911977121,
"grad_norm": 1.6245464086532593,
"learning_rate": 4.8068767229279885e-05,
"loss": 1.7081,
"step": 5177
},
{
"epoch": 0.18853512407913553,
"grad_norm": 1.7138885259628296,
"learning_rate": 4.8036270519253854e-05,
"loss": 1.7068,
"step": 5208
},
{
"epoch": 0.18965735696055894,
"grad_norm": 1.704185128211975,
"learning_rate": 4.8003513848046e-05,
"loss": 1.7219,
"step": 5239
},
{
"epoch": 0.19077958984198237,
"grad_norm": 1.712551236152649,
"learning_rate": 4.79704975853109e-05,
"loss": 1.7118,
"step": 5270
},
{
"epoch": 0.1919018227234058,
"grad_norm": 1.7193052768707275,
"learning_rate": 4.793722210363262e-05,
"loss": 1.7195,
"step": 5301
},
{
"epoch": 0.19302405560482921,
"grad_norm": 1.5574607849121094,
"learning_rate": 4.7903687778520414e-05,
"loss": 1.7286,
"step": 5332
},
{
"epoch": 0.19414628848625265,
"grad_norm": 1.7480719089508057,
"learning_rate": 4.7869894988404593e-05,
"loss": 1.6957,
"step": 5363
},
{
"epoch": 0.19526852136767608,
"grad_norm": 1.7487633228302002,
"learning_rate": 4.783584411463221e-05,
"loss": 1.7203,
"step": 5394
},
{
"epoch": 0.1963907542490995,
"grad_norm": 1.6720587015151978,
"learning_rate": 4.780153554146274e-05,
"loss": 1.7009,
"step": 5425
},
{
"epoch": 0.19751298713052293,
"grad_norm": 1.6622951030731201,
"learning_rate": 4.7766969656063766e-05,
"loss": 1.7049,
"step": 5456
},
{
"epoch": 0.19863522001194636,
"grad_norm": 1.656158208847046,
"learning_rate": 4.773214684850662e-05,
"loss": 1.7104,
"step": 5487
},
{
"epoch": 0.19975745289336977,
"grad_norm": 1.6559454202651978,
"learning_rate": 4.769706751176193e-05,
"loss": 1.7089,
"step": 5518
},
{
"epoch": 0.2008796857747932,
"grad_norm": 1.7262494564056396,
"learning_rate": 4.7661732041695264e-05,
"loss": 1.7143,
"step": 5549
},
{
"epoch": 0.20200191865621664,
"grad_norm": 1.6877381801605225,
"learning_rate": 4.762614083706258e-05,
"loss": 1.7134,
"step": 5580
},
{
"epoch": 0.20312415153764005,
"grad_norm": 1.5669549703598022,
"learning_rate": 4.759029429950581e-05,
"loss": 1.7213,
"step": 5611
},
{
"epoch": 0.20424638441906348,
"grad_norm": 1.7044217586517334,
"learning_rate": 4.7554192833548235e-05,
"loss": 1.7185,
"step": 5642
},
{
"epoch": 0.20536861730048692,
"grad_norm": 1.6999757289886475,
"learning_rate": 4.751783684659e-05,
"loss": 1.7163,
"step": 5673
},
{
"epoch": 0.20649085018191032,
"grad_norm": 1.6043522357940674,
"learning_rate": 4.748122674890348e-05,
"loss": 1.7031,
"step": 5704
},
{
"epoch": 0.20761308306333376,
"grad_norm": 1.7062305212020874,
"learning_rate": 4.7444362953628654e-05,
"loss": 1.7035,
"step": 5735
},
{
"epoch": 0.2087353159447572,
"grad_norm": 1.6612005233764648,
"learning_rate": 4.7407245876768424e-05,
"loss": 1.6981,
"step": 5766
},
{
"epoch": 0.2098575488261806,
"grad_norm": 1.7277076244354248,
"learning_rate": 4.736987593718397e-05,
"loss": 1.7161,
"step": 5797
},
{
"epoch": 0.21097978170760404,
"grad_norm": 1.705458402633667,
"learning_rate": 4.733225355658999e-05,
"loss": 1.6854,
"step": 5828
},
{
"epoch": 0.21210201458902747,
"grad_norm": 1.629443883895874,
"learning_rate": 4.7294379159549926e-05,
"loss": 1.7025,
"step": 5859
},
{
"epoch": 0.21322424747045088,
"grad_norm": 1.613192081451416,
"learning_rate": 4.725625317347119e-05,
"loss": 1.6992,
"step": 5890
},
{
"epoch": 0.2143464803518743,
"grad_norm": 1.6801332235336304,
"learning_rate": 4.7217876028600374e-05,
"loss": 1.6798,
"step": 5921
},
{
"epoch": 0.21546871323329775,
"grad_norm": 1.6418830156326294,
"learning_rate": 4.717924815801832e-05,
"loss": 1.6918,
"step": 5952
},
{
"epoch": 0.21659094611472116,
"grad_norm": 1.6128371953964233,
"learning_rate": 4.714036999763532e-05,
"loss": 1.706,
"step": 5983
},
{
"epoch": 0.2177131789961446,
"grad_norm": 1.71291983127594,
"learning_rate": 4.7101241986186116e-05,
"loss": 1.6861,
"step": 6014
},
{
"epoch": 0.21883541187756803,
"grad_norm": 1.5903745889663696,
"learning_rate": 4.7061864565225e-05,
"loss": 1.6886,
"step": 6045
},
{
"epoch": 0.21995764475899143,
"grad_norm": 1.71088445186615,
"learning_rate": 4.702223817912081e-05,
"loss": 1.7003,
"step": 6076
},
{
"epoch": 0.22107987764041487,
"grad_norm": 1.541530966758728,
"learning_rate": 4.698236327505195e-05,
"loss": 1.6956,
"step": 6107
},
{
"epoch": 0.2222021105218383,
"grad_norm": 1.539455533027649,
"learning_rate": 4.694224030300127e-05,
"loss": 1.6833,
"step": 6138
},
{
"epoch": 0.2233243434032617,
"grad_norm": 1.688120722770691,
"learning_rate": 4.690186971575107e-05,
"loss": 1.6973,
"step": 6169
},
{
"epoch": 0.22444657628468515,
"grad_norm": 1.6934964656829834,
"learning_rate": 4.6861251968877916e-05,
"loss": 1.6979,
"step": 6200
},
{
"epoch": 0.22556880916610858,
"grad_norm": 1.6558688879013062,
"learning_rate": 4.68203875207476e-05,
"loss": 1.6925,
"step": 6231
},
{
"epoch": 0.226691042047532,
"grad_norm": 1.6245280504226685,
"learning_rate": 4.677927683250983e-05,
"loss": 1.6688,
"step": 6262
},
{
"epoch": 0.22781327492895542,
"grad_norm": 1.5808422565460205,
"learning_rate": 4.6737920368093156e-05,
"loss": 1.688,
"step": 6293
},
{
"epoch": 0.22893550781037886,
"grad_norm": 1.5224875211715698,
"learning_rate": 4.669631859419965e-05,
"loss": 1.6864,
"step": 6324
},
{
"epoch": 0.23005774069180226,
"grad_norm": 1.5904366970062256,
"learning_rate": 4.6654471980299676e-05,
"loss": 1.6893,
"step": 6355
},
{
"epoch": 0.2311799735732257,
"grad_norm": 1.6145131587982178,
"learning_rate": 4.661238099862658e-05,
"loss": 1.6818,
"step": 6386
},
{
"epoch": 0.23230220645464913,
"grad_norm": 1.6297610998153687,
"learning_rate": 4.657004612417138e-05,
"loss": 1.687,
"step": 6417
},
{
"epoch": 0.23342443933607254,
"grad_norm": 1.6199692487716675,
"learning_rate": 4.6527467834677374e-05,
"loss": 1.6945,
"step": 6448
},
{
"epoch": 0.23454667221749598,
"grad_norm": 1.5439369678497314,
"learning_rate": 4.648464661063478e-05,
"loss": 1.6926,
"step": 6479
},
{
"epoch": 0.23566890509891938,
"grad_norm": 1.6095410585403442,
"learning_rate": 4.6441582935275264e-05,
"loss": 1.689,
"step": 6510
},
{
"epoch": 0.23679113798034282,
"grad_norm": 1.4971855878829956,
"learning_rate": 4.6398277294566586e-05,
"loss": 1.6622,
"step": 6541
},
{
"epoch": 0.23791337086176625,
"grad_norm": 1.53174889087677,
"learning_rate": 4.6354730177207e-05,
"loss": 1.6785,
"step": 6572
},
{
"epoch": 0.23903560374318966,
"grad_norm": 1.4567692279815674,
"learning_rate": 4.6310942074619787e-05,
"loss": 1.6776,
"step": 6603
},
{
"epoch": 0.2401578366246131,
"grad_norm": 1.6813284158706665,
"learning_rate": 4.626691348094777e-05,
"loss": 1.6692,
"step": 6634
},
{
"epoch": 0.24128006950603653,
"grad_norm": 1.5593857765197754,
"learning_rate": 4.622264489304762e-05,
"loss": 1.6811,
"step": 6665
},
{
"epoch": 0.24240230238745994,
"grad_norm": 1.5681389570236206,
"learning_rate": 4.617813681048434e-05,
"loss": 1.689,
"step": 6696
},
{
"epoch": 0.24352453526888337,
"grad_norm": 1.6402842998504639,
"learning_rate": 4.61333897355256e-05,
"loss": 1.6621,
"step": 6727
},
{
"epoch": 0.2446467681503068,
"grad_norm": 1.642669677734375,
"learning_rate": 4.608840417313604e-05,
"loss": 1.6562,
"step": 6758
},
{
"epoch": 0.24576900103173022,
"grad_norm": 1.6442660093307495,
"learning_rate": 4.6043180630971646e-05,
"loss": 1.6721,
"step": 6789
},
{
"epoch": 0.24689123391315365,
"grad_norm": 1.5577408075332642,
"learning_rate": 4.599771961937391e-05,
"loss": 1.6837,
"step": 6820
},
{
"epoch": 0.2480134667945771,
"grad_norm": 1.8555899858474731,
"learning_rate": 4.5952021651364204e-05,
"loss": 1.6739,
"step": 6851
},
{
"epoch": 0.2491356996760005,
"grad_norm": 1.667812466621399,
"learning_rate": 4.590608724263786e-05,
"loss": 1.6704,
"step": 6882
},
{
"epoch": 0.25025793255742396,
"grad_norm": 1.6642868518829346,
"learning_rate": 4.585991691155845e-05,
"loss": 1.6784,
"step": 6913
},
{
"epoch": 0.25138016543884734,
"grad_norm": 1.6429824829101562,
"learning_rate": 4.581351117915188e-05,
"loss": 1.6729,
"step": 6944
},
{
"epoch": 0.25250239832027077,
"grad_norm": 1.6268694400787354,
"learning_rate": 4.5766870569100534e-05,
"loss": 1.6657,
"step": 6975
},
{
"epoch": 0.2536246312016942,
"grad_norm": 1.496177315711975,
"learning_rate": 4.571999560773736e-05,
"loss": 1.6611,
"step": 7006
},
{
"epoch": 0.25474686408311764,
"grad_norm": 1.7032805681228638,
"learning_rate": 4.5672886824039915e-05,
"loss": 1.6816,
"step": 7037
},
{
"epoch": 0.2558690969645411,
"grad_norm": 1.791925072669983,
"learning_rate": 4.5625544749624435e-05,
"loss": 1.6689,
"step": 7068
},
{
"epoch": 0.2569913298459645,
"grad_norm": 1.5614711046218872,
"learning_rate": 4.5577969918739794e-05,
"loss": 1.6647,
"step": 7099
},
{
"epoch": 0.2581135627273879,
"grad_norm": 1.517112135887146,
"learning_rate": 4.5530162868261486e-05,
"loss": 1.6614,
"step": 7130
},
{
"epoch": 0.2592357956088113,
"grad_norm": 1.5636824369430542,
"learning_rate": 4.548212413768558e-05,
"loss": 1.6599,
"step": 7161
},
{
"epoch": 0.26035802849023476,
"grad_norm": 1.5803399085998535,
"learning_rate": 4.543385426912261e-05,
"loss": 1.6558,
"step": 7192
},
{
"epoch": 0.2614802613716582,
"grad_norm": 1.6228526830673218,
"learning_rate": 4.53853538072915e-05,
"loss": 1.6778,
"step": 7223
},
{
"epoch": 0.26260249425308163,
"grad_norm": 1.5660549402236938,
"learning_rate": 4.533662329951336e-05,
"loss": 1.6827,
"step": 7254
},
{
"epoch": 0.26372472713450507,
"grad_norm": 1.555421233177185,
"learning_rate": 4.528766329570536e-05,
"loss": 1.6755,
"step": 7285
},
{
"epoch": 0.26484696001592845,
"grad_norm": 1.603285312652588,
"learning_rate": 4.523847434837447e-05,
"loss": 1.6455,
"step": 7316
},
{
"epoch": 0.2659691928973519,
"grad_norm": 1.510772943496704,
"learning_rate": 4.518905701261128e-05,
"loss": 1.6736,
"step": 7347
},
{
"epoch": 0.2670914257787753,
"grad_norm": 1.6260360479354858,
"learning_rate": 4.5139411846083715e-05,
"loss": 1.6643,
"step": 7378
},
{
"epoch": 0.26821365866019875,
"grad_norm": 3.0237209796905518,
"learning_rate": 4.508953940903073e-05,
"loss": 1.6615,
"step": 7409
},
{
"epoch": 0.2693358915416222,
"grad_norm": 1.4725430011749268,
"learning_rate": 4.5039440264255994e-05,
"loss": 1.6582,
"step": 7440
},
{
"epoch": 0.2704581244230456,
"grad_norm": 1.5135307312011719,
"learning_rate": 4.498911497712155e-05,
"loss": 1.6754,
"step": 7471
},
{
"epoch": 0.271580357304469,
"grad_norm": 1.5741811990737915,
"learning_rate": 4.493856411554142e-05,
"loss": 1.6889,
"step": 7502
},
{
"epoch": 0.27270259018589244,
"grad_norm": 1.5469688177108765,
"learning_rate": 4.4887788249975206e-05,
"loss": 1.6542,
"step": 7533
},
{
"epoch": 0.27382482306731587,
"grad_norm": 1.4596927165985107,
"learning_rate": 4.4836787953421656e-05,
"loss": 1.6365,
"step": 7564
},
{
"epoch": 0.2749470559487393,
"grad_norm": 1.566522479057312,
"learning_rate": 4.478556380141218e-05,
"loss": 1.657,
"step": 7595
},
{
"epoch": 0.27606928883016274,
"grad_norm": 1.5141624212265015,
"learning_rate": 4.4734116372004375e-05,
"loss": 1.6695,
"step": 7626
},
{
"epoch": 0.2771915217115862,
"grad_norm": 1.4138630628585815,
"learning_rate": 4.4682446245775477e-05,
"loss": 1.6638,
"step": 7657
},
{
"epoch": 0.27831375459300955,
"grad_norm": 1.4885402917861938,
"learning_rate": 4.463055400581586e-05,
"loss": 1.6817,
"step": 7688
},
{
"epoch": 0.279435987474433,
"grad_norm": 1.645486831665039,
"learning_rate": 4.4578440237722374e-05,
"loss": 1.6392,
"step": 7719
},
{
"epoch": 0.2805582203558564,
"grad_norm": 1.5977535247802734,
"learning_rate": 4.452610552959183e-05,
"loss": 1.6557,
"step": 7750
},
{
"epoch": 0.28168045323727986,
"grad_norm": 1.6347745656967163,
"learning_rate": 4.447355047201428e-05,
"loss": 1.6573,
"step": 7781
},
{
"epoch": 0.2828026861187033,
"grad_norm": 1.5288081169128418,
"learning_rate": 4.4420775658066414e-05,
"loss": 1.638,
"step": 7812
},
{
"epoch": 0.28392491900012673,
"grad_norm": 1.4643625020980835,
"learning_rate": 4.436778168330484e-05,
"loss": 1.6402,
"step": 7843
},
{
"epoch": 0.2850471518815501,
"grad_norm": 1.568663239479065,
"learning_rate": 4.4314569145759353e-05,
"loss": 1.6565,
"step": 7874
},
{
"epoch": 0.28616938476297354,
"grad_norm": 1.476515293121338,
"learning_rate": 4.42611386459262e-05,
"loss": 1.6709,
"step": 7905
},
{
"epoch": 0.287291617644397,
"grad_norm": 1.532404899597168,
"learning_rate": 4.420749078676133e-05,
"loss": 1.6333,
"step": 7936
},
{
"epoch": 0.2884138505258204,
"grad_norm": 1.5388779640197754,
"learning_rate": 4.4153626173673516e-05,
"loss": 1.6494,
"step": 7967
},
{
"epoch": 0.28953608340724385,
"grad_norm": 1.5787324905395508,
"learning_rate": 4.409954541451762e-05,
"loss": 1.6362,
"step": 7998
},
{
"epoch": 0.2906583162886673,
"grad_norm": 1.4780092239379883,
"learning_rate": 4.404524911958764e-05,
"loss": 1.643,
"step": 8029
},
{
"epoch": 0.29178054917009066,
"grad_norm": 1.5434736013412476,
"learning_rate": 4.399073790160989e-05,
"loss": 1.6472,
"step": 8060
},
{
"epoch": 0.2929027820515141,
"grad_norm": 1.4898840188980103,
"learning_rate": 4.393601237573607e-05,
"loss": 1.6483,
"step": 8091
},
{
"epoch": 0.29402501493293753,
"grad_norm": 1.5529502630233765,
"learning_rate": 4.388107315953628e-05,
"loss": 1.6291,
"step": 8122
},
{
"epoch": 0.29514724781436097,
"grad_norm": 1.4831997156143188,
"learning_rate": 4.382592087299212e-05,
"loss": 1.6518,
"step": 8153
},
{
"epoch": 0.2962694806957844,
"grad_norm": 1.4568578004837036,
"learning_rate": 4.377055613848964e-05,
"loss": 1.6465,
"step": 8184
},
{
"epoch": 0.29739171357720784,
"grad_norm": 1.4941576719284058,
"learning_rate": 4.3714979580812355e-05,
"loss": 1.634,
"step": 8215
},
{
"epoch": 0.2985139464586312,
"grad_norm": 1.5891722440719604,
"learning_rate": 4.365919182713416e-05,
"loss": 1.6422,
"step": 8246
},
{
"epoch": 0.29963617934005465,
"grad_norm": 1.5435233116149902,
"learning_rate": 4.360319350701226e-05,
"loss": 1.6446,
"step": 8277
},
{
"epoch": 0.3007584122214781,
"grad_norm": 1.4754277467727661,
"learning_rate": 4.3546985252380115e-05,
"loss": 1.655,
"step": 8308
},
{
"epoch": 0.3018806451029015,
"grad_norm": 1.5463342666625977,
"learning_rate": 4.349056769754021e-05,
"loss": 1.6407,
"step": 8339
},
{
"epoch": 0.30300287798432496,
"grad_norm": 1.4847484827041626,
"learning_rate": 4.3433941479156994e-05,
"loss": 1.65,
"step": 8370
},
{
"epoch": 0.3041251108657484,
"grad_norm": 1.475669264793396,
"learning_rate": 4.3377107236249647e-05,
"loss": 1.6398,
"step": 8401
},
{
"epoch": 0.3052473437471718,
"grad_norm": 1.558566689491272,
"learning_rate": 4.332006561018488e-05,
"loss": 1.6501,
"step": 8432
},
{
"epoch": 0.3063695766285952,
"grad_norm": 1.5497310161590576,
"learning_rate": 4.3262817244669683e-05,
"loss": 1.6371,
"step": 8463
},
{
"epoch": 0.30749180951001864,
"grad_norm": 1.464553952217102,
"learning_rate": 4.3205362785744083e-05,
"loss": 1.6766,
"step": 8494
},
{
"epoch": 0.3086140423914421,
"grad_norm": 1.5198413133621216,
"learning_rate": 4.314770288177384e-05,
"loss": 1.633,
"step": 8525
},
{
"epoch": 0.3097362752728655,
"grad_norm": 1.5493290424346924,
"learning_rate": 4.308983818344313e-05,
"loss": 1.6465,
"step": 8556
},
{
"epoch": 0.31085850815428895,
"grad_norm": 1.4413405656814575,
"learning_rate": 4.3031769343747206e-05,
"loss": 1.6463,
"step": 8587
},
{
"epoch": 0.31198074103571233,
"grad_norm": 1.508507251739502,
"learning_rate": 4.297349701798505e-05,
"loss": 1.6262,
"step": 8618
},
{
"epoch": 0.31310297391713576,
"grad_norm": 1.560054063796997,
"learning_rate": 4.2915021863751916e-05,
"loss": 1.6484,
"step": 8649
},
{
"epoch": 0.3142252067985592,
"grad_norm": 1.495651125907898,
"learning_rate": 4.285634454093198e-05,
"loss": 1.6329,
"step": 8680
},
{
"epoch": 0.31534743967998263,
"grad_norm": 1.481740117073059,
"learning_rate": 4.279746571169086e-05,
"loss": 1.6274,
"step": 8711
},
{
"epoch": 0.31646967256140607,
"grad_norm": 1.53792142868042,
"learning_rate": 4.2738386040468136e-05,
"loss": 1.6252,
"step": 8742
},
{
"epoch": 0.31759190544282945,
"grad_norm": 1.4411643743515015,
"learning_rate": 4.2679106193969866e-05,
"loss": 1.6423,
"step": 8773
},
{
"epoch": 0.3187141383242529,
"grad_norm": 1.5158967971801758,
"learning_rate": 4.261962684116106e-05,
"loss": 1.6596,
"step": 8804
},
{
"epoch": 0.3198363712056763,
"grad_norm": 1.6026604175567627,
"learning_rate": 4.2559948653258145e-05,
"loss": 1.6399,
"step": 8835
},
{
"epoch": 0.32095860408709975,
"grad_norm": 1.4422760009765625,
"learning_rate": 4.250007230372134e-05,
"loss": 1.646,
"step": 8866
},
{
"epoch": 0.3220808369685232,
"grad_norm": 1.4450057744979858,
"learning_rate": 4.2439998468247126e-05,
"loss": 1.6311,
"step": 8897
},
{
"epoch": 0.3232030698499466,
"grad_norm": 1.432768702507019,
"learning_rate": 4.2379727824760566e-05,
"loss": 1.6234,
"step": 8928
},
{
"epoch": 0.32432530273137,
"grad_norm": 1.5206103324890137,
"learning_rate": 4.231926105340768e-05,
"loss": 1.6268,
"step": 8959
},
{
"epoch": 0.32544753561279344,
"grad_norm": 1.5703397989273071,
"learning_rate": 4.225859883654776e-05,
"loss": 1.6409,
"step": 8990
},
{
"epoch": 0.32656976849421687,
"grad_norm": 1.4549362659454346,
"learning_rate": 4.219774185874569e-05,
"loss": 1.6471,
"step": 9021
},
{
"epoch": 0.3276920013756403,
"grad_norm": 1.669263243675232,
"learning_rate": 4.213669080676418e-05,
"loss": 1.6355,
"step": 9052
},
{
"epoch": 0.32881423425706374,
"grad_norm": 1.4004725217819214,
"learning_rate": 4.2075446369556056e-05,
"loss": 1.6046,
"step": 9083
},
{
"epoch": 0.3299364671384872,
"grad_norm": 1.4844101667404175,
"learning_rate": 4.201400923825648e-05,
"loss": 1.6357,
"step": 9114
},
{
"epoch": 0.33105870001991056,
"grad_norm": 1.5377836227416992,
"learning_rate": 4.195238010617511e-05,
"loss": 1.6425,
"step": 9145
},
{
"epoch": 0.332180932901334,
"grad_norm": 1.4880887269973755,
"learning_rate": 4.1890559668788344e-05,
"loss": 1.6368,
"step": 9176
},
{
"epoch": 0.3333031657827574,
"grad_norm": 1.5786559581756592,
"learning_rate": 4.1828548623731405e-05,
"loss": 1.6327,
"step": 9207
},
{
"epoch": 0.33442539866418086,
"grad_norm": 1.4619288444519043,
"learning_rate": 4.1766347670790506e-05,
"loss": 1.6431,
"step": 9238
},
{
"epoch": 0.3355476315456043,
"grad_norm": 1.4946295022964478,
"learning_rate": 4.170395751189495e-05,
"loss": 1.6265,
"step": 9269
},
{
"epoch": 0.33666986442702773,
"grad_norm": 1.4698960781097412,
"learning_rate": 4.164137885110921e-05,
"loss": 1.6356,
"step": 9300
},
{
"epoch": 0.3377920973084511,
"grad_norm": 1.4136701822280884,
"learning_rate": 4.157861239462495e-05,
"loss": 1.606,
"step": 9331
},
{
"epoch": 0.33891433018987455,
"grad_norm": 1.5250601768493652,
"learning_rate": 4.1515658850753114e-05,
"loss": 1.6266,
"step": 9362
},
{
"epoch": 0.340036563071298,
"grad_norm": 1.5827070474624634,
"learning_rate": 4.145251892991588e-05,
"loss": 1.618,
"step": 9393
},
{
"epoch": 0.3411587959527214,
"grad_norm": 1.4887738227844238,
"learning_rate": 4.138919334463868e-05,
"loss": 1.6196,
"step": 9424
},
{
"epoch": 0.34228102883414485,
"grad_norm": 1.5627696514129639,
"learning_rate": 4.1325682809542124e-05,
"loss": 1.6155,
"step": 9455
},
{
"epoch": 0.3434032617155683,
"grad_norm": 1.4552607536315918,
"learning_rate": 4.126198804133398e-05,
"loss": 1.6272,
"step": 9486
},
{
"epoch": 0.34452549459699167,
"grad_norm": 1.5104546546936035,
"learning_rate": 4.1198109758801055e-05,
"loss": 1.6245,
"step": 9517
},
{
"epoch": 0.3456477274784151,
"grad_norm": 1.4588383436203003,
"learning_rate": 4.113404868280107e-05,
"loss": 1.6285,
"step": 9548
},
{
"epoch": 0.34676996035983854,
"grad_norm": 1.40166437625885,
"learning_rate": 4.106980553625457e-05,
"loss": 1.6181,
"step": 9579
},
{
"epoch": 0.34789219324126197,
"grad_norm": 1.4949356317520142,
"learning_rate": 4.100538104413674e-05,
"loss": 1.6148,
"step": 9610
},
{
"epoch": 0.3490144261226854,
"grad_norm": 1.4863393306732178,
"learning_rate": 4.09407759334692e-05,
"loss": 1.6218,
"step": 9641
},
{
"epoch": 0.35013665900410884,
"grad_norm": 1.4831593036651611,
"learning_rate": 4.087599093331186e-05,
"loss": 1.6201,
"step": 9672
},
{
"epoch": 0.3512588918855322,
"grad_norm": 1.487328052520752,
"learning_rate": 4.081102677475462e-05,
"loss": 1.6203,
"step": 9703
},
{
"epoch": 0.35238112476695566,
"grad_norm": 1.560600996017456,
"learning_rate": 4.0745884190909194e-05,
"loss": 1.6099,
"step": 9734
},
{
"epoch": 0.3535033576483791,
"grad_norm": 1.45511794090271,
"learning_rate": 4.0680563916900796e-05,
"loss": 1.6494,
"step": 9765
},
{
"epoch": 0.3546255905298025,
"grad_norm": 1.4966280460357666,
"learning_rate": 4.0615066689859815e-05,
"loss": 1.6157,
"step": 9796
},
{
"epoch": 0.35574782341122596,
"grad_norm": 1.4888532161712646,
"learning_rate": 4.0549393248913584e-05,
"loss": 1.6203,
"step": 9827
},
{
"epoch": 0.3568700562926494,
"grad_norm": 1.5495861768722534,
"learning_rate": 4.048354433517794e-05,
"loss": 1.6131,
"step": 9858
},
{
"epoch": 0.3579922891740728,
"grad_norm": 1.4991432428359985,
"learning_rate": 4.0417520691748916e-05,
"loss": 1.6371,
"step": 9889
},
{
"epoch": 0.3591145220554962,
"grad_norm": 1.5163663625717163,
"learning_rate": 4.035132306369438e-05,
"loss": 1.5911,
"step": 9920
},
{
"epoch": 0.36023675493691965,
"grad_norm": 1.439622402191162,
"learning_rate": 4.028495219804555e-05,
"loss": 1.6218,
"step": 9951
},
{
"epoch": 0.3613589878183431,
"grad_norm": 1.4068893194198608,
"learning_rate": 4.021840884378864e-05,
"loss": 1.6284,
"step": 9982
},
{
"epoch": 0.3624812206997665,
"grad_norm": 1.4577332735061646,
"learning_rate": 4.015169375185633e-05,
"loss": 1.6104,
"step": 10013
},
{
"epoch": 0.36360345358118995,
"grad_norm": 1.448833703994751,
"learning_rate": 4.0084807675119396e-05,
"loss": 1.6299,
"step": 10044
},
{
"epoch": 0.36472568646261333,
"grad_norm": 1.440450668334961,
"learning_rate": 4.0017751368378106e-05,
"loss": 1.6255,
"step": 10075
},
{
"epoch": 0.36584791934403676,
"grad_norm": 1.3380858898162842,
"learning_rate": 3.995052558835377e-05,
"loss": 1.6162,
"step": 10106
},
{
"epoch": 0.3669701522254602,
"grad_norm": 1.4549713134765625,
"learning_rate": 3.988313109368017e-05,
"loss": 1.6181,
"step": 10137
},
{
"epoch": 0.36809238510688363,
"grad_norm": 1.4933863878250122,
"learning_rate": 3.981556864489504e-05,
"loss": 1.634,
"step": 10168
},
{
"epoch": 0.36921461798830707,
"grad_norm": 1.5157703161239624,
"learning_rate": 3.974783900443142e-05,
"loss": 1.6258,
"step": 10199
},
{
"epoch": 0.3703368508697305,
"grad_norm": 1.464006781578064,
"learning_rate": 3.9679942936609095e-05,
"loss": 1.6235,
"step": 10230
},
{
"epoch": 0.3714590837511539,
"grad_norm": 1.3768154382705688,
"learning_rate": 3.961188120762596e-05,
"loss": 1.6044,
"step": 10261
},
{
"epoch": 0.3725813166325773,
"grad_norm": 1.4427024126052856,
"learning_rate": 3.954365458554938e-05,
"loss": 1.6403,
"step": 10292
},
{
"epoch": 0.37370354951400075,
"grad_norm": 1.3831264972686768,
"learning_rate": 3.947526384030751e-05,
"loss": 1.6136,
"step": 10323
},
{
"epoch": 0.3748257823954242,
"grad_norm": 1.4275633096694946,
"learning_rate": 3.9406709743680624e-05,
"loss": 1.6167,
"step": 10354
},
{
"epoch": 0.3759480152768476,
"grad_norm": 1.4378384351730347,
"learning_rate": 3.9337993069292366e-05,
"loss": 1.6231,
"step": 10385
},
{
"epoch": 0.37707024815827106,
"grad_norm": 1.3743884563446045,
"learning_rate": 3.926911459260109e-05,
"loss": 1.6171,
"step": 10416
},
{
"epoch": 0.37819248103969444,
"grad_norm": 1.496160864830017,
"learning_rate": 3.920007509089102e-05,
"loss": 1.6234,
"step": 10447
},
{
"epoch": 0.3793147139211179,
"grad_norm": 1.4610028266906738,
"learning_rate": 3.913087534326357e-05,
"loss": 1.5963,
"step": 10478
},
{
"epoch": 0.3804369468025413,
"grad_norm": 1.483314037322998,
"learning_rate": 3.9061516130628475e-05,
"loss": 1.6021,
"step": 10509
},
{
"epoch": 0.38155917968396474,
"grad_norm": 1.4944846630096436,
"learning_rate": 3.8991998235695025e-05,
"loss": 1.5833,
"step": 10540
},
{
"epoch": 0.3826814125653882,
"grad_norm": 1.3831861019134521,
"learning_rate": 3.8922322442963224e-05,
"loss": 1.624,
"step": 10571
},
{
"epoch": 0.3838036454468116,
"grad_norm": 1.4178634881973267,
"learning_rate": 3.885248953871491e-05,
"loss": 1.6188,
"step": 10602
},
{
"epoch": 0.384925878328235,
"grad_norm": 1.4889320135116577,
"learning_rate": 3.8782500311004915e-05,
"loss": 1.608,
"step": 10633
},
{
"epoch": 0.38604811120965843,
"grad_norm": 1.3335620164871216,
"learning_rate": 3.871235554965218e-05,
"loss": 1.6182,
"step": 10664
},
{
"epoch": 0.38717034409108186,
"grad_norm": 1.4620449542999268,
"learning_rate": 3.864205604623078e-05,
"loss": 1.5848,
"step": 10695
},
{
"epoch": 0.3882925769725053,
"grad_norm": 1.3857917785644531,
"learning_rate": 3.857160259406107e-05,
"loss": 1.6048,
"step": 10726
},
{
"epoch": 0.38941480985392873,
"grad_norm": 1.4226957559585571,
"learning_rate": 3.8500995988200674e-05,
"loss": 1.6052,
"step": 10757
},
{
"epoch": 0.39053704273535217,
"grad_norm": 1.478182077407837,
"learning_rate": 3.843023702543556e-05,
"loss": 1.6268,
"step": 10788
},
{
"epoch": 0.39165927561677555,
"grad_norm": 1.431401014328003,
"learning_rate": 3.8359326504270984e-05,
"loss": 1.6176,
"step": 10819
},
{
"epoch": 0.392781508498199,
"grad_norm": 1.339880108833313,
"learning_rate": 3.828826522492255e-05,
"loss": 1.5902,
"step": 10850
},
{
"epoch": 0.3939037413796224,
"grad_norm": 1.4537174701690674,
"learning_rate": 3.821705398930713e-05,
"loss": 1.6107,
"step": 10881
},
{
"epoch": 0.39502597426104585,
"grad_norm": 1.3559256792068481,
"learning_rate": 3.814569360103385e-05,
"loss": 1.5879,
"step": 10912
},
{
"epoch": 0.3961482071424693,
"grad_norm": 1.3561891317367554,
"learning_rate": 3.807418486539499e-05,
"loss": 1.6162,
"step": 10943
},
{
"epoch": 0.3972704400238927,
"grad_norm": 1.471112847328186,
"learning_rate": 3.80025285893569e-05,
"loss": 1.5968,
"step": 10974
},
{
"epoch": 0.3983926729053161,
"grad_norm": 1.3438925743103027,
"learning_rate": 3.793072558155093e-05,
"loss": 1.5876,
"step": 11005
},
{
"epoch": 0.39951490578673954,
"grad_norm": 1.4102482795715332,
"learning_rate": 3.785877665226426e-05,
"loss": 1.5886,
"step": 11036
},
{
"epoch": 0.400637138668163,
"grad_norm": 1.4435259103775024,
"learning_rate": 3.778668261343079e-05,
"loss": 1.5999,
"step": 11067
},
{
"epoch": 0.4017593715495864,
"grad_norm": 1.4556541442871094,
"learning_rate": 3.771444427862192e-05,
"loss": 1.6185,
"step": 11098
},
{
"epoch": 0.40288160443100984,
"grad_norm": 1.370553970336914,
"learning_rate": 3.7642062463037465e-05,
"loss": 1.6005,
"step": 11129
},
{
"epoch": 0.4040038373124333,
"grad_norm": 1.368855595588684,
"learning_rate": 3.7569537983496373e-05,
"loss": 1.6024,
"step": 11160
},
{
"epoch": 0.40512607019385666,
"grad_norm": 1.4200265407562256,
"learning_rate": 3.749687165842753e-05,
"loss": 1.6082,
"step": 11191
},
{
"epoch": 0.4062483030752801,
"grad_norm": 1.4704499244689941,
"learning_rate": 3.7424064307860536e-05,
"loss": 1.6227,
"step": 11222
},
{
"epoch": 0.40737053595670353,
"grad_norm": 1.3868876695632935,
"learning_rate": 3.735111675341645e-05,
"loss": 1.6008,
"step": 11253
},
{
"epoch": 0.40849276883812696,
"grad_norm": 1.473650574684143,
"learning_rate": 3.7278029818298524e-05,
"loss": 1.5825,
"step": 11284
},
{
"epoch": 0.4096150017195504,
"grad_norm": 1.412559986114502,
"learning_rate": 3.720480432728287e-05,
"loss": 1.5971,
"step": 11315
},
{
"epoch": 0.41073723460097383,
"grad_norm": 1.4288370609283447,
"learning_rate": 3.71314411067092e-05,
"loss": 1.6079,
"step": 11346
},
{
"epoch": 0.4118594674823972,
"grad_norm": 1.4781348705291748,
"learning_rate": 3.70579409844715e-05,
"loss": 1.5904,
"step": 11377
},
{
"epoch": 0.41298170036382065,
"grad_norm": 1.377030611038208,
"learning_rate": 3.698430479000865e-05,
"loss": 1.5804,
"step": 11408
},
{
"epoch": 0.4141039332452441,
"grad_norm": 1.4176589250564575,
"learning_rate": 3.691053335429509e-05,
"loss": 1.6046,
"step": 11439
},
{
"epoch": 0.4152261661266675,
"grad_norm": 1.4933243989944458,
"learning_rate": 3.683662750983147e-05,
"loss": 1.6018,
"step": 11470
},
{
"epoch": 0.41634839900809095,
"grad_norm": 1.4382365942001343,
"learning_rate": 3.676258809063518e-05,
"loss": 1.5962,
"step": 11501
},
{
"epoch": 0.4174706318895144,
"grad_norm": 1.468005657196045,
"learning_rate": 3.6688415932231004e-05,
"loss": 1.6044,
"step": 11532
},
{
"epoch": 0.41859286477093777,
"grad_norm": 1.4858007431030273,
"learning_rate": 3.661411187164166e-05,
"loss": 1.5973,
"step": 11563
},
{
"epoch": 0.4197150976523612,
"grad_norm": 1.457524061203003,
"learning_rate": 3.65396767473784e-05,
"loss": 1.5872,
"step": 11594
},
{
"epoch": 0.42083733053378464,
"grad_norm": 1.4685806035995483,
"learning_rate": 3.6465111399431465e-05,
"loss": 1.6072,
"step": 11625
},
{
"epoch": 0.42195956341520807,
"grad_norm": 1.4355812072753906,
"learning_rate": 3.6390416669260674e-05,
"loss": 1.6005,
"step": 11656
},
{
"epoch": 0.4230817962966315,
"grad_norm": 1.4105843305587769,
"learning_rate": 3.63155933997859e-05,
"loss": 1.5999,
"step": 11687
},
{
"epoch": 0.42420402917805494,
"grad_norm": 1.4515639543533325,
"learning_rate": 3.624064243537758e-05,
"loss": 1.5903,
"step": 11718
},
{
"epoch": 0.4253262620594783,
"grad_norm": 1.4507205486297607,
"learning_rate": 3.616556462184716e-05,
"loss": 1.6004,
"step": 11749
},
{
"epoch": 0.42644849494090176,
"grad_norm": 1.3846348524093628,
"learning_rate": 3.609036080643755e-05,
"loss": 1.5878,
"step": 11780
},
{
"epoch": 0.4275707278223252,
"grad_norm": 1.4062190055847168,
"learning_rate": 3.60150318378136e-05,
"loss": 1.6049,
"step": 11811
},
{
"epoch": 0.4286929607037486,
"grad_norm": 1.5231355428695679,
"learning_rate": 3.5939578566052465e-05,
"loss": 1.5972,
"step": 11842
},
{
"epoch": 0.42981519358517206,
"grad_norm": 1.4500449895858765,
"learning_rate": 3.586400184263408e-05,
"loss": 1.5918,
"step": 11873
},
{
"epoch": 0.4309374264665955,
"grad_norm": 1.415440559387207,
"learning_rate": 3.578830252043148e-05,
"loss": 1.6111,
"step": 11904
},
{
"epoch": 0.4320596593480189,
"grad_norm": 1.3857108354568481,
"learning_rate": 3.571248145370125e-05,
"loss": 1.5882,
"step": 11935
},
{
"epoch": 0.4331818922294423,
"grad_norm": 1.442830204963684,
"learning_rate": 3.5636539498073794e-05,
"loss": 1.587,
"step": 11966
},
{
"epoch": 0.43430412511086575,
"grad_norm": 1.3706488609313965,
"learning_rate": 3.556047751054378e-05,
"loss": 1.5942,
"step": 11997
},
{
"epoch": 0.4354263579922892,
"grad_norm": 1.450567364692688,
"learning_rate": 3.548429634946039e-05,
"loss": 1.6011,
"step": 12028
},
{
"epoch": 0.4365485908737126,
"grad_norm": 1.4172272682189941,
"learning_rate": 3.540799687451768e-05,
"loss": 1.5726,
"step": 12059
},
{
"epoch": 0.43767082375513605,
"grad_norm": 1.4156157970428467,
"learning_rate": 3.533157994674485e-05,
"loss": 1.5848,
"step": 12090
},
{
"epoch": 0.43879305663655943,
"grad_norm": 1.3843419551849365,
"learning_rate": 3.5255046428496546e-05,
"loss": 1.5893,
"step": 12121
},
{
"epoch": 0.43991528951798287,
"grad_norm": 1.43569815158844,
"learning_rate": 3.517839718344311e-05,
"loss": 1.5922,
"step": 12152
},
{
"epoch": 0.4410375223994063,
"grad_norm": 1.4200314283370972,
"learning_rate": 3.510163307656086e-05,
"loss": 1.6047,
"step": 12183
},
{
"epoch": 0.44215975528082974,
"grad_norm": 1.4956674575805664,
"learning_rate": 3.5024754974122324e-05,
"loss": 1.5802,
"step": 12214
},
{
"epoch": 0.44328198816225317,
"grad_norm": 1.4289231300354004,
"learning_rate": 3.494776374368643e-05,
"loss": 1.6193,
"step": 12245
},
{
"epoch": 0.4444042210436766,
"grad_norm": 1.389282464981079,
"learning_rate": 3.4870660254088724e-05,
"loss": 1.5977,
"step": 12276
},
{
"epoch": 0.4455264539251,
"grad_norm": 1.4207974672317505,
"learning_rate": 3.479344537543164e-05,
"loss": 1.5789,
"step": 12307
},
{
"epoch": 0.4466486868065234,
"grad_norm": 1.355353832244873,
"learning_rate": 3.4716119979074565e-05,
"loss": 1.5889,
"step": 12338
},
{
"epoch": 0.44777091968794686,
"grad_norm": 1.3336408138275146,
"learning_rate": 3.463868493762412e-05,
"loss": 1.5865,
"step": 12369
},
{
"epoch": 0.4488931525693703,
"grad_norm": 1.5265244245529175,
"learning_rate": 3.456114112492418e-05,
"loss": 1.5993,
"step": 12400
},
{
"epoch": 0.4500153854507937,
"grad_norm": 1.4629555940628052,
"learning_rate": 3.4483489416046164e-05,
"loss": 1.5982,
"step": 12431
},
{
"epoch": 0.45113761833221716,
"grad_norm": 1.43988835811615,
"learning_rate": 3.440573068727905e-05,
"loss": 1.5816,
"step": 12462
},
{
"epoch": 0.45225985121364054,
"grad_norm": 1.4607633352279663,
"learning_rate": 3.4327865816119495e-05,
"loss": 1.571,
"step": 12493
},
{
"epoch": 0.453382084095064,
"grad_norm": 1.3664649724960327,
"learning_rate": 3.4249895681262025e-05,
"loss": 1.5736,
"step": 12524
},
{
"epoch": 0.4545043169764874,
"grad_norm": 1.436094880104065,
"learning_rate": 3.417182116258899e-05,
"loss": 1.5829,
"step": 12555
},
{
"epoch": 0.45562654985791085,
"grad_norm": 1.3681309223175049,
"learning_rate": 3.409364314116074e-05,
"loss": 1.5938,
"step": 12586
},
{
"epoch": 0.4567487827393343,
"grad_norm": 1.3929277658462524,
"learning_rate": 3.401536249920559e-05,
"loss": 1.572,
"step": 12617
},
{
"epoch": 0.4578710156207577,
"grad_norm": 1.3980777263641357,
"learning_rate": 3.393698012010998e-05,
"loss": 1.5941,
"step": 12648
},
{
"epoch": 0.4589932485021811,
"grad_norm": 1.4055850505828857,
"learning_rate": 3.385849688840839e-05,
"loss": 1.5818,
"step": 12679
},
{
"epoch": 0.46011548138360453,
"grad_norm": 1.3678046464920044,
"learning_rate": 3.3779913689773414e-05,
"loss": 1.5759,
"step": 12710
},
{
"epoch": 0.46123771426502796,
"grad_norm": 1.468201994895935,
"learning_rate": 3.370123141100578e-05,
"loss": 1.5792,
"step": 12741
},
{
"epoch": 0.4623599471464514,
"grad_norm": 1.346614122390747,
"learning_rate": 3.3622450940024305e-05,
"loss": 1.5983,
"step": 12772
},
{
"epoch": 0.46348218002787483,
"grad_norm": 1.3895704746246338,
"learning_rate": 3.35435731658559e-05,
"loss": 1.5809,
"step": 12803
},
{
"epoch": 0.46460441290929827,
"grad_norm": 1.3664804697036743,
"learning_rate": 3.346459897862552e-05,
"loss": 1.5788,
"step": 12834
},
{
"epoch": 0.46572664579072165,
"grad_norm": 1.4561264514923096,
"learning_rate": 3.338552926954613e-05,
"loss": 1.5867,
"step": 12865
},
{
"epoch": 0.4668488786721451,
"grad_norm": 1.3407316207885742,
"learning_rate": 3.330636493090868e-05,
"loss": 1.5729,
"step": 12896
},
{
"epoch": 0.4679711115535685,
"grad_norm": 1.3465179204940796,
"learning_rate": 3.322710685607193e-05,
"loss": 1.5915,
"step": 12927
},
{
"epoch": 0.46909334443499195,
"grad_norm": 1.553585171699524,
"learning_rate": 3.314775593945251e-05,
"loss": 1.5875,
"step": 12958
},
{
"epoch": 0.4702155773164154,
"grad_norm": 1.3964170217514038,
"learning_rate": 3.3068313076514714e-05,
"loss": 1.5783,
"step": 12989
},
{
"epoch": 0.47133781019783877,
"grad_norm": 1.3884953260421753,
"learning_rate": 3.298877916376047e-05,
"loss": 1.5577,
"step": 13020
},
{
"epoch": 0.4724600430792622,
"grad_norm": 1.3421337604522705,
"learning_rate": 3.290915509871915e-05,
"loss": 1.5791,
"step": 13051
},
{
"epoch": 0.47358227596068564,
"grad_norm": 1.297429084777832,
"learning_rate": 3.282944177993753e-05,
"loss": 1.5699,
"step": 13082
},
{
"epoch": 0.4747045088421091,
"grad_norm": 1.3672280311584473,
"learning_rate": 3.274964010696957e-05,
"loss": 1.5711,
"step": 13113
},
{
"epoch": 0.4758267417235325,
"grad_norm": 1.4202091693878174,
"learning_rate": 3.266975098036629e-05,
"loss": 1.5679,
"step": 13144
},
{
"epoch": 0.47694897460495594,
"grad_norm": 1.383973479270935,
"learning_rate": 3.258977530166562e-05,
"loss": 1.6019,
"step": 13175
},
{
"epoch": 0.4780712074863793,
"grad_norm": 1.3134119510650635,
"learning_rate": 3.250971397338227e-05,
"loss": 1.5721,
"step": 13206
},
{
"epoch": 0.47919344036780276,
"grad_norm": 1.3229272365570068,
"learning_rate": 3.2429567898997404e-05,
"loss": 1.5812,
"step": 13237
},
{
"epoch": 0.4803156732492262,
"grad_norm": 1.2991341352462769,
"learning_rate": 3.234933798294859e-05,
"loss": 1.5793,
"step": 13268
},
{
"epoch": 0.48143790613064963,
"grad_norm": 1.384522795677185,
"learning_rate": 3.2269025130619535e-05,
"loss": 1.5592,
"step": 13299
},
{
"epoch": 0.48256013901207306,
"grad_norm": 1.3743617534637451,
"learning_rate": 3.218863024832985e-05,
"loss": 1.5785,
"step": 13330
},
{
"epoch": 0.4836823718934965,
"grad_norm": 1.4512649774551392,
"learning_rate": 3.2108154243324864e-05,
"loss": 1.5703,
"step": 13361
},
{
"epoch": 0.4848046047749199,
"grad_norm": 1.2982932329177856,
"learning_rate": 3.2027598023765345e-05,
"loss": 1.5609,
"step": 13392
},
{
"epoch": 0.4859268376563433,
"grad_norm": 1.3747495412826538,
"learning_rate": 3.194696249871729e-05,
"loss": 1.5766,
"step": 13423
},
{
"epoch": 0.48704907053776675,
"grad_norm": 1.3155137300491333,
"learning_rate": 3.186624857814164e-05,
"loss": 1.57,
"step": 13454
},
{
"epoch": 0.4881713034191902,
"grad_norm": 1.4094924926757812,
"learning_rate": 3.178545717288401e-05,
"loss": 1.5855,
"step": 13485
},
{
"epoch": 0.4892935363006136,
"grad_norm": 1.3931294679641724,
"learning_rate": 3.170458919466444e-05,
"loss": 1.5486,
"step": 13516
},
{
"epoch": 0.49041576918203705,
"grad_norm": 1.48263418674469,
"learning_rate": 3.1623645556067063e-05,
"loss": 1.5829,
"step": 13547
},
{
"epoch": 0.49153800206346043,
"grad_norm": 1.3016873598098755,
"learning_rate": 3.154262717052985e-05,
"loss": 1.5808,
"step": 13578
},
{
"epoch": 0.49266023494488387,
"grad_norm": 1.623724102973938,
"learning_rate": 3.146153495233426e-05,
"loss": 1.5582,
"step": 13609
},
{
"epoch": 0.4937824678263073,
"grad_norm": 1.3603851795196533,
"learning_rate": 3.1380369816594944e-05,
"loss": 1.5703,
"step": 13640
},
{
"epoch": 0.49490470070773074,
"grad_norm": 1.4793063402175903,
"learning_rate": 3.129913267924946e-05,
"loss": 1.5739,
"step": 13671
},
{
"epoch": 0.4960269335891542,
"grad_norm": 1.4615710973739624,
"learning_rate": 3.121782445704782e-05,
"loss": 1.5846,
"step": 13702
},
{
"epoch": 0.4971491664705776,
"grad_norm": 1.419823408126831,
"learning_rate": 3.11364460675423e-05,
"loss": 1.5702,
"step": 13733
},
{
"epoch": 0.498271399352001,
"grad_norm": 1.429337501525879,
"learning_rate": 3.1054998429076934e-05,
"loss": 1.5825,
"step": 13764
},
{
"epoch": 0.4993936322334244,
"grad_norm": 1.3171850442886353,
"learning_rate": 3.097348246077728e-05,
"loss": 1.5721,
"step": 13795
},
{
"epoch": 0.5005158651148479,
"grad_norm": 1.487111210823059,
"learning_rate": 3.0891899082539924e-05,
"loss": 1.5879,
"step": 13826
},
{
"epoch": 0.5016380979962712,
"grad_norm": 1.4311749935150146,
"learning_rate": 3.0810249215022233e-05,
"loss": 1.5843,
"step": 13857
},
{
"epoch": 0.5027603308776947,
"grad_norm": 1.468863844871521,
"learning_rate": 3.0728533779631865e-05,
"loss": 1.5884,
"step": 13888
},
{
"epoch": 0.5038825637591181,
"grad_norm": 1.3970764875411987,
"learning_rate": 3.064675369851637e-05,
"loss": 1.5769,
"step": 13919
},
{
"epoch": 0.5050047966405415,
"grad_norm": 1.3623278141021729,
"learning_rate": 3.056490989455289e-05,
"loss": 1.5706,
"step": 13950
},
{
"epoch": 0.506127029521965,
"grad_norm": 1.3077219724655151,
"learning_rate": 3.0483003291337596e-05,
"loss": 1.5761,
"step": 13981
},
{
"epoch": 0.5072492624033884,
"grad_norm": 1.3295941352844238,
"learning_rate": 3.040103481317539e-05,
"loss": 1.5776,
"step": 14012
},
{
"epoch": 0.5083714952848118,
"grad_norm": 1.3900631666183472,
"learning_rate": 3.03190053850694e-05,
"loss": 1.5777,
"step": 14043
},
{
"epoch": 0.5094937281662353,
"grad_norm": 1.3359615802764893,
"learning_rate": 3.0236915932710573e-05,
"loss": 1.5569,
"step": 14074
},
{
"epoch": 0.5106159610476587,
"grad_norm": 1.2790296077728271,
"learning_rate": 3.0154767382467232e-05,
"loss": 1.5598,
"step": 14105
},
{
"epoch": 0.5117381939290822,
"grad_norm": 1.5767478942871094,
"learning_rate": 3.0072560661374582e-05,
"loss": 1.5483,
"step": 14136
},
{
"epoch": 0.5128604268105056,
"grad_norm": 1.343381404876709,
"learning_rate": 2.999029669712431e-05,
"loss": 1.5689,
"step": 14167
},
{
"epoch": 0.513982659691929,
"grad_norm": 1.4147651195526123,
"learning_rate": 2.990797641805408e-05,
"loss": 1.5643,
"step": 14198
},
{
"epoch": 0.5151048925733523,
"grad_norm": 1.3360931873321533,
"learning_rate": 2.982560075313704e-05,
"loss": 1.5689,
"step": 14229
},
{
"epoch": 0.5162271254547758,
"grad_norm": 1.458016037940979,
"learning_rate": 2.9743170631971368e-05,
"loss": 1.5633,
"step": 14260
},
{
"epoch": 0.5173493583361992,
"grad_norm": 1.430955171585083,
"learning_rate": 2.9660686984769792e-05,
"loss": 1.5559,
"step": 14291
},
{
"epoch": 0.5184715912176227,
"grad_norm": 1.3806464672088623,
"learning_rate": 2.9578150742349047e-05,
"loss": 1.577,
"step": 14322
},
{
"epoch": 0.5195938240990461,
"grad_norm": 1.359813928604126,
"learning_rate": 2.949556283611942e-05,
"loss": 1.5485,
"step": 14353
},
{
"epoch": 0.5207160569804695,
"grad_norm": 1.4222601652145386,
"learning_rate": 2.9412924198074206e-05,
"loss": 1.575,
"step": 14384
},
{
"epoch": 0.521838289861893,
"grad_norm": 1.3186180591583252,
"learning_rate": 2.9330235760779208e-05,
"loss": 1.5744,
"step": 14415
},
{
"epoch": 0.5229605227433164,
"grad_norm": 1.3309999704360962,
"learning_rate": 2.9247498457362188e-05,
"loss": 1.5664,
"step": 14446
},
{
"epoch": 0.5240827556247398,
"grad_norm": 1.368514060974121,
"learning_rate": 2.9164713221502373e-05,
"loss": 1.56,
"step": 14477
},
{
"epoch": 0.5252049885061633,
"grad_norm": 1.3132268190383911,
"learning_rate": 2.9081880987419912e-05,
"loss": 1.563,
"step": 14508
},
{
"epoch": 0.5263272213875867,
"grad_norm": 1.431347131729126,
"learning_rate": 2.8999002689865296e-05,
"loss": 1.5612,
"step": 14539
},
{
"epoch": 0.5274494542690101,
"grad_norm": 1.303941249847412,
"learning_rate": 2.8916079264108852e-05,
"loss": 1.5601,
"step": 14570
},
{
"epoch": 0.5285716871504335,
"grad_norm": 1.4077236652374268,
"learning_rate": 2.883311164593017e-05,
"loss": 1.5516,
"step": 14601
},
{
"epoch": 0.5296939200318569,
"grad_norm": 1.3132708072662354,
"learning_rate": 2.875010077160754e-05,
"loss": 1.5538,
"step": 14632
},
{
"epoch": 0.5308161529132803,
"grad_norm": 1.2660679817199707,
"learning_rate": 2.866704757790741e-05,
"loss": 1.5652,
"step": 14663
},
{
"epoch": 0.5319383857947038,
"grad_norm": 1.4541290998458862,
"learning_rate": 2.858395300207376e-05,
"loss": 1.5602,
"step": 14694
},
{
"epoch": 0.5330606186761272,
"grad_norm": 1.3694487810134888,
"learning_rate": 2.8500817981817607e-05,
"loss": 1.5483,
"step": 14725
},
{
"epoch": 0.5341828515575506,
"grad_norm": 1.3493553400039673,
"learning_rate": 2.8417643455306336e-05,
"loss": 1.5539,
"step": 14756
},
{
"epoch": 0.5353050844389741,
"grad_norm": 1.4280232191085815,
"learning_rate": 2.8334430361153185e-05,
"loss": 1.5672,
"step": 14787
},
{
"epoch": 0.5364273173203975,
"grad_norm": 1.3430079221725464,
"learning_rate": 2.8251179638406612e-05,
"loss": 1.5474,
"step": 14818
},
{
"epoch": 0.5375495502018209,
"grad_norm": 1.3380746841430664,
"learning_rate": 2.8167892226539704e-05,
"loss": 1.5508,
"step": 14849
},
{
"epoch": 0.5386717830832444,
"grad_norm": 1.3501845598220825,
"learning_rate": 2.8084569065439588e-05,
"loss": 1.5656,
"step": 14880
},
{
"epoch": 0.5397940159646678,
"grad_norm": 1.3564043045043945,
"learning_rate": 2.8001211095396807e-05,
"loss": 1.5726,
"step": 14911
},
{
"epoch": 0.5409162488460912,
"grad_norm": 1.3949267864227295,
"learning_rate": 2.791781925709473e-05,
"loss": 1.5635,
"step": 14942
},
{
"epoch": 0.5420384817275146,
"grad_norm": 1.4317481517791748,
"learning_rate": 2.7834394491598908e-05,
"loss": 1.5447,
"step": 14973
},
{
"epoch": 0.543160714608938,
"grad_norm": 1.396610140800476,
"learning_rate": 2.7750937740346485e-05,
"loss": 1.557,
"step": 15004
},
{
"epoch": 0.5442829474903614,
"grad_norm": 1.369884967803955,
"learning_rate": 2.7667449945135564e-05,
"loss": 1.5672,
"step": 15035
},
{
"epoch": 0.5454051803717849,
"grad_norm": 1.4686237573623657,
"learning_rate": 2.7583932048114557e-05,
"loss": 1.572,
"step": 15066
},
{
"epoch": 0.5465274132532083,
"grad_norm": 1.524717926979065,
"learning_rate": 2.7500384991771587e-05,
"loss": 1.5537,
"step": 15097
},
{
"epoch": 0.5476496461346317,
"grad_norm": 1.3461147546768188,
"learning_rate": 2.7416809718923825e-05,
"loss": 1.5321,
"step": 15128
},
{
"epoch": 0.5487718790160552,
"grad_norm": 1.3704477548599243,
"learning_rate": 2.7333207172706864e-05,
"loss": 1.5677,
"step": 15159
},
{
"epoch": 0.5498941118974786,
"grad_norm": 1.3601664304733276,
"learning_rate": 2.7249578296564088e-05,
"loss": 1.5577,
"step": 15190
},
{
"epoch": 0.551016344778902,
"grad_norm": 1.4055489301681519,
"learning_rate": 2.7165924034235973e-05,
"loss": 1.5453,
"step": 15221
},
{
"epoch": 0.5521385776603255,
"grad_norm": 1.3587946891784668,
"learning_rate": 2.708224532974953e-05,
"loss": 1.5401,
"step": 15252
},
{
"epoch": 0.5532608105417489,
"grad_norm": 1.3209632635116577,
"learning_rate": 2.6998543127407538e-05,
"loss": 1.5383,
"step": 15283
},
{
"epoch": 0.5543830434231724,
"grad_norm": 1.294921636581421,
"learning_rate": 2.6914818371777988e-05,
"loss": 1.5734,
"step": 15314
},
{
"epoch": 0.5555052763045957,
"grad_norm": 1.6017462015151978,
"learning_rate": 2.6831072007683373e-05,
"loss": 1.5702,
"step": 15345
},
{
"epoch": 0.5566275091860191,
"grad_norm": 1.3644670248031616,
"learning_rate": 2.6747304980190018e-05,
"loss": 1.571,
"step": 15376
},
{
"epoch": 0.5577497420674425,
"grad_norm": 1.3694461584091187,
"learning_rate": 2.6663518234597453e-05,
"loss": 1.5398,
"step": 15407
},
{
"epoch": 0.558871974948866,
"grad_norm": 1.3380069732666016,
"learning_rate": 2.6579712716427696e-05,
"loss": 1.5628,
"step": 15438
},
{
"epoch": 0.5599942078302894,
"grad_norm": 1.322144627571106,
"learning_rate": 2.6495889371414652e-05,
"loss": 1.5682,
"step": 15469
},
{
"epoch": 0.5611164407117128,
"grad_norm": 1.3240221738815308,
"learning_rate": 2.6412049145493367e-05,
"loss": 1.5506,
"step": 15500
},
{
"epoch": 0.5622386735931363,
"grad_norm": 1.3131070137023926,
"learning_rate": 2.632819298478939e-05,
"loss": 1.5529,
"step": 15531
},
{
"epoch": 0.5633609064745597,
"grad_norm": 1.3907220363616943,
"learning_rate": 2.6244321835608105e-05,
"loss": 1.547,
"step": 15562
},
{
"epoch": 0.5644831393559832,
"grad_norm": 1.233981966972351,
"learning_rate": 2.6160436644424024e-05,
"loss": 1.5377,
"step": 15593
},
{
"epoch": 0.5656053722374066,
"grad_norm": 1.443326711654663,
"learning_rate": 2.6076538357870133e-05,
"loss": 1.5788,
"step": 15624
},
{
"epoch": 0.56672760511883,
"grad_norm": 1.4688999652862549,
"learning_rate": 2.5992627922727196e-05,
"loss": 1.5629,
"step": 15655
},
{
"epoch": 0.5678498380002535,
"grad_norm": 1.3365731239318848,
"learning_rate": 2.5908706285913066e-05,
"loss": 1.5544,
"step": 15686
},
{
"epoch": 0.5689720708816768,
"grad_norm": 1.3793649673461914,
"learning_rate": 2.5824774394472008e-05,
"loss": 1.5317,
"step": 15717
},
{
"epoch": 0.5700943037631002,
"grad_norm": 1.417433738708496,
"learning_rate": 2.5740833195563996e-05,
"loss": 1.5506,
"step": 15748
},
{
"epoch": 0.5712165366445237,
"grad_norm": 1.346710443496704,
"learning_rate": 2.5656883636454067e-05,
"loss": 1.5462,
"step": 15779
},
{
"epoch": 0.5723387695259471,
"grad_norm": 1.4065468311309814,
"learning_rate": 2.557292666450159e-05,
"loss": 1.5464,
"step": 15810
},
{
"epoch": 0.5734610024073705,
"grad_norm": 1.3797588348388672,
"learning_rate": 2.5488963227149566e-05,
"loss": 1.565,
"step": 15841
},
{
"epoch": 0.574583235288794,
"grad_norm": 1.2842196226119995,
"learning_rate": 2.5404994271913983e-05,
"loss": 1.5489,
"step": 15872
},
{
"epoch": 0.5757054681702174,
"grad_norm": 1.368696689605713,
"learning_rate": 2.5321020746373085e-05,
"loss": 1.5358,
"step": 15903
},
{
"epoch": 0.5768277010516408,
"grad_norm": 1.3306961059570312,
"learning_rate": 2.52370435981567e-05,
"loss": 1.541,
"step": 15934
},
{
"epoch": 0.5779499339330643,
"grad_norm": 1.286727786064148,
"learning_rate": 2.5153063774935533e-05,
"loss": 1.533,
"step": 15965
},
{
"epoch": 0.5790721668144877,
"grad_norm": 1.434964656829834,
"learning_rate": 2.506908222441045e-05,
"loss": 1.5404,
"step": 15996
},
{
"epoch": 0.5801943996959111,
"grad_norm": 1.3955284357070923,
"learning_rate": 2.498509989430187e-05,
"loss": 1.5532,
"step": 16027
},
{
"epoch": 0.5813166325773346,
"grad_norm": 1.3676408529281616,
"learning_rate": 2.4901117732338958e-05,
"loss": 1.5263,
"step": 16058
},
{
"epoch": 0.5824388654587579,
"grad_norm": 1.3900113105773926,
"learning_rate": 2.481713668624899e-05,
"loss": 1.5465,
"step": 16089
},
{
"epoch": 0.5835610983401813,
"grad_norm": 1.3808554410934448,
"learning_rate": 2.4733157703746663e-05,
"loss": 1.5332,
"step": 16120
},
{
"epoch": 0.5846833312216048,
"grad_norm": 1.2974086999893188,
"learning_rate": 2.4649181732523392e-05,
"loss": 1.5562,
"step": 16151
},
{
"epoch": 0.5858055641030282,
"grad_norm": 1.4109300374984741,
"learning_rate": 2.4565209720236582e-05,
"loss": 1.5273,
"step": 16182
},
{
"epoch": 0.5869277969844516,
"grad_norm": 1.3626701831817627,
"learning_rate": 2.4481242614498975e-05,
"loss": 1.5311,
"step": 16213
},
{
"epoch": 0.5880500298658751,
"grad_norm": 1.3017241954803467,
"learning_rate": 2.439728136286796e-05,
"loss": 1.5522,
"step": 16244
},
{
"epoch": 0.5891722627472985,
"grad_norm": 1.349171757698059,
"learning_rate": 2.4313326912834852e-05,
"loss": 1.5262,
"step": 16275
},
{
"epoch": 0.5902944956287219,
"grad_norm": 1.3548376560211182,
"learning_rate": 2.4229380211814206e-05,
"loss": 1.5455,
"step": 16306
},
{
"epoch": 0.5914167285101454,
"grad_norm": 1.412003755569458,
"learning_rate": 2.4145442207133124e-05,
"loss": 1.5634,
"step": 16337
},
{
"epoch": 0.5925389613915688,
"grad_norm": 1.3400499820709229,
"learning_rate": 2.406151384602059e-05,
"loss": 1.5398,
"step": 16368
},
{
"epoch": 0.5936611942729922,
"grad_norm": 1.3035651445388794,
"learning_rate": 2.3977596075596747e-05,
"loss": 1.5289,
"step": 16399
},
{
"epoch": 0.5947834271544157,
"grad_norm": 1.322824478149414,
"learning_rate": 2.3893689842862223e-05,
"loss": 1.5509,
"step": 16430
},
{
"epoch": 0.595905660035839,
"grad_norm": 1.3810386657714844,
"learning_rate": 2.3809796094687475e-05,
"loss": 1.5439,
"step": 16461
},
{
"epoch": 0.5970278929172624,
"grad_norm": 1.399760127067566,
"learning_rate": 2.372591577780202e-05,
"loss": 1.5459,
"step": 16492
},
{
"epoch": 0.5981501257986859,
"grad_norm": 1.3253116607666016,
"learning_rate": 2.3642049838783838e-05,
"loss": 1.5556,
"step": 16523
},
{
"epoch": 0.5992723586801093,
"grad_norm": 1.3376234769821167,
"learning_rate": 2.3558199224048666e-05,
"loss": 1.5322,
"step": 16554
},
{
"epoch": 0.6003945915615327,
"grad_norm": 1.274533748626709,
"learning_rate": 2.347436487983929e-05,
"loss": 1.5288,
"step": 16585
},
{
"epoch": 0.6015168244429562,
"grad_norm": 1.3756400346755981,
"learning_rate": 2.3390547752214888e-05,
"loss": 1.5287,
"step": 16616
},
{
"epoch": 0.6026390573243796,
"grad_norm": 1.391845941543579,
"learning_rate": 2.330674878704035e-05,
"loss": 1.5329,
"step": 16647
},
{
"epoch": 0.603761290205803,
"grad_norm": 1.414237380027771,
"learning_rate": 2.322296892997561e-05,
"loss": 1.5482,
"step": 16678
},
{
"epoch": 0.6048835230872265,
"grad_norm": 1.3953816890716553,
"learning_rate": 2.313920912646497e-05,
"loss": 1.5372,
"step": 16709
},
{
"epoch": 0.6060057559686499,
"grad_norm": 1.3669557571411133,
"learning_rate": 2.305547032172643e-05,
"loss": 1.5522,
"step": 16740
},
{
"epoch": 0.6071279888500734,
"grad_norm": 1.3847616910934448,
"learning_rate": 2.2971753460741014e-05,
"loss": 1.5314,
"step": 16771
},
{
"epoch": 0.6082502217314968,
"grad_norm": 1.2923661470413208,
"learning_rate": 2.288805948824212e-05,
"loss": 1.5434,
"step": 16802
},
{
"epoch": 0.6093724546129201,
"grad_norm": 1.3146955966949463,
"learning_rate": 2.2804389348704858e-05,
"loss": 1.5442,
"step": 16833
},
{
"epoch": 0.6104946874943435,
"grad_norm": 1.362166166305542,
"learning_rate": 2.2720743986335374e-05,
"loss": 1.546,
"step": 16864
},
{
"epoch": 0.611616920375767,
"grad_norm": 1.3853099346160889,
"learning_rate": 2.2637124345060233e-05,
"loss": 1.5385,
"step": 16895
},
{
"epoch": 0.6127391532571904,
"grad_norm": 1.3611940145492554,
"learning_rate": 2.2553531368515695e-05,
"loss": 1.5577,
"step": 16926
},
{
"epoch": 0.6138613861386139,
"grad_norm": 1.3302477598190308,
"learning_rate": 2.2469966000037144e-05,
"loss": 1.5566,
"step": 16957
},
{
"epoch": 0.6149836190200373,
"grad_norm": 1.3969210386276245,
"learning_rate": 2.2386429182648417e-05,
"loss": 1.5459,
"step": 16988
},
{
"epoch": 0.6161058519014607,
"grad_norm": 1.3878018856048584,
"learning_rate": 2.230292185905114e-05,
"loss": 1.5295,
"step": 17019
},
{
"epoch": 0.6172280847828842,
"grad_norm": 1.3366162776947021,
"learning_rate": 2.2219444971614116e-05,
"loss": 1.5485,
"step": 17050
},
{
"epoch": 0.6183503176643076,
"grad_norm": 1.3503491878509521,
"learning_rate": 2.2135999462362655e-05,
"loss": 1.5266,
"step": 17081
},
{
"epoch": 0.619472550545731,
"grad_norm": 1.3379223346710205,
"learning_rate": 2.2052586272968003e-05,
"loss": 1.5366,
"step": 17112
},
{
"epoch": 0.6205947834271545,
"grad_norm": 1.299849033355713,
"learning_rate": 2.196920634473666e-05,
"loss": 1.5315,
"step": 17143
},
{
"epoch": 0.6217170163085779,
"grad_norm": 1.3590292930603027,
"learning_rate": 2.1885860618599787e-05,
"loss": 1.5332,
"step": 17174
},
{
"epoch": 0.6228392491900012,
"grad_norm": 1.3150153160095215,
"learning_rate": 2.1802550035102577e-05,
"loss": 1.5197,
"step": 17205
},
{
"epoch": 0.6239614820714247,
"grad_norm": 1.3216016292572021,
"learning_rate": 2.171927553439363e-05,
"loss": 1.5344,
"step": 17236
},
{
"epoch": 0.6250837149528481,
"grad_norm": 1.3521660566329956,
"learning_rate": 2.1636038056214376e-05,
"loss": 1.5236,
"step": 17267
},
{
"epoch": 0.6262059478342715,
"grad_norm": 1.4077104330062866,
"learning_rate": 2.155283853988844e-05,
"loss": 1.5318,
"step": 17298
},
{
"epoch": 0.627328180715695,
"grad_norm": 1.4986066818237305,
"learning_rate": 2.146967792431106e-05,
"loss": 1.5466,
"step": 17329
},
{
"epoch": 0.6284504135971184,
"grad_norm": 1.2227765321731567,
"learning_rate": 2.138655714793849e-05,
"loss": 1.5345,
"step": 17360
},
{
"epoch": 0.6295726464785418,
"grad_norm": 1.3314886093139648,
"learning_rate": 2.1303477148777367e-05,
"loss": 1.5376,
"step": 17391
},
{
"epoch": 0.6306948793599653,
"grad_norm": 1.3682267665863037,
"learning_rate": 2.122043886437421e-05,
"loss": 1.5313,
"step": 17422
},
{
"epoch": 0.6318171122413887,
"grad_norm": 1.3226497173309326,
"learning_rate": 2.1137443231804765e-05,
"loss": 1.5361,
"step": 17453
},
{
"epoch": 0.6329393451228121,
"grad_norm": 1.3603419065475464,
"learning_rate": 2.105449118766347e-05,
"loss": 1.5353,
"step": 17484
},
{
"epoch": 0.6340615780042356,
"grad_norm": 1.3611435890197754,
"learning_rate": 2.097158366805287e-05,
"loss": 1.5449,
"step": 17515
},
{
"epoch": 0.6351838108856589,
"grad_norm": 1.3318766355514526,
"learning_rate": 2.0888721608573047e-05,
"loss": 1.5194,
"step": 17546
},
{
"epoch": 0.6363060437670823,
"grad_norm": 1.3144105672836304,
"learning_rate": 2.0805905944311087e-05,
"loss": 1.5288,
"step": 17577
},
{
"epoch": 0.6374282766485058,
"grad_norm": 1.3346774578094482,
"learning_rate": 2.0723137609830497e-05,
"loss": 1.5278,
"step": 17608
},
{
"epoch": 0.6385505095299292,
"grad_norm": 1.4217780828475952,
"learning_rate": 2.0640417539160686e-05,
"loss": 1.5467,
"step": 17639
},
{
"epoch": 0.6396727424113526,
"grad_norm": 1.3335380554199219,
"learning_rate": 2.0557746665786427e-05,
"loss": 1.5506,
"step": 17670
},
{
"epoch": 0.6407949752927761,
"grad_norm": 1.3793307542800903,
"learning_rate": 2.0475125922637256e-05,
"loss": 1.5172,
"step": 17701
},
{
"epoch": 0.6419172081741995,
"grad_norm": 1.3435157537460327,
"learning_rate": 2.0392556242077047e-05,
"loss": 1.5137,
"step": 17732
},
{
"epoch": 0.6430394410556229,
"grad_norm": 1.3066918849945068,
"learning_rate": 2.031003855589343e-05,
"loss": 1.5184,
"step": 17763
},
{
"epoch": 0.6441616739370464,
"grad_norm": 1.4214332103729248,
"learning_rate": 2.022757379528727e-05,
"loss": 1.5239,
"step": 17794
},
{
"epoch": 0.6452839068184698,
"grad_norm": 1.3571085929870605,
"learning_rate": 2.0145162890862184e-05,
"loss": 1.5234,
"step": 17825
},
{
"epoch": 0.6464061396998932,
"grad_norm": 1.2680344581604004,
"learning_rate": 2.0062806772614022e-05,
"loss": 1.5207,
"step": 17856
},
{
"epoch": 0.6475283725813167,
"grad_norm": 1.3365403413772583,
"learning_rate": 1.9980506369920392e-05,
"loss": 1.5457,
"step": 17887
},
{
"epoch": 0.64865060546274,
"grad_norm": 1.3576997518539429,
"learning_rate": 1.989826261153015e-05,
"loss": 1.516,
"step": 17918
},
{
"epoch": 0.6497728383441634,
"grad_norm": 1.3189170360565186,
"learning_rate": 1.9816076425552923e-05,
"loss": 1.5204,
"step": 17949
},
{
"epoch": 0.6508950712255869,
"grad_norm": 1.2855075597763062,
"learning_rate": 1.9733948739448676e-05,
"loss": 1.5131,
"step": 17980
},
{
"epoch": 0.6520173041070103,
"grad_norm": 1.3004227876663208,
"learning_rate": 1.9651880480017155e-05,
"loss": 1.5495,
"step": 18011
},
{
"epoch": 0.6531395369884337,
"grad_norm": 1.3858931064605713,
"learning_rate": 1.9569872573387516e-05,
"loss": 1.529,
"step": 18042
},
{
"epoch": 0.6542617698698572,
"grad_norm": 1.378490686416626,
"learning_rate": 1.9487925945007854e-05,
"loss": 1.5281,
"step": 18073
},
{
"epoch": 0.6553840027512806,
"grad_norm": 1.317062258720398,
"learning_rate": 1.9406041519634726e-05,
"loss": 1.5294,
"step": 18104
},
{
"epoch": 0.656506235632704,
"grad_norm": 1.313314437866211,
"learning_rate": 1.932422022132275e-05,
"loss": 1.5343,
"step": 18135
},
{
"epoch": 0.6576284685141275,
"grad_norm": 1.3339669704437256,
"learning_rate": 1.924246297341414e-05,
"loss": 1.5203,
"step": 18166
},
{
"epoch": 0.6587507013955509,
"grad_norm": 1.298256516456604,
"learning_rate": 1.9160770698528338e-05,
"loss": 1.5297,
"step": 18197
},
{
"epoch": 0.6598729342769744,
"grad_norm": 1.322373628616333,
"learning_rate": 1.907914431855156e-05,
"loss": 1.5307,
"step": 18228
},
{
"epoch": 0.6609951671583978,
"grad_norm": 1.403425931930542,
"learning_rate": 1.8997584754626412e-05,
"loss": 1.5279,
"step": 18259
},
{
"epoch": 0.6621174000398211,
"grad_norm": 1.3005762100219727,
"learning_rate": 1.8916092927141486e-05,
"loss": 1.5325,
"step": 18290
},
{
"epoch": 0.6632396329212445,
"grad_norm": 1.3655368089675903,
"learning_rate": 1.883466975572098e-05,
"loss": 1.54,
"step": 18321
},
{
"epoch": 0.664361865802668,
"grad_norm": 1.376219391822815,
"learning_rate": 1.8753316159214312e-05,
"loss": 1.518,
"step": 18352
},
{
"epoch": 0.6654840986840914,
"grad_norm": 1.3264917135238647,
"learning_rate": 1.8672033055685766e-05,
"loss": 1.5108,
"step": 18383
},
{
"epoch": 0.6666063315655149,
"grad_norm": 1.4083831310272217,
"learning_rate": 1.8590821362404116e-05,
"loss": 1.5252,
"step": 18414
},
{
"epoch": 0.6677285644469383,
"grad_norm": 1.302178978919983,
"learning_rate": 1.8509681995832294e-05,
"loss": 1.4972,
"step": 18445
},
{
"epoch": 0.6688507973283617,
"grad_norm": 1.3290973901748657,
"learning_rate": 1.8428615871617004e-05,
"loss": 1.5343,
"step": 18476
},
{
"epoch": 0.6699730302097852,
"grad_norm": 1.4198294878005981,
"learning_rate": 1.8347623904578448e-05,
"loss": 1.5272,
"step": 18507
},
{
"epoch": 0.6710952630912086,
"grad_norm": 1.2832363843917847,
"learning_rate": 1.8266707008699975e-05,
"loss": 1.5351,
"step": 18538
},
{
"epoch": 0.672217495972632,
"grad_norm": 1.367154836654663,
"learning_rate": 1.818586609711774e-05,
"loss": 1.5236,
"step": 18569
},
{
"epoch": 0.6733397288540555,
"grad_norm": 1.3867367506027222,
"learning_rate": 1.8105102082110462e-05,
"loss": 1.5141,
"step": 18600
},
{
"epoch": 0.6744619617354789,
"grad_norm": 1.3272528648376465,
"learning_rate": 1.8024415875089058e-05,
"loss": 1.5459,
"step": 18631
},
{
"epoch": 0.6755841946169022,
"grad_norm": 1.4012340307235718,
"learning_rate": 1.7943808386586407e-05,
"loss": 1.5082,
"step": 18662
},
{
"epoch": 0.6767064274983257,
"grad_norm": 1.3309136629104614,
"learning_rate": 1.7863280526247073e-05,
"loss": 1.5207,
"step": 18693
},
{
"epoch": 0.6778286603797491,
"grad_norm": 1.3469054698944092,
"learning_rate": 1.7782833202817003e-05,
"loss": 1.5301,
"step": 18724
},
{
"epoch": 0.6789508932611725,
"grad_norm": 1.3786745071411133,
"learning_rate": 1.7702467324133327e-05,
"loss": 1.5236,
"step": 18755
},
{
"epoch": 0.680073126142596,
"grad_norm": 1.3620835542678833,
"learning_rate": 1.7622183797114042e-05,
"loss": 1.5288,
"step": 18786
},
{
"epoch": 0.6811953590240194,
"grad_norm": 1.3298254013061523,
"learning_rate": 1.7541983527747838e-05,
"loss": 1.5208,
"step": 18817
},
{
"epoch": 0.6823175919054428,
"grad_norm": 1.2911970615386963,
"learning_rate": 1.746186742108387e-05,
"loss": 1.5172,
"step": 18848
},
{
"epoch": 0.6834398247868663,
"grad_norm": 1.30719792842865,
"learning_rate": 1.73818363812215e-05,
"loss": 1.5206,
"step": 18879
},
{
"epoch": 0.6845620576682897,
"grad_norm": 1.3682974576950073,
"learning_rate": 1.7301891311300153e-05,
"loss": 1.5126,
"step": 18910
},
{
"epoch": 0.6856842905497131,
"grad_norm": 1.3172578811645508,
"learning_rate": 1.7222033113489055e-05,
"loss": 1.506,
"step": 18941
},
{
"epoch": 0.6868065234311366,
"grad_norm": 1.3976131677627563,
"learning_rate": 1.7142262688977127e-05,
"loss": 1.5161,
"step": 18972
},
{
"epoch": 0.68792875631256,
"grad_norm": 1.3834096193313599,
"learning_rate": 1.7062580937962764e-05,
"loss": 1.5156,
"step": 19003
},
{
"epoch": 0.6890509891939833,
"grad_norm": 1.2939929962158203,
"learning_rate": 1.698298875964369e-05,
"loss": 1.5111,
"step": 19034
},
{
"epoch": 0.6901732220754068,
"grad_norm": 1.416242241859436,
"learning_rate": 1.690348705220684e-05,
"loss": 1.5112,
"step": 19065
},
{
"epoch": 0.6912954549568302,
"grad_norm": 1.4598749876022339,
"learning_rate": 1.6824076712818156e-05,
"loss": 1.5074,
"step": 19096
},
{
"epoch": 0.6924176878382536,
"grad_norm": 1.403602123260498,
"learning_rate": 1.6744758637612533e-05,
"loss": 1.5049,
"step": 19127
},
{
"epoch": 0.6935399207196771,
"grad_norm": 1.328615665435791,
"learning_rate": 1.6665533721683664e-05,
"loss": 1.5182,
"step": 19158
},
{
"epoch": 0.6946621536011005,
"grad_norm": 1.3603520393371582,
"learning_rate": 1.6586402859073974e-05,
"loss": 1.5303,
"step": 19189
},
{
"epoch": 0.6957843864825239,
"grad_norm": 1.4252129793167114,
"learning_rate": 1.6507366942764463e-05,
"loss": 1.5364,
"step": 19220
},
{
"epoch": 0.6969066193639474,
"grad_norm": 1.2863671779632568,
"learning_rate": 1.6428426864664732e-05,
"loss": 1.5243,
"step": 19251
},
{
"epoch": 0.6980288522453708,
"grad_norm": 1.298772931098938,
"learning_rate": 1.6349583515602816e-05,
"loss": 1.5254,
"step": 19282
},
{
"epoch": 0.6991510851267942,
"grad_norm": 1.3208067417144775,
"learning_rate": 1.6270837785315208e-05,
"loss": 1.517,
"step": 19313
},
{
"epoch": 0.7002733180082177,
"grad_norm": 1.4582445621490479,
"learning_rate": 1.619219056243676e-05,
"loss": 1.5156,
"step": 19344
},
{
"epoch": 0.7013955508896411,
"grad_norm": 1.3674423694610596,
"learning_rate": 1.6113642734490698e-05,
"loss": 1.5056,
"step": 19375
},
{
"epoch": 0.7025177837710644,
"grad_norm": 1.289265513420105,
"learning_rate": 1.6035195187878577e-05,
"loss": 1.5151,
"step": 19406
},
{
"epoch": 0.7036400166524879,
"grad_norm": 1.3161633014678955,
"learning_rate": 1.5956848807870305e-05,
"loss": 1.5206,
"step": 19437
},
{
"epoch": 0.7047622495339113,
"grad_norm": 1.3161797523498535,
"learning_rate": 1.587860447859413e-05,
"loss": 1.5132,
"step": 19468
},
{
"epoch": 0.7058844824153347,
"grad_norm": 1.3772165775299072,
"learning_rate": 1.5800463083026686e-05,
"loss": 1.5273,
"step": 19499
},
{
"epoch": 0.7070067152967582,
"grad_norm": 1.3191962242126465,
"learning_rate": 1.572242550298298e-05,
"loss": 1.5238,
"step": 19530
},
{
"epoch": 0.7081289481781816,
"grad_norm": 1.3758587837219238,
"learning_rate": 1.56444926191065e-05,
"loss": 1.5242,
"step": 19561
},
{
"epoch": 0.709251181059605,
"grad_norm": 1.3456153869628906,
"learning_rate": 1.5566665310859257e-05,
"loss": 1.5109,
"step": 19592
},
{
"epoch": 0.7103734139410285,
"grad_norm": 1.3654590845108032,
"learning_rate": 1.5488944456511846e-05,
"loss": 1.5092,
"step": 19623
},
{
"epoch": 0.7114956468224519,
"grad_norm": 1.2868263721466064,
"learning_rate": 1.5411330933133546e-05,
"loss": 1.534,
"step": 19654
},
{
"epoch": 0.7126178797038754,
"grad_norm": 1.3140943050384521,
"learning_rate": 1.533382561658241e-05,
"loss": 1.5381,
"step": 19685
},
{
"epoch": 0.7137401125852988,
"grad_norm": 1.353061556816101,
"learning_rate": 1.525642938149541e-05,
"loss": 1.5133,
"step": 19716
},
{
"epoch": 0.7148623454667222,
"grad_norm": 1.378933072090149,
"learning_rate": 1.5179143101278536e-05,
"loss": 1.514,
"step": 19747
},
{
"epoch": 0.7159845783481456,
"grad_norm": 1.3969671726226807,
"learning_rate": 1.5101967648096955e-05,
"loss": 1.5255,
"step": 19778
},
{
"epoch": 0.717106811229569,
"grad_norm": 1.3627468347549438,
"learning_rate": 1.5024903892865172e-05,
"loss": 1.5168,
"step": 19809
},
{
"epoch": 0.7182290441109924,
"grad_norm": 1.3613289594650269,
"learning_rate": 1.4947952705237184e-05,
"loss": 1.532,
"step": 19840
},
{
"epoch": 0.7193512769924159,
"grad_norm": 1.3214402198791504,
"learning_rate": 1.4871114953596682e-05,
"loss": 1.5236,
"step": 19871
},
{
"epoch": 0.7204735098738393,
"grad_norm": 1.3939237594604492,
"learning_rate": 1.4794391505047256e-05,
"loss": 1.521,
"step": 19902
},
{
"epoch": 0.7215957427552627,
"grad_norm": 1.384696364402771,
"learning_rate": 1.4717783225402596e-05,
"loss": 1.5118,
"step": 19933
},
{
"epoch": 0.7227179756366862,
"grad_norm": 1.286145806312561,
"learning_rate": 1.4641290979176735e-05,
"loss": 1.522,
"step": 19964
},
{
"epoch": 0.7238402085181096,
"grad_norm": 1.380027413368225,
"learning_rate": 1.4564915629574246e-05,
"loss": 1.5147,
"step": 19995
},
{
"epoch": 0.724962441399533,
"grad_norm": 1.372430443763733,
"learning_rate": 1.4488658038480601e-05,
"loss": 1.5132,
"step": 20026
},
{
"epoch": 0.7260846742809565,
"grad_norm": 1.3200669288635254,
"learning_rate": 1.4412519066452323e-05,
"loss": 1.4935,
"step": 20057
},
{
"epoch": 0.7272069071623799,
"grad_norm": 1.3791152238845825,
"learning_rate": 1.4336499572707373e-05,
"loss": 1.5242,
"step": 20088
},
{
"epoch": 0.7283291400438033,
"grad_norm": 1.287310004234314,
"learning_rate": 1.4260600415115433e-05,
"loss": 1.5098,
"step": 20119
},
{
"epoch": 0.7294513729252267,
"grad_norm": 1.307353138923645,
"learning_rate": 1.4184822450188137e-05,
"loss": 1.5098,
"step": 20150
},
{
"epoch": 0.7305736058066501,
"grad_norm": 1.3487526178359985,
"learning_rate": 1.410916653306954e-05,
"loss": 1.5167,
"step": 20181
},
{
"epoch": 0.7316958386880735,
"grad_norm": 1.3626441955566406,
"learning_rate": 1.403363351752639e-05,
"loss": 1.5005,
"step": 20212
},
{
"epoch": 0.732818071569497,
"grad_norm": 1.3192275762557983,
"learning_rate": 1.3958224255938485e-05,
"loss": 1.5191,
"step": 20243
},
{
"epoch": 0.7339403044509204,
"grad_norm": 1.336755633354187,
"learning_rate": 1.388293959928911e-05,
"loss": 1.5223,
"step": 20274
},
{
"epoch": 0.7350625373323438,
"grad_norm": 1.3645100593566895,
"learning_rate": 1.3807780397155379e-05,
"loss": 1.5156,
"step": 20305
},
{
"epoch": 0.7361847702137673,
"grad_norm": 1.3681402206420898,
"learning_rate": 1.3732747497698655e-05,
"loss": 1.5065,
"step": 20336
},
{
"epoch": 0.7373070030951907,
"grad_norm": 1.3669005632400513,
"learning_rate": 1.3657841747655038e-05,
"loss": 1.5148,
"step": 20367
},
{
"epoch": 0.7384292359766141,
"grad_norm": 1.349400281906128,
"learning_rate": 1.3583063992325706e-05,
"loss": 1.5234,
"step": 20398
},
{
"epoch": 0.7395514688580376,
"grad_norm": 1.3764326572418213,
"learning_rate": 1.3508415075567496e-05,
"loss": 1.5019,
"step": 20429
},
{
"epoch": 0.740673701739461,
"grad_norm": 1.5416663885116577,
"learning_rate": 1.343389583978327e-05,
"loss": 1.5188,
"step": 20460
},
{
"epoch": 0.7417959346208844,
"grad_norm": 1.3264429569244385,
"learning_rate": 1.3359507125912468e-05,
"loss": 1.5041,
"step": 20491
},
{
"epoch": 0.7429181675023078,
"grad_norm": 1.3554550409317017,
"learning_rate": 1.3285249773421627e-05,
"loss": 1.5207,
"step": 20522
},
{
"epoch": 0.7440404003837312,
"grad_norm": 1.31184983253479,
"learning_rate": 1.3211124620294884e-05,
"loss": 1.5257,
"step": 20553
},
{
"epoch": 0.7451626332651546,
"grad_norm": 1.3225113153457642,
"learning_rate": 1.313713250302451e-05,
"loss": 1.5196,
"step": 20584
},
{
"epoch": 0.7462848661465781,
"grad_norm": 1.3386696577072144,
"learning_rate": 1.3063274256601479e-05,
"loss": 1.5174,
"step": 20615
},
{
"epoch": 0.7474070990280015,
"grad_norm": 1.423807978630066,
"learning_rate": 1.2989550714506086e-05,
"loss": 1.4968,
"step": 20646
},
{
"epoch": 0.7485293319094249,
"grad_norm": 1.2833530902862549,
"learning_rate": 1.291596270869846e-05,
"loss": 1.491,
"step": 20677
},
{
"epoch": 0.7496515647908484,
"grad_norm": 1.2796401977539062,
"learning_rate": 1.284251106960927e-05,
"loss": 1.5062,
"step": 20708
},
{
"epoch": 0.7507737976722718,
"grad_norm": 1.3797061443328857,
"learning_rate": 1.2769196626130263e-05,
"loss": 1.5152,
"step": 20739
},
{
"epoch": 0.7518960305536952,
"grad_norm": 1.4489312171936035,
"learning_rate": 1.2696020205604969e-05,
"loss": 1.5122,
"step": 20770
},
{
"epoch": 0.7530182634351187,
"grad_norm": 1.3305705785751343,
"learning_rate": 1.2622982633819359e-05,
"loss": 1.5143,
"step": 20801
},
{
"epoch": 0.7541404963165421,
"grad_norm": 1.3734405040740967,
"learning_rate": 1.2550084734992484e-05,
"loss": 1.513,
"step": 20832
},
{
"epoch": 0.7552627291979656,
"grad_norm": 1.2886455059051514,
"learning_rate": 1.247732733176724e-05,
"loss": 1.489,
"step": 20863
},
{
"epoch": 0.7563849620793889,
"grad_norm": 1.4357209205627441,
"learning_rate": 1.2404711245201044e-05,
"loss": 1.5179,
"step": 20894
},
{
"epoch": 0.7575071949608123,
"grad_norm": 1.294068455696106,
"learning_rate": 1.2332237294756535e-05,
"loss": 1.5151,
"step": 20925
},
{
"epoch": 0.7586294278422357,
"grad_norm": 1.3966395854949951,
"learning_rate": 1.225990629829241e-05,
"loss": 1.5127,
"step": 20956
},
{
"epoch": 0.7597516607236592,
"grad_norm": 1.3190878629684448,
"learning_rate": 1.2187719072054136e-05,
"loss": 1.5063,
"step": 20987
},
{
"epoch": 0.7608738936050826,
"grad_norm": 1.2927324771881104,
"learning_rate": 1.2115676430664735e-05,
"loss": 1.4923,
"step": 21018
},
{
"epoch": 0.761996126486506,
"grad_norm": 1.3363546133041382,
"learning_rate": 1.2043779187115647e-05,
"loss": 1.4928,
"step": 21049
},
{
"epoch": 0.7631183593679295,
"grad_norm": 1.322825312614441,
"learning_rate": 1.1972028152757476e-05,
"loss": 1.5116,
"step": 21080
},
{
"epoch": 0.7642405922493529,
"grad_norm": 1.396026372909546,
"learning_rate": 1.1900424137290889e-05,
"loss": 1.5089,
"step": 21111
},
{
"epoch": 0.7653628251307764,
"grad_norm": 1.358963966369629,
"learning_rate": 1.1828967948757482e-05,
"loss": 1.505,
"step": 21142
},
{
"epoch": 0.7664850580121998,
"grad_norm": 1.3169891834259033,
"learning_rate": 1.175766039353062e-05,
"loss": 1.5115,
"step": 21173
},
{
"epoch": 0.7676072908936232,
"grad_norm": 1.3406434059143066,
"learning_rate": 1.1686502276306382e-05,
"loss": 1.5093,
"step": 21204
},
{
"epoch": 0.7687295237750467,
"grad_norm": 1.3709667921066284,
"learning_rate": 1.1615494400094445e-05,
"loss": 1.5017,
"step": 21235
},
{
"epoch": 0.76985175665647,
"grad_norm": 1.4957972764968872,
"learning_rate": 1.1544637566209029e-05,
"loss": 1.5121,
"step": 21266
},
{
"epoch": 0.7709739895378934,
"grad_norm": 1.3525892496109009,
"learning_rate": 1.1473932574259886e-05,
"loss": 1.4934,
"step": 21297
},
{
"epoch": 0.7720962224193169,
"grad_norm": 1.3251068592071533,
"learning_rate": 1.1403380222143247e-05,
"loss": 1.4858,
"step": 21328
},
{
"epoch": 0.7732184553007403,
"grad_norm": 1.3417954444885254,
"learning_rate": 1.1332981306032808e-05,
"loss": 1.5119,
"step": 21359
}
],
"logging_steps": 31,
"max_steps": 30517,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3052,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5847615378155897e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}