{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 916,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001091703056768559,
"grad_norm": 13.289404298642646,
"learning_rate": 5.000000000000001e-07,
"loss": 1.284,
"step": 1
},
{
"epoch": 0.002183406113537118,
"grad_norm": 7.292217497278142,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.2399,
"step": 2
},
{
"epoch": 0.0032751091703056767,
"grad_norm": 9.8568572384971,
"learning_rate": 1.5e-06,
"loss": 1.2389,
"step": 3
},
{
"epoch": 0.004366812227074236,
"grad_norm": 9.989204669156571,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2508,
"step": 4
},
{
"epoch": 0.0054585152838427945,
"grad_norm": 7.838284019029892,
"learning_rate": 2.5e-06,
"loss": 1.1875,
"step": 5
},
{
"epoch": 0.006550218340611353,
"grad_norm": 4.602193197170571,
"learning_rate": 3e-06,
"loss": 1.1885,
"step": 6
},
{
"epoch": 0.007641921397379912,
"grad_norm": 7.670062089782238,
"learning_rate": 3.5e-06,
"loss": 1.1405,
"step": 7
},
{
"epoch": 0.008733624454148471,
"grad_norm": 6.4393475774820095,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1794,
"step": 8
},
{
"epoch": 0.009825327510917031,
"grad_norm": 5.301388008979529,
"learning_rate": 4.5e-06,
"loss": 1.1664,
"step": 9
},
{
"epoch": 0.010917030567685589,
"grad_norm": 4.776716991989775,
"learning_rate": 5e-06,
"loss": 1.1303,
"step": 10
},
{
"epoch": 0.012008733624454149,
"grad_norm": 2.5583402498764554,
"learning_rate": 5.500000000000001e-06,
"loss": 1.0836,
"step": 11
},
{
"epoch": 0.013100436681222707,
"grad_norm": 2.901999707087547,
"learning_rate": 6e-06,
"loss": 1.0848,
"step": 12
},
{
"epoch": 0.014192139737991267,
"grad_norm": 1.972981929251036,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.1405,
"step": 13
},
{
"epoch": 0.015283842794759825,
"grad_norm": 2.2759650970605385,
"learning_rate": 7e-06,
"loss": 1.1014,
"step": 14
},
{
"epoch": 0.016375545851528384,
"grad_norm": 2.0304488449996088,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0181,
"step": 15
},
{
"epoch": 0.017467248908296942,
"grad_norm": 1.662782341007406,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0578,
"step": 16
},
{
"epoch": 0.018558951965065504,
"grad_norm": 1.5360585531700661,
"learning_rate": 8.5e-06,
"loss": 1.0547,
"step": 17
},
{
"epoch": 0.019650655021834062,
"grad_norm": 1.4473228929395678,
"learning_rate": 9e-06,
"loss": 1.0592,
"step": 18
},
{
"epoch": 0.02074235807860262,
"grad_norm": 1.5813095248190974,
"learning_rate": 9.5e-06,
"loss": 1.0583,
"step": 19
},
{
"epoch": 0.021834061135371178,
"grad_norm": 1.5411563366540608,
"learning_rate": 1e-05,
"loss": 1.0476,
"step": 20
},
{
"epoch": 0.02292576419213974,
"grad_norm": 1.5268980990168357,
"learning_rate": 1.0500000000000001e-05,
"loss": 1.0136,
"step": 21
},
{
"epoch": 0.024017467248908297,
"grad_norm": 1.6136299704496118,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.0151,
"step": 22
},
{
"epoch": 0.025109170305676855,
"grad_norm": 1.4361573100654315,
"learning_rate": 1.15e-05,
"loss": 0.9862,
"step": 23
},
{
"epoch": 0.026200873362445413,
"grad_norm": 1.498362096620755,
"learning_rate": 1.2e-05,
"loss": 1.0541,
"step": 24
},
{
"epoch": 0.027292576419213975,
"grad_norm": 1.4737481885978707,
"learning_rate": 1.25e-05,
"loss": 1.0067,
"step": 25
},
{
"epoch": 0.028384279475982533,
"grad_norm": 1.3136557951278753,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.0575,
"step": 26
},
{
"epoch": 0.02947598253275109,
"grad_norm": 1.370294058486715,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.0432,
"step": 27
},
{
"epoch": 0.03056768558951965,
"grad_norm": 1.4165018727433811,
"learning_rate": 1.4e-05,
"loss": 0.9864,
"step": 28
},
{
"epoch": 0.03165938864628821,
"grad_norm": 1.4047679041248446,
"learning_rate": 1.45e-05,
"loss": 0.964,
"step": 29
},
{
"epoch": 0.03275109170305677,
"grad_norm": 1.3627840469204053,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.976,
"step": 30
},
{
"epoch": 0.03384279475982533,
"grad_norm": 1.3177557349824027,
"learning_rate": 1.55e-05,
"loss": 0.9612,
"step": 31
},
{
"epoch": 0.034934497816593885,
"grad_norm": 1.2982379287553165,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.8974,
"step": 32
},
{
"epoch": 0.036026200873362446,
"grad_norm": 1.2526530829979805,
"learning_rate": 1.65e-05,
"loss": 0.9451,
"step": 33
},
{
"epoch": 0.03711790393013101,
"grad_norm": 1.2621127637568288,
"learning_rate": 1.7e-05,
"loss": 0.9786,
"step": 34
},
{
"epoch": 0.03820960698689956,
"grad_norm": 1.2146341024686296,
"learning_rate": 1.7500000000000002e-05,
"loss": 1.0163,
"step": 35
},
{
"epoch": 0.039301310043668124,
"grad_norm": 1.2607075617736583,
"learning_rate": 1.8e-05,
"loss": 0.9784,
"step": 36
},
{
"epoch": 0.04039301310043668,
"grad_norm": 1.279591496396912,
"learning_rate": 1.8500000000000002e-05,
"loss": 0.977,
"step": 37
},
{
"epoch": 0.04148471615720524,
"grad_norm": 1.2430348905926205,
"learning_rate": 1.9e-05,
"loss": 0.9571,
"step": 38
},
{
"epoch": 0.0425764192139738,
"grad_norm": 1.2918696702298937,
"learning_rate": 1.95e-05,
"loss": 0.9074,
"step": 39
},
{
"epoch": 0.043668122270742356,
"grad_norm": 1.218492845990098,
"learning_rate": 2e-05,
"loss": 0.9394,
"step": 40
},
{
"epoch": 0.04475982532751092,
"grad_norm": 1.2012319777333658,
"learning_rate": 1.999998463283948e-05,
"loss": 0.9207,
"step": 41
},
{
"epoch": 0.04585152838427948,
"grad_norm": 1.144437778164406,
"learning_rate": 1.9999938531405142e-05,
"loss": 0.9119,
"step": 42
},
{
"epoch": 0.04694323144104803,
"grad_norm": 1.1570563596566175,
"learning_rate": 1.9999861695838682e-05,
"loss": 0.9513,
"step": 43
},
{
"epoch": 0.048034934497816595,
"grad_norm": 1.210352870960915,
"learning_rate": 1.9999754126376247e-05,
"loss": 1.0351,
"step": 44
},
{
"epoch": 0.04912663755458515,
"grad_norm": 1.1710603987199897,
"learning_rate": 1.9999615823348444e-05,
"loss": 0.9141,
"step": 45
},
{
"epoch": 0.05021834061135371,
"grad_norm": 1.206539628486198,
"learning_rate": 1.9999446787180338e-05,
"loss": 0.8935,
"step": 46
},
{
"epoch": 0.05131004366812227,
"grad_norm": 1.152816325560954,
"learning_rate": 1.999924701839145e-05,
"loss": 0.935,
"step": 47
},
{
"epoch": 0.05240174672489083,
"grad_norm": 1.2244741767187948,
"learning_rate": 1.9999016517595752e-05,
"loss": 0.9221,
"step": 48
},
{
"epoch": 0.05349344978165939,
"grad_norm": 1.268582156983677,
"learning_rate": 1.999875528550168e-05,
"loss": 0.9813,
"step": 49
},
{
"epoch": 0.05458515283842795,
"grad_norm": 1.3643851641335847,
"learning_rate": 1.999846332291211e-05,
"loss": 0.9756,
"step": 50
},
{
"epoch": 0.055676855895196505,
"grad_norm": 1.2528094271196928,
"learning_rate": 1.9998140630724365e-05,
"loss": 0.9473,
"step": 51
},
{
"epoch": 0.056768558951965066,
"grad_norm": 1.2550897534969043,
"learning_rate": 1.9997787209930222e-05,
"loss": 0.994,
"step": 52
},
{
"epoch": 0.05786026200873363,
"grad_norm": 1.2492909440859432,
"learning_rate": 1.9997403061615898e-05,
"loss": 0.9755,
"step": 53
},
{
"epoch": 0.05895196506550218,
"grad_norm": 1.3242409227473286,
"learning_rate": 1.9996988186962044e-05,
"loss": 0.9597,
"step": 54
},
{
"epoch": 0.060043668122270744,
"grad_norm": 1.2218569673827555,
"learning_rate": 1.9996542587243747e-05,
"loss": 0.903,
"step": 55
},
{
"epoch": 0.0611353711790393,
"grad_norm": 1.2731882020274534,
"learning_rate": 1.9996066263830533e-05,
"loss": 0.9958,
"step": 56
},
{
"epoch": 0.06222707423580786,
"grad_norm": 1.4086693673102497,
"learning_rate": 1.999555921818634e-05,
"loss": 1.0131,
"step": 57
},
{
"epoch": 0.06331877729257641,
"grad_norm": 1.1634949732644826,
"learning_rate": 1.9995021451869548e-05,
"loss": 0.9712,
"step": 58
},
{
"epoch": 0.06441048034934498,
"grad_norm": 1.2690994660555552,
"learning_rate": 1.9994452966532943e-05,
"loss": 0.9359,
"step": 59
},
{
"epoch": 0.06550218340611354,
"grad_norm": 1.1985992015767803,
"learning_rate": 1.9993853763923724e-05,
"loss": 0.9378,
"step": 60
},
{
"epoch": 0.0665938864628821,
"grad_norm": 1.1335004730759732,
"learning_rate": 1.9993223845883496e-05,
"loss": 0.9377,
"step": 61
},
{
"epoch": 0.06768558951965066,
"grad_norm": 1.2705905274575864,
"learning_rate": 1.999256321434828e-05,
"loss": 1.0114,
"step": 62
},
{
"epoch": 0.06877729257641921,
"grad_norm": 1.1788654125970075,
"learning_rate": 1.999187187134847e-05,
"loss": 0.931,
"step": 63
},
{
"epoch": 0.06986899563318777,
"grad_norm": 1.2619228641910007,
"learning_rate": 1.999114981900887e-05,
"loss": 0.9497,
"step": 64
},
{
"epoch": 0.07096069868995633,
"grad_norm": 1.2234170872884378,
"learning_rate": 1.9990397059548655e-05,
"loss": 0.9863,
"step": 65
},
{
"epoch": 0.07205240174672489,
"grad_norm": 1.1757146049009946,
"learning_rate": 1.9989613595281384e-05,
"loss": 0.9152,
"step": 66
},
{
"epoch": 0.07314410480349345,
"grad_norm": 1.1437402318653815,
"learning_rate": 1.998879942861498e-05,
"loss": 0.9346,
"step": 67
},
{
"epoch": 0.07423580786026202,
"grad_norm": 1.2068538367952386,
"learning_rate": 1.9987954562051724e-05,
"loss": 0.9206,
"step": 68
},
{
"epoch": 0.07532751091703056,
"grad_norm": 1.17298454542619,
"learning_rate": 1.9987078998188264e-05,
"loss": 0.8971,
"step": 69
},
{
"epoch": 0.07641921397379912,
"grad_norm": 1.2149320044056757,
"learning_rate": 1.998617273971558e-05,
"loss": 0.9663,
"step": 70
},
{
"epoch": 0.07751091703056769,
"grad_norm": 1.2081292741308731,
"learning_rate": 1.9985235789418995e-05,
"loss": 0.9423,
"step": 71
},
{
"epoch": 0.07860262008733625,
"grad_norm": 1.143031294548552,
"learning_rate": 1.998426815017817e-05,
"loss": 0.8929,
"step": 72
},
{
"epoch": 0.07969432314410481,
"grad_norm": 1.1968165551994472,
"learning_rate": 1.998326982496707e-05,
"loss": 0.918,
"step": 73
},
{
"epoch": 0.08078602620087336,
"grad_norm": 1.1833288825915689,
"learning_rate": 1.9982240816853983e-05,
"loss": 0.9261,
"step": 74
},
{
"epoch": 0.08187772925764192,
"grad_norm": 1.217048005558642,
"learning_rate": 1.998118112900149e-05,
"loss": 0.9599,
"step": 75
},
{
"epoch": 0.08296943231441048,
"grad_norm": 1.1480745598282631,
"learning_rate": 1.9980090764666486e-05,
"loss": 0.9612,
"step": 76
},
{
"epoch": 0.08406113537117904,
"grad_norm": 1.1471554189545394,
"learning_rate": 1.9978969727200115e-05,
"loss": 0.9281,
"step": 77
},
{
"epoch": 0.0851528384279476,
"grad_norm": 1.1830446201397407,
"learning_rate": 1.9977818020047816e-05,
"loss": 0.9312,
"step": 78
},
{
"epoch": 0.08624454148471616,
"grad_norm": 1.219436681873757,
"learning_rate": 1.9976635646749286e-05,
"loss": 0.9563,
"step": 79
},
{
"epoch": 0.08733624454148471,
"grad_norm": 1.19047205985711,
"learning_rate": 1.9975422610938463e-05,
"loss": 0.9471,
"step": 80
},
{
"epoch": 0.08842794759825327,
"grad_norm": 1.1708685165258725,
"learning_rate": 1.997417891634353e-05,
"loss": 0.9584,
"step": 81
},
{
"epoch": 0.08951965065502183,
"grad_norm": 1.2011650126934366,
"learning_rate": 1.9972904566786903e-05,
"loss": 0.9019,
"step": 82
},
{
"epoch": 0.0906113537117904,
"grad_norm": 1.2862256431550625,
"learning_rate": 1.9971599566185205e-05,
"loss": 0.9516,
"step": 83
},
{
"epoch": 0.09170305676855896,
"grad_norm": 1.107666246039431,
"learning_rate": 1.9970263918549274e-05,
"loss": 0.8906,
"step": 84
},
{
"epoch": 0.0927947598253275,
"grad_norm": 1.2388104557486592,
"learning_rate": 1.996889762798412e-05,
"loss": 0.9032,
"step": 85
},
{
"epoch": 0.09388646288209607,
"grad_norm": 1.2604186198564908,
"learning_rate": 1.996750069868895e-05,
"loss": 0.9896,
"step": 86
},
{
"epoch": 0.09497816593886463,
"grad_norm": 1.1374917097605022,
"learning_rate": 1.9966073134957137e-05,
"loss": 0.8745,
"step": 87
},
{
"epoch": 0.09606986899563319,
"grad_norm": 1.1700957445316564,
"learning_rate": 1.9964614941176194e-05,
"loss": 0.9068,
"step": 88
},
{
"epoch": 0.09716157205240175,
"grad_norm": 1.1538371134746046,
"learning_rate": 1.996312612182778e-05,
"loss": 0.9632,
"step": 89
},
{
"epoch": 0.0982532751091703,
"grad_norm": 1.160813654766113,
"learning_rate": 1.9961606681487685e-05,
"loss": 0.9592,
"step": 90
},
{
"epoch": 0.09934497816593886,
"grad_norm": 1.1687218710757636,
"learning_rate": 1.996005662482581e-05,
"loss": 0.9314,
"step": 91
},
{
"epoch": 0.10043668122270742,
"grad_norm": 1.1625020584138288,
"learning_rate": 1.9958475956606133e-05,
"loss": 0.9364,
"step": 92
},
{
"epoch": 0.10152838427947598,
"grad_norm": 1.1942587572603351,
"learning_rate": 1.9956864681686746e-05,
"loss": 0.9535,
"step": 93
},
{
"epoch": 0.10262008733624454,
"grad_norm": 1.179796870351773,
"learning_rate": 1.9955222805019786e-05,
"loss": 0.9419,
"step": 94
},
{
"epoch": 0.1037117903930131,
"grad_norm": 1.2186746600859906,
"learning_rate": 1.995355033165145e-05,
"loss": 0.9423,
"step": 95
},
{
"epoch": 0.10480349344978165,
"grad_norm": 1.174682198448618,
"learning_rate": 1.995184726672197e-05,
"loss": 0.9489,
"step": 96
},
{
"epoch": 0.10589519650655022,
"grad_norm": 1.2307959174474505,
"learning_rate": 1.9950113615465604e-05,
"loss": 0.8974,
"step": 97
},
{
"epoch": 0.10698689956331878,
"grad_norm": 1.1772887705652375,
"learning_rate": 1.994834938321061e-05,
"loss": 0.9524,
"step": 98
},
{
"epoch": 0.10807860262008734,
"grad_norm": 2.155776889552603,
"learning_rate": 1.9946554575379236e-05,
"loss": 1.0222,
"step": 99
},
{
"epoch": 0.1091703056768559,
"grad_norm": 1.2829049345821588,
"learning_rate": 1.9944729197487702e-05,
"loss": 0.8802,
"step": 100
},
{
"epoch": 0.11026200873362445,
"grad_norm": 1.095198322268674,
"learning_rate": 1.9942873255146186e-05,
"loss": 0.898,
"step": 101
},
{
"epoch": 0.11135371179039301,
"grad_norm": 1.123579265286952,
"learning_rate": 1.9940986754058792e-05,
"loss": 0.9071,
"step": 102
},
{
"epoch": 0.11244541484716157,
"grad_norm": 1.1093669744258132,
"learning_rate": 1.9939069700023564e-05,
"loss": 0.9566,
"step": 103
},
{
"epoch": 0.11353711790393013,
"grad_norm": 1.1641010979372868,
"learning_rate": 1.9937122098932428e-05,
"loss": 0.8996,
"step": 104
},
{
"epoch": 0.1146288209606987,
"grad_norm": 1.073655782753635,
"learning_rate": 1.9935143956771208e-05,
"loss": 0.9215,
"step": 105
},
{
"epoch": 0.11572052401746726,
"grad_norm": 1.1599093450874336,
"learning_rate": 1.9933135279619592e-05,
"loss": 0.9647,
"step": 106
},
{
"epoch": 0.1168122270742358,
"grad_norm": 1.1060908821324016,
"learning_rate": 1.993109607365111e-05,
"loss": 0.8704,
"step": 107
},
{
"epoch": 0.11790393013100436,
"grad_norm": 1.2199264081103358,
"learning_rate": 1.992902634513312e-05,
"loss": 0.9317,
"step": 108
},
{
"epoch": 0.11899563318777293,
"grad_norm": 1.076819952797705,
"learning_rate": 1.99269261004268e-05,
"loss": 0.8867,
"step": 109
},
{
"epoch": 0.12008733624454149,
"grad_norm": 1.1167800769323946,
"learning_rate": 1.9924795345987103e-05,
"loss": 0.9363,
"step": 110
},
{
"epoch": 0.12117903930131005,
"grad_norm": 1.166793744032167,
"learning_rate": 1.992263408836276e-05,
"loss": 0.9018,
"step": 111
},
{
"epoch": 0.1222707423580786,
"grad_norm": 1.1612724847460054,
"learning_rate": 1.9920442334196248e-05,
"loss": 0.9855,
"step": 112
},
{
"epoch": 0.12336244541484716,
"grad_norm": 1.1715585023326405,
"learning_rate": 1.9918220090223778e-05,
"loss": 0.9389,
"step": 113
},
{
"epoch": 0.12445414847161572,
"grad_norm": 1.082870769593479,
"learning_rate": 1.9915967363275264e-05,
"loss": 0.9018,
"step": 114
},
{
"epoch": 0.12554585152838427,
"grad_norm": 1.0926308702520453,
"learning_rate": 1.991368416027431e-05,
"loss": 0.923,
"step": 115
},
{
"epoch": 0.12663755458515283,
"grad_norm": 1.109680296339338,
"learning_rate": 1.9911370488238185e-05,
"loss": 0.9309,
"step": 116
},
{
"epoch": 0.1277292576419214,
"grad_norm": 1.1377031719748456,
"learning_rate": 1.99090263542778e-05,
"loss": 0.9221,
"step": 117
},
{
"epoch": 0.12882096069868995,
"grad_norm": 1.140021798184673,
"learning_rate": 1.99066517655977e-05,
"loss": 0.9459,
"step": 118
},
{
"epoch": 0.1299126637554585,
"grad_norm": 1.100102085760919,
"learning_rate": 1.990424672949601e-05,
"loss": 0.9494,
"step": 119
},
{
"epoch": 0.13100436681222707,
"grad_norm": 3.398393246795348,
"learning_rate": 1.9901811253364458e-05,
"loss": 0.9903,
"step": 120
},
{
"epoch": 0.13209606986899564,
"grad_norm": 1.1532733341471273,
"learning_rate": 1.9899345344688305e-05,
"loss": 0.9165,
"step": 121
},
{
"epoch": 0.1331877729257642,
"grad_norm": 1.1660734374963202,
"learning_rate": 1.9896849011046356e-05,
"loss": 0.9208,
"step": 122
},
{
"epoch": 0.13427947598253276,
"grad_norm": 1.2457338239550233,
"learning_rate": 1.9894322260110927e-05,
"loss": 0.9177,
"step": 123
},
{
"epoch": 0.13537117903930132,
"grad_norm": 1.1844092920395781,
"learning_rate": 1.989176509964781e-05,
"loss": 0.9356,
"step": 124
},
{
"epoch": 0.13646288209606988,
"grad_norm": 1.1588958575251505,
"learning_rate": 1.988917753751627e-05,
"loss": 0.8881,
"step": 125
},
{
"epoch": 0.13755458515283842,
"grad_norm": 1.1697350579685266,
"learning_rate": 1.9886559581669e-05,
"loss": 0.8726,
"step": 126
},
{
"epoch": 0.13864628820960698,
"grad_norm": 1.153883284055616,
"learning_rate": 1.9883911240152104e-05,
"loss": 0.945,
"step": 127
},
{
"epoch": 0.13973799126637554,
"grad_norm": 1.152362270438983,
"learning_rate": 1.988123252110509e-05,
"loss": 0.9029,
"step": 128
},
{
"epoch": 0.1408296943231441,
"grad_norm": 1.1501233711862964,
"learning_rate": 1.987852343276081e-05,
"loss": 0.9192,
"step": 129
},
{
"epoch": 0.14192139737991266,
"grad_norm": 1.174818934233749,
"learning_rate": 1.9875783983445473e-05,
"loss": 0.9452,
"step": 130
},
{
"epoch": 0.14301310043668122,
"grad_norm": 1.0908738128846232,
"learning_rate": 1.9873014181578588e-05,
"loss": 0.8974,
"step": 131
},
{
"epoch": 0.14410480349344978,
"grad_norm": 1.138767021515309,
"learning_rate": 1.9870214035672945e-05,
"loss": 0.9276,
"step": 132
},
{
"epoch": 0.14519650655021835,
"grad_norm": 1.0929665225790037,
"learning_rate": 1.9867383554334606e-05,
"loss": 0.9654,
"step": 133
},
{
"epoch": 0.1462882096069869,
"grad_norm": 1.07283100414792,
"learning_rate": 1.9864522746262867e-05,
"loss": 0.8918,
"step": 134
},
{
"epoch": 0.14737991266375547,
"grad_norm": 1.1601439978914436,
"learning_rate": 1.9861631620250224e-05,
"loss": 0.9745,
"step": 135
},
{
"epoch": 0.14847161572052403,
"grad_norm": 1.1206737520514654,
"learning_rate": 1.985871018518236e-05,
"loss": 0.9347,
"step": 136
},
{
"epoch": 0.14956331877729256,
"grad_norm": 1.0911753315113302,
"learning_rate": 1.9855758450038104e-05,
"loss": 0.8781,
"step": 137
},
{
"epoch": 0.15065502183406113,
"grad_norm": 1.0928800650594352,
"learning_rate": 1.9852776423889414e-05,
"loss": 0.89,
"step": 138
},
{
"epoch": 0.1517467248908297,
"grad_norm": 1.087821888975063,
"learning_rate": 1.9849764115901347e-05,
"loss": 0.8416,
"step": 139
},
{
"epoch": 0.15283842794759825,
"grad_norm": 1.1367000509515228,
"learning_rate": 1.984672153533202e-05,
"loss": 0.9285,
"step": 140
},
{
"epoch": 0.1539301310043668,
"grad_norm": 1.0777067492755554,
"learning_rate": 1.9843648691532608e-05,
"loss": 0.8983,
"step": 141
},
{
"epoch": 0.15502183406113537,
"grad_norm": 1.1252760246699316,
"learning_rate": 1.9840545593947286e-05,
"loss": 0.9037,
"step": 142
},
{
"epoch": 0.15611353711790393,
"grad_norm": 1.1026718224889325,
"learning_rate": 1.9837412252113208e-05,
"loss": 0.871,
"step": 143
},
{
"epoch": 0.1572052401746725,
"grad_norm": 1.09520601629096,
"learning_rate": 1.9834248675660484e-05,
"loss": 0.8835,
"step": 144
},
{
"epoch": 0.15829694323144106,
"grad_norm": 1.1334330913011768,
"learning_rate": 1.9831054874312167e-05,
"loss": 0.896,
"step": 145
},
{
"epoch": 0.15938864628820962,
"grad_norm": 1.120111966883419,
"learning_rate": 1.9827830857884173e-05,
"loss": 0.9636,
"step": 146
},
{
"epoch": 0.16048034934497818,
"grad_norm": 1.1400337285874271,
"learning_rate": 1.9824576636285306e-05,
"loss": 0.892,
"step": 147
},
{
"epoch": 0.1615720524017467,
"grad_norm": 1.1111987618475436,
"learning_rate": 1.982129221951719e-05,
"loss": 0.9267,
"step": 148
},
{
"epoch": 0.16266375545851527,
"grad_norm": 1.0890011222581428,
"learning_rate": 1.9817977617674263e-05,
"loss": 0.8851,
"step": 149
},
{
"epoch": 0.16375545851528384,
"grad_norm": 1.118643797208063,
"learning_rate": 1.9814632840943728e-05,
"loss": 1.0023,
"step": 150
},
{
"epoch": 0.1648471615720524,
"grad_norm": 1.2142861082842908,
"learning_rate": 1.981125789960552e-05,
"loss": 0.9428,
"step": 151
},
{
"epoch": 0.16593886462882096,
"grad_norm": 1.2002950376492378,
"learning_rate": 1.9807852804032306e-05,
"loss": 0.9224,
"step": 152
},
{
"epoch": 0.16703056768558952,
"grad_norm": 1.1653737685719343,
"learning_rate": 1.9804417564689405e-05,
"loss": 0.8864,
"step": 153
},
{
"epoch": 0.16812227074235808,
"grad_norm": 1.1850492851012728,
"learning_rate": 1.98009521921348e-05,
"loss": 0.9356,
"step": 154
},
{
"epoch": 0.16921397379912664,
"grad_norm": 1.182698281481592,
"learning_rate": 1.979745669701907e-05,
"loss": 0.8862,
"step": 155
},
{
"epoch": 0.1703056768558952,
"grad_norm": 1.1180078893407057,
"learning_rate": 1.9793931090085385e-05,
"loss": 0.8888,
"step": 156
},
{
"epoch": 0.17139737991266377,
"grad_norm": 1.142716433760269,
"learning_rate": 1.979037538216946e-05,
"loss": 0.9084,
"step": 157
},
{
"epoch": 0.17248908296943233,
"grad_norm": 1.1171304481378381,
"learning_rate": 1.9786789584199523e-05,
"loss": 0.8867,
"step": 158
},
{
"epoch": 0.17358078602620086,
"grad_norm": 1.1659904010027182,
"learning_rate": 1.9783173707196278e-05,
"loss": 0.9069,
"step": 159
},
{
"epoch": 0.17467248908296942,
"grad_norm": 1.045365083133747,
"learning_rate": 1.9779527762272877e-05,
"loss": 0.9092,
"step": 160
},
{
"epoch": 0.17576419213973798,
"grad_norm": 1.1201793382666256,
"learning_rate": 1.9775851760634886e-05,
"loss": 0.9424,
"step": 161
},
{
"epoch": 0.17685589519650655,
"grad_norm": 1.0677966150679934,
"learning_rate": 1.977214571358025e-05,
"loss": 0.9442,
"step": 162
},
{
"epoch": 0.1779475982532751,
"grad_norm": 1.0719807152314682,
"learning_rate": 1.9768409632499244e-05,
"loss": 0.8923,
"step": 163
},
{
"epoch": 0.17903930131004367,
"grad_norm": 1.0694626417364073,
"learning_rate": 1.976464352887447e-05,
"loss": 0.8583,
"step": 164
},
{
"epoch": 0.18013100436681223,
"grad_norm": 1.1275814819553525,
"learning_rate": 1.9760847414280783e-05,
"loss": 0.9339,
"step": 165
},
{
"epoch": 0.1812227074235808,
"grad_norm": 1.0935826775961888,
"learning_rate": 1.9757021300385288e-05,
"loss": 0.8894,
"step": 166
},
{
"epoch": 0.18231441048034935,
"grad_norm": 1.1136557849424122,
"learning_rate": 1.9753165198947284e-05,
"loss": 0.9035,
"step": 167
},
{
"epoch": 0.18340611353711792,
"grad_norm": 1.1716854599828845,
"learning_rate": 1.9749279121818235e-05,
"loss": 0.9004,
"step": 168
},
{
"epoch": 0.18449781659388648,
"grad_norm": 1.0838952277900018,
"learning_rate": 1.9745363080941745e-05,
"loss": 0.9016,
"step": 169
},
{
"epoch": 0.185589519650655,
"grad_norm": 1.1593918034011053,
"learning_rate": 1.974141708835349e-05,
"loss": 0.9122,
"step": 170
},
{
"epoch": 0.18668122270742357,
"grad_norm": 1.1432705462896862,
"learning_rate": 1.973744115618121e-05,
"loss": 0.8681,
"step": 171
},
{
"epoch": 0.18777292576419213,
"grad_norm": 1.073066538159113,
"learning_rate": 1.973343529664467e-05,
"loss": 0.8685,
"step": 172
},
{
"epoch": 0.1888646288209607,
"grad_norm": 1.139229770536511,
"learning_rate": 1.9729399522055603e-05,
"loss": 0.8824,
"step": 173
},
{
"epoch": 0.18995633187772926,
"grad_norm": 1.103073711802851,
"learning_rate": 1.9725333844817688e-05,
"loss": 0.8713,
"step": 174
},
{
"epoch": 0.19104803493449782,
"grad_norm": 1.1096932570978872,
"learning_rate": 1.972123827742651e-05,
"loss": 0.9336,
"step": 175
},
{
"epoch": 0.19213973799126638,
"grad_norm": 1.1372987569252102,
"learning_rate": 1.971711283246951e-05,
"loss": 0.8666,
"step": 176
},
{
"epoch": 0.19323144104803494,
"grad_norm": 1.2409826348380575,
"learning_rate": 1.9712957522625974e-05,
"loss": 0.945,
"step": 177
},
{
"epoch": 0.1943231441048035,
"grad_norm": 1.0864161507998062,
"learning_rate": 1.9708772360666958e-05,
"loss": 0.9041,
"step": 178
},
{
"epoch": 0.19541484716157206,
"grad_norm": 1.0687833462949532,
"learning_rate": 1.970455735945527e-05,
"loss": 0.9002,
"step": 179
},
{
"epoch": 0.1965065502183406,
"grad_norm": 1.1122888170914484,
"learning_rate": 1.9700312531945444e-05,
"loss": 0.9068,
"step": 180
},
{
"epoch": 0.19759825327510916,
"grad_norm": 1.0671231307410538,
"learning_rate": 1.9696037891183652e-05,
"loss": 0.8582,
"step": 181
},
{
"epoch": 0.19868995633187772,
"grad_norm": 1.231978886418304,
"learning_rate": 1.9691733450307723e-05,
"loss": 0.8718,
"step": 182
},
{
"epoch": 0.19978165938864628,
"grad_norm": 1.0513585608675498,
"learning_rate": 1.968739922254706e-05,
"loss": 0.8678,
"step": 183
},
{
"epoch": 0.20087336244541484,
"grad_norm": 1.1004131801370571,
"learning_rate": 1.9683035221222617e-05,
"loss": 0.882,
"step": 184
},
{
"epoch": 0.2019650655021834,
"grad_norm": 1.1119022799292562,
"learning_rate": 1.9678641459746858e-05,
"loss": 0.8375,
"step": 185
},
{
"epoch": 0.20305676855895197,
"grad_norm": 1.2828585823405474,
"learning_rate": 1.967421795162371e-05,
"loss": 0.9672,
"step": 186
},
{
"epoch": 0.20414847161572053,
"grad_norm": 1.0960032312397867,
"learning_rate": 1.9669764710448523e-05,
"loss": 0.826,
"step": 187
},
{
"epoch": 0.2052401746724891,
"grad_norm": 1.136561558965275,
"learning_rate": 1.9665281749908034e-05,
"loss": 0.8788,
"step": 188
},
{
"epoch": 0.20633187772925765,
"grad_norm": 1.0573043883882423,
"learning_rate": 1.966076908378032e-05,
"loss": 0.8727,
"step": 189
},
{
"epoch": 0.2074235807860262,
"grad_norm": 1.0822840901766584,
"learning_rate": 1.9656226725934745e-05,
"loss": 0.8898,
"step": 190
},
{
"epoch": 0.20851528384279475,
"grad_norm": 1.1120619489254118,
"learning_rate": 1.9651654690331945e-05,
"loss": 0.8518,
"step": 191
},
{
"epoch": 0.2096069868995633,
"grad_norm": 1.0468864059258323,
"learning_rate": 1.964705299102376e-05,
"loss": 0.8557,
"step": 192
},
{
"epoch": 0.21069868995633187,
"grad_norm": 1.4278927628034137,
"learning_rate": 1.96424216421532e-05,
"loss": 0.8696,
"step": 193
},
{
"epoch": 0.21179039301310043,
"grad_norm": 1.147251174802579,
"learning_rate": 1.96377606579544e-05,
"loss": 0.89,
"step": 194
},
{
"epoch": 0.212882096069869,
"grad_norm": 1.1186032822531382,
"learning_rate": 1.963307005275258e-05,
"loss": 0.881,
"step": 195
},
{
"epoch": 0.21397379912663755,
"grad_norm": 1.0832284053417196,
"learning_rate": 1.9628349840963997e-05,
"loss": 0.9257,
"step": 196
},
{
"epoch": 0.21506550218340612,
"grad_norm": 1.116941116046061,
"learning_rate": 1.96236000370959e-05,
"loss": 0.8976,
"step": 197
},
{
"epoch": 0.21615720524017468,
"grad_norm": 1.023005979642442,
"learning_rate": 1.9618820655746488e-05,
"loss": 0.8532,
"step": 198
},
{
"epoch": 0.21724890829694324,
"grad_norm": 1.1200494064955775,
"learning_rate": 1.9614011711604863e-05,
"loss": 0.9159,
"step": 199
},
{
"epoch": 0.2183406113537118,
"grad_norm": 1.0966895099156009,
"learning_rate": 1.9609173219450998e-05,
"loss": 0.8763,
"step": 200
},
{
"epoch": 0.21943231441048036,
"grad_norm": 1.1424389070902334,
"learning_rate": 1.960430519415566e-05,
"loss": 0.9287,
"step": 201
},
{
"epoch": 0.2205240174672489,
"grad_norm": 1.1544455317485216,
"learning_rate": 1.9599407650680397e-05,
"loss": 0.9619,
"step": 202
},
{
"epoch": 0.22161572052401746,
"grad_norm": 1.0466089924901163,
"learning_rate": 1.959448060407748e-05,
"loss": 0.9045,
"step": 203
},
{
"epoch": 0.22270742358078602,
"grad_norm": 1.0824896062471592,
"learning_rate": 1.958952406948985e-05,
"loss": 0.9132,
"step": 204
},
{
"epoch": 0.22379912663755458,
"grad_norm": 1.1093467073603955,
"learning_rate": 1.9584538062151076e-05,
"loss": 0.8482,
"step": 205
},
{
"epoch": 0.22489082969432314,
"grad_norm": 1.0619679279007732,
"learning_rate": 1.9579522597385315e-05,
"loss": 0.9068,
"step": 206
},
{
"epoch": 0.2259825327510917,
"grad_norm": 4.38385984663046,
"learning_rate": 1.957447769060726e-05,
"loss": 0.9276,
"step": 207
},
{
"epoch": 0.22707423580786026,
"grad_norm": 1.2265776059461564,
"learning_rate": 1.956940335732209e-05,
"loss": 0.92,
"step": 208
},
{
"epoch": 0.22816593886462883,
"grad_norm": 1.132581284506294,
"learning_rate": 1.956429961312542e-05,
"loss": 0.8389,
"step": 209
},
{
"epoch": 0.2292576419213974,
"grad_norm": 1.109888606752121,
"learning_rate": 1.9559166473703265e-05,
"loss": 0.8816,
"step": 210
},
{
"epoch": 0.23034934497816595,
"grad_norm": 1.2201116053725314,
"learning_rate": 1.9554003954831975e-05,
"loss": 0.9209,
"step": 211
},
{
"epoch": 0.2314410480349345,
"grad_norm": 1.039809201288955,
"learning_rate": 1.9548812072378208e-05,
"loss": 0.8509,
"step": 212
},
{
"epoch": 0.23253275109170304,
"grad_norm": 1.1486993213225967,
"learning_rate": 1.9543590842298856e-05,
"loss": 0.916,
"step": 213
},
{
"epoch": 0.2336244541484716,
"grad_norm": 1.1166716250519264,
"learning_rate": 1.9538340280641018e-05,
"loss": 0.9079,
"step": 214
},
{
"epoch": 0.23471615720524017,
"grad_norm": 1.0901968030731561,
"learning_rate": 1.9533060403541937e-05,
"loss": 0.9141,
"step": 215
},
{
"epoch": 0.23580786026200873,
"grad_norm": 1.2117539613654056,
"learning_rate": 1.9527751227228964e-05,
"loss": 0.8885,
"step": 216
},
{
"epoch": 0.2368995633187773,
"grad_norm": 1.0732394066038071,
"learning_rate": 1.9522412768019485e-05,
"loss": 0.825,
"step": 217
},
{
"epoch": 0.23799126637554585,
"grad_norm": 1.1061545688900685,
"learning_rate": 1.9517045042320893e-05,
"loss": 0.8777,
"step": 218
},
{
"epoch": 0.2390829694323144,
"grad_norm": 1.1462666937584411,
"learning_rate": 1.9511648066630528e-05,
"loss": 0.9296,
"step": 219
},
{
"epoch": 0.24017467248908297,
"grad_norm": 1.0583366131175636,
"learning_rate": 1.950622185753563e-05,
"loss": 0.8803,
"step": 220
},
{
"epoch": 0.24126637554585154,
"grad_norm": 1.234586850245016,
"learning_rate": 1.9500766431713284e-05,
"loss": 0.9204,
"step": 221
},
{
"epoch": 0.2423580786026201,
"grad_norm": 1.1576185571814945,
"learning_rate": 1.949528180593037e-05,
"loss": 0.8806,
"step": 222
},
{
"epoch": 0.24344978165938866,
"grad_norm": 1.1061637979478478,
"learning_rate": 1.9489767997043513e-05,
"loss": 0.9004,
"step": 223
},
{
"epoch": 0.2445414847161572,
"grad_norm": 1.1137629138437286,
"learning_rate": 1.9484225021999032e-05,
"loss": 0.9081,
"step": 224
},
{
"epoch": 0.24563318777292575,
"grad_norm": 1.0960347699600748,
"learning_rate": 1.947865289783288e-05,
"loss": 0.9296,
"step": 225
},
{
"epoch": 0.24672489082969432,
"grad_norm": 1.039350963491418,
"learning_rate": 1.9473051641670606e-05,
"loss": 0.8883,
"step": 226
},
{
"epoch": 0.24781659388646288,
"grad_norm": 1.0866918799532765,
"learning_rate": 1.9467421270727292e-05,
"loss": 0.9159,
"step": 227
},
{
"epoch": 0.24890829694323144,
"grad_norm": 1.093705184952943,
"learning_rate": 1.9461761802307494e-05,
"loss": 0.9412,
"step": 228
},
{
"epoch": 0.25,
"grad_norm": 1.2947600816095237,
"learning_rate": 1.9456073253805214e-05,
"loss": 0.8916,
"step": 229
},
{
"epoch": 0.25109170305676853,
"grad_norm": 1.1659459607951335,
"learning_rate": 1.9450355642703812e-05,
"loss": 0.8984,
"step": 230
},
{
"epoch": 0.2521834061135371,
"grad_norm": 1.0686347894819066,
"learning_rate": 1.9444608986575983e-05,
"loss": 0.8169,
"step": 231
},
{
"epoch": 0.25327510917030566,
"grad_norm": 1.081412554616493,
"learning_rate": 1.9438833303083677e-05,
"loss": 0.8356,
"step": 232
},
{
"epoch": 0.25436681222707425,
"grad_norm": 1.1259063624208197,
"learning_rate": 1.943302860997807e-05,
"loss": 0.8907,
"step": 233
},
{
"epoch": 0.2554585152838428,
"grad_norm": 3.913366519387283,
"learning_rate": 1.9427194925099494e-05,
"loss": 0.8454,
"step": 234
},
{
"epoch": 0.25655021834061137,
"grad_norm": 1.2143857617378389,
"learning_rate": 1.942133226637738e-05,
"loss": 0.8486,
"step": 235
},
{
"epoch": 0.2576419213973799,
"grad_norm": 1.1627640633320033,
"learning_rate": 1.941544065183021e-05,
"loss": 0.8935,
"step": 236
},
{
"epoch": 0.2587336244541485,
"grad_norm": 1.0628171650504668,
"learning_rate": 1.9409520099565463e-05,
"loss": 0.9077,
"step": 237
},
{
"epoch": 0.259825327510917,
"grad_norm": 1.218465436130322,
"learning_rate": 1.940357062777956e-05,
"loss": 0.8603,
"step": 238
},
{
"epoch": 0.2609170305676856,
"grad_norm": 1.1387894450341074,
"learning_rate": 1.939759225475779e-05,
"loss": 0.8989,
"step": 239
},
{
"epoch": 0.26200873362445415,
"grad_norm": 1.1726845697998067,
"learning_rate": 1.939158499887428e-05,
"loss": 0.8979,
"step": 240
},
{
"epoch": 0.2631004366812227,
"grad_norm": 1.119987541750633,
"learning_rate": 1.9385548878591925e-05,
"loss": 0.8676,
"step": 241
},
{
"epoch": 0.26419213973799127,
"grad_norm": 1.1196661256503484,
"learning_rate": 1.9379483912462326e-05,
"loss": 0.8699,
"step": 242
},
{
"epoch": 0.2652838427947598,
"grad_norm": 1.0649434577941834,
"learning_rate": 1.937339011912575e-05,
"loss": 0.8744,
"step": 243
},
{
"epoch": 0.2663755458515284,
"grad_norm": 1.1304833471428484,
"learning_rate": 1.9367267517311057e-05,
"loss": 0.9176,
"step": 244
},
{
"epoch": 0.26746724890829693,
"grad_norm": 1.0999222694926711,
"learning_rate": 1.9361116125835645e-05,
"loss": 0.9084,
"step": 245
},
{
"epoch": 0.2685589519650655,
"grad_norm": 1.1117779685873486,
"learning_rate": 1.9354935963605395e-05,
"loss": 0.9227,
"step": 246
},
{
"epoch": 0.26965065502183405,
"grad_norm": 1.0980143373141853,
"learning_rate": 1.9348727049614623e-05,
"loss": 0.8546,
"step": 247
},
{
"epoch": 0.27074235807860264,
"grad_norm": 1.044338143636063,
"learning_rate": 1.9342489402945997e-05,
"loss": 0.8226,
"step": 248
},
{
"epoch": 0.2718340611353712,
"grad_norm": 1.024743860918796,
"learning_rate": 1.933622304277051e-05,
"loss": 0.908,
"step": 249
},
{
"epoch": 0.27292576419213976,
"grad_norm": 1.0175454403113442,
"learning_rate": 1.932992798834739e-05,
"loss": 0.8619,
"step": 250
},
{
"epoch": 0.2740174672489083,
"grad_norm": 1.0517054850354142,
"learning_rate": 1.9323604259024058e-05,
"loss": 0.8951,
"step": 251
},
{
"epoch": 0.27510917030567683,
"grad_norm": 1.0893504511321779,
"learning_rate": 1.9317251874236066e-05,
"loss": 0.9239,
"step": 252
},
{
"epoch": 0.2762008733624454,
"grad_norm": 1.0864146357207372,
"learning_rate": 1.9310870853507043e-05,
"loss": 0.8384,
"step": 253
},
{
"epoch": 0.27729257641921395,
"grad_norm": 1.0941739834235782,
"learning_rate": 1.9304461216448612e-05,
"loss": 0.8796,
"step": 254
},
{
"epoch": 0.27838427947598254,
"grad_norm": 1.0936293887799418,
"learning_rate": 1.929802298276037e-05,
"loss": 0.86,
"step": 255
},
{
"epoch": 0.2794759825327511,
"grad_norm": 1.0722380682685169,
"learning_rate": 1.9291556172229784e-05,
"loss": 0.8999,
"step": 256
},
{
"epoch": 0.28056768558951967,
"grad_norm": 1.0187008687239898,
"learning_rate": 1.928506080473216e-05,
"loss": 0.8493,
"step": 257
},
{
"epoch": 0.2816593886462882,
"grad_norm": 1.0626955444528252,
"learning_rate": 1.9278536900230564e-05,
"loss": 0.8672,
"step": 258
},
{
"epoch": 0.2827510917030568,
"grad_norm": 1.1529872565057653,
"learning_rate": 1.9271984478775776e-05,
"loss": 0.9069,
"step": 259
},
{
"epoch": 0.2838427947598253,
"grad_norm": 1.021822512630616,
"learning_rate": 1.9265403560506223e-05,
"loss": 0.8016,
"step": 260
},
{
"epoch": 0.2849344978165939,
"grad_norm": 1.077071455011887,
"learning_rate": 1.9258794165647904e-05,
"loss": 0.922,
"step": 261
},
{
"epoch": 0.28602620087336245,
"grad_norm": 1.02625158220979,
"learning_rate": 1.9252156314514353e-05,
"loss": 0.8756,
"step": 262
},
{
"epoch": 0.287117903930131,
"grad_norm": 1.071479499275788,
"learning_rate": 1.9245490027506544e-05,
"loss": 0.8883,
"step": 263
},
{
"epoch": 0.28820960698689957,
"grad_norm": 1.0311583545444096,
"learning_rate": 1.9238795325112867e-05,
"loss": 0.8505,
"step": 264
},
{
"epoch": 0.2893013100436681,
"grad_norm": 1.0397037870150452,
"learning_rate": 1.9232072227909033e-05,
"loss": 0.8736,
"step": 265
},
{
"epoch": 0.2903930131004367,
"grad_norm": 1.0810821707671576,
"learning_rate": 1.9225320756558023e-05,
"loss": 0.8578,
"step": 266
},
{
"epoch": 0.2914847161572052,
"grad_norm": 1.0344879510590226,
"learning_rate": 1.9218540931810027e-05,
"loss": 0.8613,
"step": 267
},
{
"epoch": 0.2925764192139738,
"grad_norm": 1.0718787461179933,
"learning_rate": 1.9211732774502372e-05,
"loss": 0.8708,
"step": 268
},
{
"epoch": 0.29366812227074235,
"grad_norm": 1.079544308085131,
"learning_rate": 1.9204896305559474e-05,
"loss": 0.8676,
"step": 269
},
{
"epoch": 0.29475982532751094,
"grad_norm": 1.0270429568112234,
"learning_rate": 1.919803154599275e-05,
"loss": 0.8731,
"step": 270
},
{
"epoch": 0.29585152838427947,
"grad_norm": 1.0100906029409351,
"learning_rate": 1.919113851690058e-05,
"loss": 0.8821,
"step": 271
},
{
"epoch": 0.29694323144104806,
"grad_norm": 1.5042233737791588,
"learning_rate": 1.9184217239468213e-05,
"loss": 0.9698,
"step": 272
},
{
"epoch": 0.2980349344978166,
"grad_norm": 1.0859546180077055,
"learning_rate": 1.9177267734967727e-05,
"loss": 0.8897,
"step": 273
},
{
"epoch": 0.29912663755458513,
"grad_norm": 1.025542463223278,
"learning_rate": 1.9170290024757958e-05,
"loss": 0.8476,
"step": 274
},
{
"epoch": 0.3002183406113537,
"grad_norm": 1.0993519364312672,
"learning_rate": 1.9163284130284417e-05,
"loss": 0.9075,
"step": 275
},
{
"epoch": 0.30131004366812225,
"grad_norm": 1.0927829067402064,
"learning_rate": 1.915625007307925e-05,
"loss": 0.8239,
"step": 276
},
{
"epoch": 0.30240174672489084,
"grad_norm": 1.1243233455921855,
"learning_rate": 1.914918787476115e-05,
"loss": 0.8297,
"step": 277
},
{
"epoch": 0.3034934497816594,
"grad_norm": 1.0879664494442347,
"learning_rate": 1.914209755703531e-05,
"loss": 0.8871,
"step": 278
},
{
"epoch": 0.30458515283842796,
"grad_norm": 1.094207883593622,
"learning_rate": 1.9134979141693333e-05,
"loss": 0.8862,
"step": 279
},
{
"epoch": 0.3056768558951965,
"grad_norm": 1.1171396644422387,
"learning_rate": 1.912783265061319e-05,
"loss": 0.8719,
"step": 280
},
{
"epoch": 0.3067685589519651,
"grad_norm": 1.080936511227442,
"learning_rate": 1.9120658105759138e-05,
"loss": 0.8944,
"step": 281
},
{
"epoch": 0.3078602620087336,
"grad_norm": 1.1204710720033177,
"learning_rate": 1.9113455529181645e-05,
"loss": 0.8525,
"step": 282
},
{
"epoch": 0.3089519650655022,
"grad_norm": 1.0747416230222755,
"learning_rate": 1.9106224943017355e-05,
"loss": 0.885,
"step": 283
},
{
"epoch": 0.31004366812227074,
"grad_norm": 1.1007595003526711,
"learning_rate": 1.9098966369488967e-05,
"loss": 0.8806,
"step": 284
},
{
"epoch": 0.3111353711790393,
"grad_norm": 1.1272704353445784,
"learning_rate": 1.9091679830905225e-05,
"loss": 0.8561,
"step": 285
},
{
"epoch": 0.31222707423580787,
"grad_norm": 1.047445304068462,
"learning_rate": 1.908436534966081e-05,
"loss": 0.9214,
"step": 286
},
{
"epoch": 0.3133187772925764,
"grad_norm": 1.1850281966860399,
"learning_rate": 1.907702294823628e-05,
"loss": 0.9584,
"step": 287
},
{
"epoch": 0.314410480349345,
"grad_norm": 1.057267707952885,
"learning_rate": 1.9069652649198004e-05,
"loss": 0.8708,
"step": 288
},
{
"epoch": 0.3155021834061135,
"grad_norm": 1.0491354134226445,
"learning_rate": 1.9062254475198107e-05,
"loss": 0.8429,
"step": 289
},
{
"epoch": 0.3165938864628821,
"grad_norm": 1.071884430813601,
"learning_rate": 1.9054828448974363e-05,
"loss": 0.8839,
"step": 290
},
{
"epoch": 0.31768558951965065,
"grad_norm": 1.0139303388937126,
"learning_rate": 1.9047374593350166e-05,
"loss": 0.8614,
"step": 291
},
{
"epoch": 0.31877729257641924,
"grad_norm": 1.0305651242570724,
"learning_rate": 1.9039892931234434e-05,
"loss": 0.8824,
"step": 292
},
{
"epoch": 0.31986899563318777,
"grad_norm": 1.0507521878832264,
"learning_rate": 1.9032383485621547e-05,
"loss": 0.8182,
"step": 293
},
{
"epoch": 0.32096069868995636,
"grad_norm": 1.0217727036915132,
"learning_rate": 1.9024846279591275e-05,
"loss": 0.8643,
"step": 294
},
{
"epoch": 0.3220524017467249,
"grad_norm": 1.157334719383855,
"learning_rate": 1.901728133630871e-05,
"loss": 0.8379,
"step": 295
},
{
"epoch": 0.3231441048034934,
"grad_norm": 1.0997360316521576,
"learning_rate": 1.900968867902419e-05,
"loss": 0.864,
"step": 296
},
{
"epoch": 0.324235807860262,
"grad_norm": 1.1108332410646824,
"learning_rate": 1.9002068331073237e-05,
"loss": 0.9096,
"step": 297
},
{
"epoch": 0.32532751091703055,
"grad_norm": 1.0926477915837476,
"learning_rate": 1.899442031587647e-05,
"loss": 0.8797,
"step": 298
},
{
"epoch": 0.32641921397379914,
"grad_norm": 1.1002227344486895,
"learning_rate": 1.898674465693954e-05,
"loss": 0.8819,
"step": 299
},
{
"epoch": 0.32751091703056767,
"grad_norm": 1.0841231073305384,
"learning_rate": 1.8979041377853068e-05,
"loss": 0.8634,
"step": 300
},
{
"epoch": 0.32860262008733626,
"grad_norm": 1.0942944122131784,
"learning_rate": 1.897131050229256e-05,
"loss": 0.9292,
"step": 301
},
{
"epoch": 0.3296943231441048,
"grad_norm": 1.0715243516004385,
"learning_rate": 1.8963552054018335e-05,
"loss": 0.8716,
"step": 302
},
{
"epoch": 0.3307860262008734,
"grad_norm": 0.9927779537186682,
"learning_rate": 1.8955766056875456e-05,
"loss": 0.8362,
"step": 303
},
{
"epoch": 0.3318777292576419,
"grad_norm": 1.0640155771936333,
"learning_rate": 1.8947952534793663e-05,
"loss": 0.8649,
"step": 304
},
{
"epoch": 0.3329694323144105,
"grad_norm": 1.053838113711653,
"learning_rate": 1.8940111511787277e-05,
"loss": 0.867,
"step": 305
},
{
"epoch": 0.33406113537117904,
"grad_norm": 1.006526089115897,
"learning_rate": 1.8932243011955154e-05,
"loss": 0.9006,
"step": 306
},
{
"epoch": 0.3351528384279476,
"grad_norm": 1.0728197830124255,
"learning_rate": 1.8924347059480595e-05,
"loss": 0.8567,
"step": 307
},
{
"epoch": 0.33624454148471616,
"grad_norm": 1.05344829703244,
"learning_rate": 1.891642367863127e-05,
"loss": 0.8772,
"step": 308
},
{
"epoch": 0.3373362445414847,
"grad_norm": 1.0645124448972854,
"learning_rate": 1.890847289375916e-05,
"loss": 0.9114,
"step": 309
},
{
"epoch": 0.3384279475982533,
"grad_norm": 1.0747344407375048,
"learning_rate": 1.8900494729300453e-05,
"loss": 0.8583,
"step": 310
},
{
"epoch": 0.3395196506550218,
"grad_norm": 1.0290709872101504,
"learning_rate": 1.88924892097755e-05,
"loss": 0.8742,
"step": 311
},
{
"epoch": 0.3406113537117904,
"grad_norm": 1.0708426763000087,
"learning_rate": 1.8884456359788725e-05,
"loss": 0.8905,
"step": 312
},
{
"epoch": 0.34170305676855894,
"grad_norm": 1.1036268372797233,
"learning_rate": 1.8876396204028543e-05,
"loss": 0.8412,
"step": 313
},
{
"epoch": 0.34279475982532753,
"grad_norm": 1.682452908084444,
"learning_rate": 1.8868308767267294e-05,
"loss": 0.9803,
"step": 314
},
{
"epoch": 0.34388646288209607,
"grad_norm": 1.0618935535379088,
"learning_rate": 1.8860194074361168e-05,
"loss": 0.8668,
"step": 315
},
{
"epoch": 0.34497816593886466,
"grad_norm": 1.2769841836547489,
"learning_rate": 1.8852052150250123e-05,
"loss": 0.8677,
"step": 316
},
{
"epoch": 0.3460698689956332,
"grad_norm": 1.0224935661717929,
"learning_rate": 1.884388301995781e-05,
"loss": 0.8243,
"step": 317
},
{
"epoch": 0.3471615720524017,
"grad_norm": 1.0306490619845388,
"learning_rate": 1.8835686708591495e-05,
"loss": 0.8013,
"step": 318
},
{
"epoch": 0.3482532751091703,
"grad_norm": 1.0625559163468348,
"learning_rate": 1.882746324134199e-05,
"loss": 0.8922,
"step": 319
},
{
"epoch": 0.34934497816593885,
"grad_norm": 1.1094178005123845,
"learning_rate": 1.881921264348355e-05,
"loss": 0.875,
"step": 320
},
{
"epoch": 0.35043668122270744,
"grad_norm": 1.019854115964412,
"learning_rate": 1.8810934940373843e-05,
"loss": 0.8288,
"step": 321
},
{
"epoch": 0.35152838427947597,
"grad_norm": 1.129491723926347,
"learning_rate": 1.8802630157453817e-05,
"loss": 0.8893,
"step": 322
},
{
"epoch": 0.35262008733624456,
"grad_norm": 2.407984766571855,
"learning_rate": 1.8794298320247665e-05,
"loss": 0.8996,
"step": 323
},
{
"epoch": 0.3537117903930131,
"grad_norm": 1.0668869542446686,
"learning_rate": 1.878593945436272e-05,
"loss": 0.8829,
"step": 324
},
{
"epoch": 0.3548034934497817,
"grad_norm": 1.027197909624692,
"learning_rate": 1.8777553585489386e-05,
"loss": 0.8869,
"step": 325
},
{
"epoch": 0.3558951965065502,
"grad_norm": 1.1180374790635763,
"learning_rate": 1.8769140739401063e-05,
"loss": 0.8918,
"step": 326
},
{
"epoch": 0.3569868995633188,
"grad_norm": 1.0468999001715216,
"learning_rate": 1.8760700941954066e-05,
"loss": 0.8623,
"step": 327
},
{
"epoch": 0.35807860262008734,
"grad_norm": 1.0603483002623066,
"learning_rate": 1.8752234219087538e-05,
"loss": 0.8979,
"step": 328
},
{
"epoch": 0.35917030567685587,
"grad_norm": 1.0943787564656549,
"learning_rate": 1.8743740596823373e-05,
"loss": 0.889,
"step": 329
},
{
"epoch": 0.36026200873362446,
"grad_norm": 1.040251056957098,
"learning_rate": 1.873522010126615e-05,
"loss": 0.8615,
"step": 330
},
{
"epoch": 0.361353711790393,
"grad_norm": 1.0252415932639054,
"learning_rate": 1.8726672758603028e-05,
"loss": 0.822,
"step": 331
},
{
"epoch": 0.3624454148471616,
"grad_norm": 1.0498064378341836,
"learning_rate": 1.871809859510368e-05,
"loss": 0.876,
"step": 332
},
{
"epoch": 0.3635371179039301,
"grad_norm": 3.1234497627187476,
"learning_rate": 1.8709497637120222e-05,
"loss": 1.0257,
"step": 333
},
{
"epoch": 0.3646288209606987,
"grad_norm": 1.1941436771593972,
"learning_rate": 1.8700869911087115e-05,
"loss": 0.9056,
"step": 334
},
{
"epoch": 0.36572052401746724,
"grad_norm": 1.0664111934492433,
"learning_rate": 1.8692215443521086e-05,
"loss": 0.8953,
"step": 335
},
{
"epoch": 0.36681222707423583,
"grad_norm": 3.941553612022084,
"learning_rate": 1.8683534261021058e-05,
"loss": 0.9104,
"step": 336
},
{
"epoch": 0.36790393013100436,
"grad_norm": 1.1169531095255332,
"learning_rate": 1.867482639026805e-05,
"loss": 0.9231,
"step": 337
},
{
"epoch": 0.36899563318777295,
"grad_norm": 1.0345131327197774,
"learning_rate": 1.8666091858025113e-05,
"loss": 0.8831,
"step": 338
},
{
"epoch": 0.3700873362445415,
"grad_norm": 1.0197343975131627,
"learning_rate": 1.865733069113724e-05,
"loss": 0.8138,
"step": 339
},
{
"epoch": 0.37117903930131,
"grad_norm": 1.1099897380567005,
"learning_rate": 1.8648542916531283e-05,
"loss": 0.8628,
"step": 340
},
{
"epoch": 0.3722707423580786,
"grad_norm": 1.1244141351963375,
"learning_rate": 1.863972856121587e-05,
"loss": 0.8417,
"step": 341
},
{
"epoch": 0.37336244541484714,
"grad_norm": 1.0138120177828884,
"learning_rate": 1.8630887652281325e-05,
"loss": 0.8069,
"step": 342
},
{
"epoch": 0.37445414847161573,
"grad_norm": 1.0651674776026643,
"learning_rate": 1.8622020216899578e-05,
"loss": 0.8388,
"step": 343
},
{
"epoch": 0.37554585152838427,
"grad_norm": 1.0324245520605353,
"learning_rate": 1.8613126282324092e-05,
"loss": 0.8435,
"step": 344
},
{
"epoch": 0.37663755458515286,
"grad_norm": 1.0564097917077362,
"learning_rate": 1.860420587588977e-05,
"loss": 0.8118,
"step": 345
},
{
"epoch": 0.3777292576419214,
"grad_norm": 1.1426965106863376,
"learning_rate": 1.859525902501288e-05,
"loss": 0.8844,
"step": 346
},
{
"epoch": 0.37882096069869,
"grad_norm": 1.0998660035228724,
"learning_rate": 1.8586285757190952e-05,
"loss": 0.861,
"step": 347
},
{
"epoch": 0.3799126637554585,
"grad_norm": 1.0891194187264164,
"learning_rate": 1.8577286100002723e-05,
"loss": 0.8693,
"step": 348
},
{
"epoch": 0.38100436681222705,
"grad_norm": 1.0445800669656011,
"learning_rate": 1.8568260081108026e-05,
"loss": 0.8549,
"step": 349
},
{
"epoch": 0.38209606986899564,
"grad_norm": 1.0095555737733146,
"learning_rate": 1.8559207728247716e-05,
"loss": 0.8372,
"step": 350
},
{
"epoch": 0.38318777292576417,
"grad_norm": 1.0782684914388916,
"learning_rate": 1.8550129069243585e-05,
"loss": 0.905,
"step": 351
},
{
"epoch": 0.38427947598253276,
"grad_norm": 1.0732992425493888,
"learning_rate": 1.8541024131998277e-05,
"loss": 0.8223,
"step": 352
},
{
"epoch": 0.3853711790393013,
"grad_norm": 1.0895010265352918,
"learning_rate": 1.8531892944495197e-05,
"loss": 0.8398,
"step": 353
},
{
"epoch": 0.3864628820960699,
"grad_norm": 1.0360428026938602,
"learning_rate": 1.852273553479843e-05,
"loss": 0.877,
"step": 354
},
{
"epoch": 0.3875545851528384,
"grad_norm": 1.962940812783276,
"learning_rate": 1.8513551931052654e-05,
"loss": 0.9007,
"step": 355
},
{
"epoch": 0.388646288209607,
"grad_norm": 1.1920882816536065,
"learning_rate": 1.850434216148305e-05,
"loss": 0.9108,
"step": 356
},
{
"epoch": 0.38973799126637554,
"grad_norm": 1.0752529630656973,
"learning_rate": 1.8495106254395217e-05,
"loss": 0.8958,
"step": 357
},
{
"epoch": 0.39082969432314413,
"grad_norm": 1.1468554032302092,
"learning_rate": 1.8485844238175096e-05,
"loss": 0.8918,
"step": 358
},
{
"epoch": 0.39192139737991266,
"grad_norm": 1.0162198739363895,
"learning_rate": 1.8476556141288858e-05,
"loss": 0.8029,
"step": 359
},
{
"epoch": 0.3930131004366812,
"grad_norm": 1.0071775391810776,
"learning_rate": 1.8467241992282842e-05,
"loss": 0.7979,
"step": 360
},
{
"epoch": 0.3941048034934498,
"grad_norm": 1.2228374262090314,
"learning_rate": 1.845790181978345e-05,
"loss": 0.8762,
"step": 361
},
{
"epoch": 0.3951965065502183,
"grad_norm": 1.0304357048780401,
"learning_rate": 1.8448535652497073e-05,
"loss": 0.8561,
"step": 362
},
{
"epoch": 0.3962882096069869,
"grad_norm": 1.1760020019475943,
"learning_rate": 1.8439143519209982e-05,
"loss": 0.881,
"step": 363
},
{
"epoch": 0.39737991266375544,
"grad_norm": 1.0672221787016125,
"learning_rate": 1.8429725448788267e-05,
"loss": 0.8421,
"step": 364
},
{
"epoch": 0.39847161572052403,
"grad_norm": 1.0929075918275972,
"learning_rate": 1.8420281470177728e-05,
"loss": 0.9352,
"step": 365
},
{
"epoch": 0.39956331877729256,
"grad_norm": 1.1162701706741684,
"learning_rate": 1.841081161240379e-05,
"loss": 0.907,
"step": 366
},
{
"epoch": 0.40065502183406115,
"grad_norm": 1.1076237739075554,
"learning_rate": 1.8401315904571415e-05,
"loss": 0.8667,
"step": 367
},
{
"epoch": 0.4017467248908297,
"grad_norm": 1.0712665487420094,
"learning_rate": 1.8391794375865025e-05,
"loss": 0.8315,
"step": 368
},
{
"epoch": 0.4028384279475983,
"grad_norm": 1.0898594530898202,
"learning_rate": 1.838224705554838e-05,
"loss": 0.8743,
"step": 369
},
{
"epoch": 0.4039301310043668,
"grad_norm": 1.0819675266807698,
"learning_rate": 1.8372673972964535e-05,
"loss": 0.8878,
"step": 370
},
{
"epoch": 0.40502183406113534,
"grad_norm": 1.045266116724849,
"learning_rate": 1.8363075157535696e-05,
"loss": 0.874,
"step": 371
},
{
"epoch": 0.40611353711790393,
"grad_norm": 1.0785004475662123,
"learning_rate": 1.8353450638763178e-05,
"loss": 0.8574,
"step": 372
},
{
"epoch": 0.40720524017467247,
"grad_norm": 1.002318943543793,
"learning_rate": 1.8343800446227286e-05,
"loss": 0.838,
"step": 373
},
{
"epoch": 0.40829694323144106,
"grad_norm": 1.0959648774204325,
"learning_rate": 1.833412460958723e-05,
"loss": 0.8624,
"step": 374
},
{
"epoch": 0.4093886462882096,
"grad_norm": 1.0679441203378826,
"learning_rate": 1.8324423158581034e-05,
"loss": 0.9165,
"step": 375
},
{
"epoch": 0.4104803493449782,
"grad_norm": 1.0657977208061515,
"learning_rate": 1.8314696123025456e-05,
"loss": 0.8504,
"step": 376
},
{
"epoch": 0.4115720524017467,
"grad_norm": 1.0786066636114091,
"learning_rate": 1.830494353281587e-05,
"loss": 0.8859,
"step": 377
},
{
"epoch": 0.4126637554585153,
"grad_norm": 1.0844538985964034,
"learning_rate": 1.8295165417926207e-05,
"loss": 0.8758,
"step": 378
},
{
"epoch": 0.41375545851528384,
"grad_norm": 1.1369381234410079,
"learning_rate": 1.828536180840884e-05,
"loss": 0.878,
"step": 379
},
{
"epoch": 0.4148471615720524,
"grad_norm": 1.0365327422122983,
"learning_rate": 1.827553273439449e-05,
"loss": 0.8178,
"step": 380
},
{
"epoch": 0.41593886462882096,
"grad_norm": 1.010419294509892,
"learning_rate": 1.826567822609216e-05,
"loss": 0.7879,
"step": 381
},
{
"epoch": 0.4170305676855895,
"grad_norm": 1.1313321136780952,
"learning_rate": 1.8255798313789e-05,
"loss": 0.8538,
"step": 382
},
{
"epoch": 0.4181222707423581,
"grad_norm": 1.0546556015471207,
"learning_rate": 1.8245893027850255e-05,
"loss": 0.8535,
"step": 383
},
{
"epoch": 0.4192139737991266,
"grad_norm": 1.1066352453385073,
"learning_rate": 1.823596239871915e-05,
"loss": 0.8706,
"step": 384
},
{
"epoch": 0.4203056768558952,
"grad_norm": 1.0788001936830498,
"learning_rate": 1.8226006456916796e-05,
"loss": 0.8743,
"step": 385
},
{
"epoch": 0.42139737991266374,
"grad_norm": 1.0281859293036308,
"learning_rate": 1.821602523304211e-05,
"loss": 0.8326,
"step": 386
},
{
"epoch": 0.42248908296943233,
"grad_norm": 1.0826900331711504,
"learning_rate": 1.82060187577717e-05,
"loss": 0.8939,
"step": 387
},
{
"epoch": 0.42358078602620086,
"grad_norm": 0.9839950051934626,
"learning_rate": 1.819598706185979e-05,
"loss": 0.8482,
"step": 388
},
{
"epoch": 0.42467248908296945,
"grad_norm": 1.0589092947298755,
"learning_rate": 1.8185930176138116e-05,
"loss": 0.8876,
"step": 389
},
{
"epoch": 0.425764192139738,
"grad_norm": 1.0750509472214214,
"learning_rate": 1.817584813151584e-05,
"loss": 0.877,
"step": 390
},
{
"epoch": 0.4268558951965066,
"grad_norm": 0.9919087062730784,
"learning_rate": 1.816574095897943e-05,
"loss": 0.8508,
"step": 391
},
{
"epoch": 0.4279475982532751,
"grad_norm": 1.0145840698237465,
"learning_rate": 1.8155608689592604e-05,
"loss": 0.8078,
"step": 392
},
{
"epoch": 0.42903930131004364,
"grad_norm": 1.0185084268471,
"learning_rate": 1.81454513544962e-05,
"loss": 0.8084,
"step": 393
},
{
"epoch": 0.43013100436681223,
"grad_norm": 1.0254127908397044,
"learning_rate": 1.8135268984908096e-05,
"loss": 0.8917,
"step": 394
},
{
"epoch": 0.43122270742358076,
"grad_norm": 1.0258173941195141,
"learning_rate": 1.8125061612123115e-05,
"loss": 0.8304,
"step": 395
},
{
"epoch": 0.43231441048034935,
"grad_norm": 1.0484331902250323,
"learning_rate": 1.811482926751293e-05,
"loss": 0.8323,
"step": 396
},
{
"epoch": 0.4334061135371179,
"grad_norm": 4.282417220328665,
"learning_rate": 1.810457198252595e-05,
"loss": 0.8679,
"step": 397
},
{
"epoch": 0.4344978165938865,
"grad_norm": 1.254363294463263,
"learning_rate": 1.8094289788687245e-05,
"loss": 0.827,
"step": 398
},
{
"epoch": 0.435589519650655,
"grad_norm": 1.2029279537370476,
"learning_rate": 1.8083982717598445e-05,
"loss": 0.8952,
"step": 399
},
{
"epoch": 0.4366812227074236,
"grad_norm": 1.1033747688307167,
"learning_rate": 1.8073650800937627e-05,
"loss": 0.7987,
"step": 400
},
{
"epoch": 0.43777292576419213,
"grad_norm": 1.0665321603069233,
"learning_rate": 1.8063294070459237e-05,
"loss": 0.8465,
"step": 401
},
{
"epoch": 0.4388646288209607,
"grad_norm": 1.169774102727397,
"learning_rate": 1.8052912557993983e-05,
"loss": 0.9538,
"step": 402
},
{
"epoch": 0.43995633187772926,
"grad_norm": 1.1194509195428748,
"learning_rate": 1.804250629544874e-05,
"loss": 0.8943,
"step": 403
},
{
"epoch": 0.4410480349344978,
"grad_norm": 1.0577514930567236,
"learning_rate": 1.803207531480645e-05,
"loss": 0.8548,
"step": 404
},
{
"epoch": 0.4421397379912664,
"grad_norm": 0.9973644029012136,
"learning_rate": 1.8021619648126022e-05,
"loss": 0.8669,
"step": 405
},
{
"epoch": 0.4432314410480349,
"grad_norm": 1.0523028770471372,
"learning_rate": 1.8011139327542238e-05,
"loss": 0.8684,
"step": 406
},
{
"epoch": 0.4443231441048035,
"grad_norm": 1.1262524145047217,
"learning_rate": 1.8000634385265653e-05,
"loss": 0.8784,
"step": 407
},
{
"epoch": 0.44541484716157204,
"grad_norm": 1.0496533920359254,
"learning_rate": 1.7990104853582494e-05,
"loss": 0.8894,
"step": 408
},
{
"epoch": 0.4465065502183406,
"grad_norm": 1.1201302730875389,
"learning_rate": 1.7979550764854556e-05,
"loss": 0.9027,
"step": 409
},
{
"epoch": 0.44759825327510916,
"grad_norm": 0.9929036874744435,
"learning_rate": 1.796897215151912e-05,
"loss": 0.8014,
"step": 410
},
{
"epoch": 0.44868995633187775,
"grad_norm": 1.041192261408248,
"learning_rate": 1.7958369046088837e-05,
"loss": 0.855,
"step": 411
},
{
"epoch": 0.4497816593886463,
"grad_norm": 1.0402849967481036,
"learning_rate": 1.7947741481151628e-05,
"loss": 0.8846,
"step": 412
},
{
"epoch": 0.45087336244541487,
"grad_norm": 1.1162899779922457,
"learning_rate": 1.7937089489370593e-05,
"loss": 0.8348,
"step": 413
},
{
"epoch": 0.4519650655021834,
"grad_norm": 1.0259479728400953,
"learning_rate": 1.7926413103483903e-05,
"loss": 0.8383,
"step": 414
},
{
"epoch": 0.45305676855895194,
"grad_norm": 1.0423937380857937,
"learning_rate": 1.7915712356304716e-05,
"loss": 0.8764,
"step": 415
},
{
"epoch": 0.45414847161572053,
"grad_norm": 1.0442894283040909,
"learning_rate": 1.7904987280721037e-05,
"loss": 0.8492,
"step": 416
},
{
"epoch": 0.45524017467248906,
"grad_norm": 2.3381394965875426,
"learning_rate": 1.7894237909695666e-05,
"loss": 0.9194,
"step": 417
},
{
"epoch": 0.45633187772925765,
"grad_norm": 1.0457862793643864,
"learning_rate": 1.7883464276266064e-05,
"loss": 0.9105,
"step": 418
},
{
"epoch": 0.4574235807860262,
"grad_norm": 1.0938487563819301,
"learning_rate": 1.7872666413544263e-05,
"loss": 0.8505,
"step": 419
},
{
"epoch": 0.4585152838427948,
"grad_norm": 1.0479613316053789,
"learning_rate": 1.7861844354716757e-05,
"loss": 0.8666,
"step": 420
},
{
"epoch": 0.4596069868995633,
"grad_norm": 1.0354989854912366,
"learning_rate": 1.7850998133044414e-05,
"loss": 0.8579,
"step": 421
},
{
"epoch": 0.4606986899563319,
"grad_norm": 1.0244832450034898,
"learning_rate": 1.7840127781862354e-05,
"loss": 0.8339,
"step": 422
},
{
"epoch": 0.46179039301310043,
"grad_norm": 1.0701586177160183,
"learning_rate": 1.782923333457987e-05,
"loss": 0.8655,
"step": 423
},
{
"epoch": 0.462882096069869,
"grad_norm": 1.1367159914706504,
"learning_rate": 1.78183148246803e-05,
"loss": 0.8425,
"step": 424
},
{
"epoch": 0.46397379912663755,
"grad_norm": 1.0318001555579586,
"learning_rate": 1.7807372285720945e-05,
"loss": 0.8334,
"step": 425
},
{
"epoch": 0.4650655021834061,
"grad_norm": 1.0034906079850978,
"learning_rate": 1.779640575133296e-05,
"loss": 0.7919,
"step": 426
},
{
"epoch": 0.4661572052401747,
"grad_norm": 1.0948762926658004,
"learning_rate": 1.7785415255221237e-05,
"loss": 0.8112,
"step": 427
},
{
"epoch": 0.4672489082969432,
"grad_norm": 1.0595832358250177,
"learning_rate": 1.777440083116432e-05,
"loss": 0.856,
"step": 428
},
{
"epoch": 0.4683406113537118,
"grad_norm": 1.0120238139833355,
"learning_rate": 1.7763362513014303e-05,
"loss": 0.8358,
"step": 429
},
{
"epoch": 0.46943231441048033,
"grad_norm": 1.0603420733510691,
"learning_rate": 1.7752300334696696e-05,
"loss": 0.8273,
"step": 430
},
{
"epoch": 0.4705240174672489,
"grad_norm": 1.0149418311756344,
"learning_rate": 1.774121433021036e-05,
"loss": 0.776,
"step": 431
},
{
"epoch": 0.47161572052401746,
"grad_norm": 1.0427962535184434,
"learning_rate": 1.773010453362737e-05,
"loss": 0.8414,
"step": 432
},
{
"epoch": 0.47270742358078605,
"grad_norm": 1.0541515356824127,
"learning_rate": 1.771897097909294e-05,
"loss": 0.863,
"step": 433
},
{
"epoch": 0.4737991266375546,
"grad_norm": 1.175558645343248,
"learning_rate": 1.7707813700825288e-05,
"loss": 0.8717,
"step": 434
},
{
"epoch": 0.47489082969432317,
"grad_norm": 1.0054868964481134,
"learning_rate": 1.7696632733115554e-05,
"loss": 0.8423,
"step": 435
},
{
"epoch": 0.4759825327510917,
"grad_norm": 0.9992172603418047,
"learning_rate": 1.7685428110327683e-05,
"loss": 0.8136,
"step": 436
},
{
"epoch": 0.47707423580786024,
"grad_norm": 1.000512843742466,
"learning_rate": 1.767419986689832e-05,
"loss": 0.8196,
"step": 437
},
{
"epoch": 0.4781659388646288,
"grad_norm": 1.0785231977862761,
"learning_rate": 1.7662948037336712e-05,
"loss": 0.8726,
"step": 438
},
{
"epoch": 0.47925764192139736,
"grad_norm": 1.0954902478774415,
"learning_rate": 1.7651672656224592e-05,
"loss": 0.8797,
"step": 439
},
{
"epoch": 0.48034934497816595,
"grad_norm": 1.0327664989255525,
"learning_rate": 1.7640373758216075e-05,
"loss": 0.8152,
"step": 440
},
{
"epoch": 0.4814410480349345,
"grad_norm": 1.0354412037183665,
"learning_rate": 1.7629051378037563e-05,
"loss": 0.8877,
"step": 441
},
{
"epoch": 0.48253275109170307,
"grad_norm": 1.1015989497623595,
"learning_rate": 1.761770555048762e-05,
"loss": 0.8427,
"step": 442
},
{
"epoch": 0.4836244541484716,
"grad_norm": 1.0320497473447086,
"learning_rate": 1.7606336310436873e-05,
"loss": 0.8563,
"step": 443
},
{
"epoch": 0.4847161572052402,
"grad_norm": 0.9997658446696477,
"learning_rate": 1.7594943692827913e-05,
"loss": 0.8133,
"step": 444
},
{
"epoch": 0.48580786026200873,
"grad_norm": 1.0250193435299315,
"learning_rate": 1.758352773267518e-05,
"loss": 0.8018,
"step": 445
},
{
"epoch": 0.4868995633187773,
"grad_norm": 1.039285984913906,
"learning_rate": 1.7572088465064847e-05,
"loss": 0.8292,
"step": 446
},
{
"epoch": 0.48799126637554585,
"grad_norm": 1.036818189180488,
"learning_rate": 1.756062592515473e-05,
"loss": 0.8647,
"step": 447
},
{
"epoch": 0.4890829694323144,
"grad_norm": 1.0615514046606271,
"learning_rate": 1.754914014817416e-05,
"loss": 0.8532,
"step": 448
},
{
"epoch": 0.490174672489083,
"grad_norm": 1.072361184371725,
"learning_rate": 1.7537631169423904e-05,
"loss": 0.8209,
"step": 449
},
{
"epoch": 0.4912663755458515,
"grad_norm": 1.006916101154413,
"learning_rate": 1.7526099024276017e-05,
"loss": 0.8522,
"step": 450
},
{
"epoch": 0.4923580786026201,
"grad_norm": 1.0491709821312805,
"learning_rate": 1.7514543748173768e-05,
"loss": 0.815,
"step": 451
},
{
"epoch": 0.49344978165938863,
"grad_norm": 1.112345421697922,
"learning_rate": 1.7502965376631515e-05,
"loss": 0.8733,
"step": 452
},
{
"epoch": 0.4945414847161572,
"grad_norm": 1.0196934926549626,
"learning_rate": 1.7491363945234595e-05,
"loss": 0.9038,
"step": 453
},
{
"epoch": 0.49563318777292575,
"grad_norm": 1.000891252703294,
"learning_rate": 1.7479739489639218e-05,
"loss": 0.8018,
"step": 454
},
{
"epoch": 0.49672489082969434,
"grad_norm": 1.0356650853844807,
"learning_rate": 1.7468092045572366e-05,
"loss": 0.8717,
"step": 455
},
{
"epoch": 0.4978165938864629,
"grad_norm": 1.06647966328419,
"learning_rate": 1.7456421648831658e-05,
"loss": 0.8846,
"step": 456
},
{
"epoch": 0.49890829694323147,
"grad_norm": 0.9718178063575992,
"learning_rate": 1.7444728335285272e-05,
"loss": 0.8282,
"step": 457
},
{
"epoch": 0.5,
"grad_norm": 1.0013431873045686,
"learning_rate": 1.743301214087181e-05,
"loss": 0.8001,
"step": 458
},
{
"epoch": 0.5010917030567685,
"grad_norm": 1.0740535843812218,
"learning_rate": 1.7421273101600204e-05,
"loss": 0.8754,
"step": 459
},
{
"epoch": 0.5021834061135371,
"grad_norm": 0.9995691628942387,
"learning_rate": 1.7409511253549592e-05,
"loss": 0.8498,
"step": 460
},
{
"epoch": 0.5032751091703057,
"grad_norm": 1.0484096682850725,
"learning_rate": 1.7397726632869217e-05,
"loss": 0.8627,
"step": 461
},
{
"epoch": 0.5043668122270742,
"grad_norm": 0.9787309299681939,
"learning_rate": 1.7385919275778306e-05,
"loss": 0.8515,
"step": 462
},
{
"epoch": 0.5054585152838428,
"grad_norm": 1.0553331678079094,
"learning_rate": 1.7374089218565973e-05,
"loss": 0.854,
"step": 463
},
{
"epoch": 0.5065502183406113,
"grad_norm": 1.0237550943918565,
"learning_rate": 1.7362236497591097e-05,
"loss": 0.8543,
"step": 464
},
{
"epoch": 0.50764192139738,
"grad_norm": 1.0479711130387015,
"learning_rate": 1.7350361149282204e-05,
"loss": 0.8683,
"step": 465
},
{
"epoch": 0.5087336244541485,
"grad_norm": 1.0479876677888864,
"learning_rate": 1.733846321013738e-05,
"loss": 0.8384,
"step": 466
},
{
"epoch": 0.509825327510917,
"grad_norm": 1.0123245970073138,
"learning_rate": 1.7326542716724127e-05,
"loss": 0.9322,
"step": 467
},
{
"epoch": 0.5109170305676856,
"grad_norm": 1.0074008131779049,
"learning_rate": 1.731459970567928e-05,
"loss": 0.84,
"step": 468
},
{
"epoch": 0.5120087336244541,
"grad_norm": 1.101200499888394,
"learning_rate": 1.730263421370886e-05,
"loss": 0.9147,
"step": 469
},
{
"epoch": 0.5131004366812227,
"grad_norm": 1.0144901458139852,
"learning_rate": 1.7290646277588004e-05,
"loss": 0.8137,
"step": 470
},
{
"epoch": 0.5141921397379913,
"grad_norm": 0.9662577411188962,
"learning_rate": 1.7278635934160816e-05,
"loss": 0.8216,
"step": 471
},
{
"epoch": 0.5152838427947598,
"grad_norm": 0.9953785434574902,
"learning_rate": 1.7266603220340273e-05,
"loss": 0.8596,
"step": 472
},
{
"epoch": 0.5163755458515283,
"grad_norm": 1.0264482281643867,
"learning_rate": 1.72545481731081e-05,
"loss": 0.8305,
"step": 473
},
{
"epoch": 0.517467248908297,
"grad_norm": 1.030203549642048,
"learning_rate": 1.7242470829514674e-05,
"loss": 0.8345,
"step": 474
},
{
"epoch": 0.5185589519650655,
"grad_norm": 0.9710461808326063,
"learning_rate": 1.7230371226678876e-05,
"loss": 0.8521,
"step": 475
},
{
"epoch": 0.519650655021834,
"grad_norm": 1.023370348893279,
"learning_rate": 1.7218249401788033e-05,
"loss": 0.7798,
"step": 476
},
{
"epoch": 0.5207423580786026,
"grad_norm": 1.025914668121047,
"learning_rate": 1.7206105392097736e-05,
"loss": 0.863,
"step": 477
},
{
"epoch": 0.5218340611353712,
"grad_norm": 0.9690377932600598,
"learning_rate": 1.719393923493178e-05,
"loss": 0.8914,
"step": 478
},
{
"epoch": 0.5229257641921398,
"grad_norm": 0.9594444536591099,
"learning_rate": 1.7181750967682022e-05,
"loss": 0.7758,
"step": 479
},
{
"epoch": 0.5240174672489083,
"grad_norm": 1.0614467541564174,
"learning_rate": 1.7169540627808276e-05,
"loss": 0.823,
"step": 480
},
{
"epoch": 0.5251091703056768,
"grad_norm": 1.4460773779786078,
"learning_rate": 1.7157308252838187e-05,
"loss": 0.9603,
"step": 481
},
{
"epoch": 0.5262008733624454,
"grad_norm": 1.0653191364149415,
"learning_rate": 1.7145053880367134e-05,
"loss": 0.8087,
"step": 482
},
{
"epoch": 0.527292576419214,
"grad_norm": 1.0169339093704666,
"learning_rate": 1.7132777548058103e-05,
"loss": 0.8118,
"step": 483
},
{
"epoch": 0.5283842794759825,
"grad_norm": 1.051698062978159,
"learning_rate": 1.7120479293641558e-05,
"loss": 0.8567,
"step": 484
},
{
"epoch": 0.5294759825327511,
"grad_norm": 1.0817966296346004,
"learning_rate": 1.7108159154915348e-05,
"loss": 0.8886,
"step": 485
},
{
"epoch": 0.5305676855895196,
"grad_norm": 1.0599420820404035,
"learning_rate": 1.7095817169744596e-05,
"loss": 0.8339,
"step": 486
},
{
"epoch": 0.5316593886462883,
"grad_norm": 1.0409067153327243,
"learning_rate": 1.7083453376061542e-05,
"loss": 0.8516,
"step": 487
},
{
"epoch": 0.5327510917030568,
"grad_norm": 1.044566376402019,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.8281,
"step": 488
},
{
"epoch": 0.5338427947598253,
"grad_norm": 1.0687064993710373,
"learning_rate": 1.7058660515222583e-05,
"loss": 0.8406,
"step": 489
},
{
"epoch": 0.5349344978165939,
"grad_norm": 0.999108192583162,
"learning_rate": 1.704623152426585e-05,
"loss": 0.8491,
"step": 490
},
{
"epoch": 0.5360262008733624,
"grad_norm": 1.0292912957335598,
"learning_rate": 1.7033780877194935e-05,
"loss": 0.8269,
"step": 491
},
{
"epoch": 0.537117903930131,
"grad_norm": 1.06830661497397,
"learning_rate": 1.7021308612276056e-05,
"loss": 0.8931,
"step": 492
},
{
"epoch": 0.5382096069868996,
"grad_norm": 1.084492682469156,
"learning_rate": 1.7008814767841872e-05,
"loss": 0.8693,
"step": 493
},
{
"epoch": 0.5393013100436681,
"grad_norm": 1.0454485195891257,
"learning_rate": 1.699629938229137e-05,
"loss": 0.8503,
"step": 494
},
{
"epoch": 0.5403930131004366,
"grad_norm": 1.0191867273452386,
"learning_rate": 1.6983762494089732e-05,
"loss": 0.7995,
"step": 495
},
{
"epoch": 0.5414847161572053,
"grad_norm": 1.0630877415036102,
"learning_rate": 1.6971204141768235e-05,
"loss": 0.9406,
"step": 496
},
{
"epoch": 0.5425764192139738,
"grad_norm": 0.9888822595216342,
"learning_rate": 1.695862436392412e-05,
"loss": 0.875,
"step": 497
},
{
"epoch": 0.5436681222707423,
"grad_norm": 1.057270695106973,
"learning_rate": 1.694602319922049e-05,
"loss": 0.8417,
"step": 498
},
{
"epoch": 0.5447598253275109,
"grad_norm": 1.0195541806819288,
"learning_rate": 1.6933400686386155e-05,
"loss": 0.836,
"step": 499
},
{
"epoch": 0.5458515283842795,
"grad_norm": 1.0265128911776806,
"learning_rate": 1.6920756864215558e-05,
"loss": 0.773,
"step": 500
},
{
"epoch": 0.5469432314410481,
"grad_norm": 1.0380370718284653,
"learning_rate": 1.6908091771568627e-05,
"loss": 0.8698,
"step": 501
},
{
"epoch": 0.5480349344978166,
"grad_norm": 1.039423505383226,
"learning_rate": 1.689540544737067e-05,
"loss": 0.8544,
"step": 502
},
{
"epoch": 0.5491266375545851,
"grad_norm": 1.0217176219487862,
"learning_rate": 1.6882697930612238e-05,
"loss": 0.7874,
"step": 503
},
{
"epoch": 0.5502183406113537,
"grad_norm": 1.045988466122317,
"learning_rate": 1.686996926034902e-05,
"loss": 0.8567,
"step": 504
},
{
"epoch": 0.5513100436681223,
"grad_norm": 1.0447659724616767,
"learning_rate": 1.6857219475701717e-05,
"loss": 0.8109,
"step": 505
},
{
"epoch": 0.5524017467248908,
"grad_norm": 1.0664195249433643,
"learning_rate": 1.6844448615855933e-05,
"loss": 0.8202,
"step": 506
},
{
"epoch": 0.5534934497816594,
"grad_norm": 1.0089648633669173,
"learning_rate": 1.683165672006204e-05,
"loss": 0.9121,
"step": 507
},
{
"epoch": 0.5545851528384279,
"grad_norm": 1.0452529273651734,
"learning_rate": 1.6818843827635052e-05,
"loss": 0.8013,
"step": 508
},
{
"epoch": 0.5556768558951966,
"grad_norm": 1.0633409457304046,
"learning_rate": 1.6806009977954533e-05,
"loss": 0.7944,
"step": 509
},
{
"epoch": 0.5567685589519651,
"grad_norm": 1.0482706463529357,
"learning_rate": 1.6793155210464442e-05,
"loss": 0.8671,
"step": 510
},
{
"epoch": 0.5578602620087336,
"grad_norm": 1.029790394822473,
"learning_rate": 1.678027956467304e-05,
"loss": 0.8645,
"step": 511
},
{
"epoch": 0.5589519650655022,
"grad_norm": 1.030909653935639,
"learning_rate": 1.6767383080152744e-05,
"loss": 0.8318,
"step": 512
},
{
"epoch": 0.5600436681222707,
"grad_norm": 1.9383979856595541,
"learning_rate": 1.675446579654003e-05,
"loss": 0.888,
"step": 513
},
{
"epoch": 0.5611353711790393,
"grad_norm": 1.0983725411335354,
"learning_rate": 1.6741527753535285e-05,
"loss": 0.8451,
"step": 514
},
{
"epoch": 0.5622270742358079,
"grad_norm": 1.0640374450842205,
"learning_rate": 1.6728568990902713e-05,
"loss": 0.8531,
"step": 515
},
{
"epoch": 0.5633187772925764,
"grad_norm": 1.0519286549875264,
"learning_rate": 1.6715589548470187e-05,
"loss": 0.8703,
"step": 516
},
{
"epoch": 0.5644104803493449,
"grad_norm": 1.0494252089696754,
"learning_rate": 1.670258946612914e-05,
"loss": 0.8492,
"step": 517
},
{
"epoch": 0.5655021834061136,
"grad_norm": 0.9904660339342319,
"learning_rate": 1.668956878383445e-05,
"loss": 0.7819,
"step": 518
},
{
"epoch": 0.5665938864628821,
"grad_norm": 1.0101879469337889,
"learning_rate": 1.667652754160429e-05,
"loss": 0.8154,
"step": 519
},
{
"epoch": 0.5676855895196506,
"grad_norm": 1.0204945679192219,
"learning_rate": 1.6663465779520042e-05,
"loss": 0.8306,
"step": 520
},
{
"epoch": 0.5687772925764192,
"grad_norm": 1.0213823349571565,
"learning_rate": 1.665038353772614e-05,
"loss": 0.8345,
"step": 521
},
{
"epoch": 0.5698689956331878,
"grad_norm": 1.036141206555699,
"learning_rate": 1.6637280856429964e-05,
"loss": 0.8133,
"step": 522
},
{
"epoch": 0.5709606986899564,
"grad_norm": 0.9890345567375529,
"learning_rate": 1.662415777590172e-05,
"loss": 0.8548,
"step": 523
},
{
"epoch": 0.5720524017467249,
"grad_norm": 0.9798722379098405,
"learning_rate": 1.6611014336474303e-05,
"loss": 0.8496,
"step": 524
},
{
"epoch": 0.5731441048034934,
"grad_norm": 0.9979661876547649,
"learning_rate": 1.6597850578543177e-05,
"loss": 0.783,
"step": 525
},
{
"epoch": 0.574235807860262,
"grad_norm": 1.0077066818548024,
"learning_rate": 1.658466654256627e-05,
"loss": 0.8443,
"step": 526
},
{
"epoch": 0.5753275109170306,
"grad_norm": 0.9644667511413415,
"learning_rate": 1.6571462269063812e-05,
"loss": 0.8507,
"step": 527
},
{
"epoch": 0.5764192139737991,
"grad_norm": 1.0021206328145529,
"learning_rate": 1.6558237798618243e-05,
"loss": 0.8371,
"step": 528
},
{
"epoch": 0.5775109170305677,
"grad_norm": 0.9854485893320216,
"learning_rate": 1.6544993171874077e-05,
"loss": 0.8189,
"step": 529
},
{
"epoch": 0.5786026200873362,
"grad_norm": 1.0229073338371153,
"learning_rate": 1.6531728429537766e-05,
"loss": 0.8071,
"step": 530
},
{
"epoch": 0.5796943231441049,
"grad_norm": 1.0114149790482636,
"learning_rate": 1.6518443612377613e-05,
"loss": 0.8185,
"step": 531
},
{
"epoch": 0.5807860262008734,
"grad_norm": 1.0586948600839432,
"learning_rate": 1.6505138761223586e-05,
"loss": 0.8605,
"step": 532
},
{
"epoch": 0.5818777292576419,
"grad_norm": 0.972195996633417,
"learning_rate": 1.6491813916967246e-05,
"loss": 0.8359,
"step": 533
},
{
"epoch": 0.5829694323144105,
"grad_norm": 0.9930463512342607,
"learning_rate": 1.64784691205616e-05,
"loss": 0.8135,
"step": 534
},
{
"epoch": 0.584061135371179,
"grad_norm": 1.0417164111837445,
"learning_rate": 1.646510441302097e-05,
"loss": 0.8224,
"step": 535
},
{
"epoch": 0.5851528384279476,
"grad_norm": 1.6731855789988992,
"learning_rate": 1.645171983542088e-05,
"loss": 0.8745,
"step": 536
},
{
"epoch": 0.5862445414847162,
"grad_norm": 1.0085925032008127,
"learning_rate": 1.6438315428897914e-05,
"loss": 0.8359,
"step": 537
},
{
"epoch": 0.5873362445414847,
"grad_norm": 1.1694220636353037,
"learning_rate": 1.642489123464962e-05,
"loss": 0.8165,
"step": 538
},
{
"epoch": 0.5884279475982532,
"grad_norm": 1.0570727108856277,
"learning_rate": 1.641144729393433e-05,
"loss": 0.8999,
"step": 539
},
{
"epoch": 0.5895196506550219,
"grad_norm": 0.9871206084931676,
"learning_rate": 1.6397983648071093e-05,
"loss": 0.8631,
"step": 540
},
{
"epoch": 0.5906113537117904,
"grad_norm": 1.0832488292215814,
"learning_rate": 1.638450033843951e-05,
"loss": 0.8706,
"step": 541
},
{
"epoch": 0.5917030567685589,
"grad_norm": 1.0499913565218573,
"learning_rate": 1.6370997406479617e-05,
"loss": 0.8278,
"step": 542
},
{
"epoch": 0.5927947598253275,
"grad_norm": 0.988445718936173,
"learning_rate": 1.635747489369176e-05,
"loss": 0.778,
"step": 543
},
{
"epoch": 0.5938864628820961,
"grad_norm": 1.1145601219811156,
"learning_rate": 1.6343932841636455e-05,
"loss": 0.8756,
"step": 544
},
{
"epoch": 0.5949781659388647,
"grad_norm": 1.0274662465223996,
"learning_rate": 1.6330371291934292e-05,
"loss": 0.8715,
"step": 545
},
{
"epoch": 0.5960698689956332,
"grad_norm": 0.9888929908537507,
"learning_rate": 1.6316790286265764e-05,
"loss": 0.8016,
"step": 546
},
{
"epoch": 0.5971615720524017,
"grad_norm": 0.9800632578267566,
"learning_rate": 1.6303189866371177e-05,
"loss": 0.8186,
"step": 547
},
{
"epoch": 0.5982532751091703,
"grad_norm": 1.0181022884649904,
"learning_rate": 1.6289570074050492e-05,
"loss": 0.7717,
"step": 548
},
{
"epoch": 0.5993449781659389,
"grad_norm": 0.9931383725603153,
"learning_rate": 1.627593095116322e-05,
"loss": 0.8062,
"step": 549
},
{
"epoch": 0.6004366812227074,
"grad_norm": 1.0565432388214935,
"learning_rate": 1.6262272539628277e-05,
"loss": 0.9332,
"step": 550
},
{
"epoch": 0.601528384279476,
"grad_norm": 1.017358655586954,
"learning_rate": 1.6248594881423866e-05,
"loss": 0.8293,
"step": 551
},
{
"epoch": 0.6026200873362445,
"grad_norm": 1.0564541094752922,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.8108,
"step": 552
},
{
"epoch": 0.6037117903930131,
"grad_norm": 1.0962662069798912,
"learning_rate": 1.622118199321507e-05,
"loss": 0.8118,
"step": 553
},
{
"epoch": 0.6048034934497817,
"grad_norm": 1.0002035698397997,
"learning_rate": 1.6207446847462338e-05,
"loss": 0.8251,
"step": 554
},
{
"epoch": 0.6058951965065502,
"grad_norm": 1.0362803709803372,
"learning_rate": 1.619369262354318e-05,
"loss": 0.831,
"step": 555
},
{
"epoch": 0.6069868995633187,
"grad_norm": 1.0350272517706043,
"learning_rate": 1.617991936373027e-05,
"loss": 0.8296,
"step": 556
},
{
"epoch": 0.6080786026200873,
"grad_norm": 1.022295830165343,
"learning_rate": 1.6166127110354778e-05,
"loss": 0.8439,
"step": 557
},
{
"epoch": 0.6091703056768559,
"grad_norm": 1.0036104581770837,
"learning_rate": 1.615231590580627e-05,
"loss": 0.8274,
"step": 558
},
{
"epoch": 0.6102620087336245,
"grad_norm": 1.0625230714531217,
"learning_rate": 1.613848579253254e-05,
"loss": 0.8444,
"step": 559
},
{
"epoch": 0.611353711790393,
"grad_norm": 0.9795637534852265,
"learning_rate": 1.6124636813039502e-05,
"loss": 0.8321,
"step": 560
},
{
"epoch": 0.6124454148471615,
"grad_norm": 1.032960466441184,
"learning_rate": 1.6110769009891055e-05,
"loss": 0.8488,
"step": 561
},
{
"epoch": 0.6135371179039302,
"grad_norm": 1.0248084792381282,
"learning_rate": 1.6096882425708953e-05,
"loss": 0.8687,
"step": 562
},
{
"epoch": 0.6146288209606987,
"grad_norm": 1.1858913840076635,
"learning_rate": 1.6082977103172664e-05,
"loss": 0.883,
"step": 563
},
{
"epoch": 0.6157205240174672,
"grad_norm": 0.969580319658712,
"learning_rate": 1.6069053085019258e-05,
"loss": 0.7961,
"step": 564
},
{
"epoch": 0.6168122270742358,
"grad_norm": 1.0003293992792357,
"learning_rate": 1.605511041404326e-05,
"loss": 0.8077,
"step": 565
},
{
"epoch": 0.6179039301310044,
"grad_norm": 1.1047881958890635,
"learning_rate": 1.6041149133096515e-05,
"loss": 0.9151,
"step": 566
},
{
"epoch": 0.618995633187773,
"grad_norm": 1.0577490044445212,
"learning_rate": 1.6027169285088074e-05,
"loss": 0.8326,
"step": 567
},
{
"epoch": 0.6200873362445415,
"grad_norm": 0.9662357738547696,
"learning_rate": 1.601317091298406e-05,
"loss": 0.8242,
"step": 568
},
{
"epoch": 0.62117903930131,
"grad_norm": 1.0086707551521494,
"learning_rate": 1.599915405980751e-05,
"loss": 0.8419,
"step": 569
},
{
"epoch": 0.6222707423580786,
"grad_norm": 1.0265853989002143,
"learning_rate": 1.5985118768638276e-05,
"loss": 0.8381,
"step": 570
},
{
"epoch": 0.6233624454148472,
"grad_norm": 1.0364995286049488,
"learning_rate": 1.5971065082612866e-05,
"loss": 0.8099,
"step": 571
},
{
"epoch": 0.6244541484716157,
"grad_norm": 1.0536821607548954,
"learning_rate": 1.5956993044924334e-05,
"loss": 0.8965,
"step": 572
},
{
"epoch": 0.6255458515283843,
"grad_norm": 1.0587477022760539,
"learning_rate": 1.5942902698822136e-05,
"loss": 0.8344,
"step": 573
},
{
"epoch": 0.6266375545851528,
"grad_norm": 1.045882932961161,
"learning_rate": 1.5928794087611988e-05,
"loss": 0.7895,
"step": 574
},
{
"epoch": 0.6277292576419214,
"grad_norm": 1.0497652337308017,
"learning_rate": 1.5914667254655748e-05,
"loss": 0.8299,
"step": 575
},
{
"epoch": 0.62882096069869,
"grad_norm": 1.0354764635928753,
"learning_rate": 1.5900522243371283e-05,
"loss": 0.8605,
"step": 576
},
{
"epoch": 0.6299126637554585,
"grad_norm": 0.9667502333640907,
"learning_rate": 1.5886359097232324e-05,
"loss": 0.8035,
"step": 577
},
{
"epoch": 0.631004366812227,
"grad_norm": 1.153930271061058,
"learning_rate": 1.5872177859768336e-05,
"loss": 0.7997,
"step": 578
},
{
"epoch": 0.6320960698689956,
"grad_norm": 1.0450063972970456,
"learning_rate": 1.585797857456439e-05,
"loss": 0.829,
"step": 579
},
{
"epoch": 0.6331877729257642,
"grad_norm": 0.9617190100950247,
"learning_rate": 1.5843761285261027e-05,
"loss": 0.8408,
"step": 580
},
{
"epoch": 0.6342794759825328,
"grad_norm": 0.9913810116496855,
"learning_rate": 1.582952603555412e-05,
"loss": 0.7985,
"step": 581
},
{
"epoch": 0.6353711790393013,
"grad_norm": 1.0861455006638756,
"learning_rate": 1.581527286919474e-05,
"loss": 0.8731,
"step": 582
},
{
"epoch": 0.6364628820960698,
"grad_norm": 1.0258830351125474,
"learning_rate": 1.580100182998903e-05,
"loss": 0.82,
"step": 583
},
{
"epoch": 0.6375545851528385,
"grad_norm": 1.0262414051479618,
"learning_rate": 1.578671296179806e-05,
"loss": 0.8687,
"step": 584
},
{
"epoch": 0.638646288209607,
"grad_norm": 1.017254662022287,
"learning_rate": 1.5772406308537692e-05,
"loss": 0.8775,
"step": 585
},
{
"epoch": 0.6397379912663755,
"grad_norm": 0.9719658760673426,
"learning_rate": 1.5758081914178457e-05,
"loss": 0.763,
"step": 586
},
{
"epoch": 0.6408296943231441,
"grad_norm": 1.0125128698206225,
"learning_rate": 1.5743739822745405e-05,
"loss": 0.8296,
"step": 587
},
{
"epoch": 0.6419213973799127,
"grad_norm": 0.9521846071912784,
"learning_rate": 1.5729380078317982e-05,
"loss": 0.7994,
"step": 588
},
{
"epoch": 0.6430131004366813,
"grad_norm": 0.9605098428504907,
"learning_rate": 1.5715002725029893e-05,
"loss": 0.8023,
"step": 589
},
{
"epoch": 0.6441048034934498,
"grad_norm": 1.0140159283483732,
"learning_rate": 1.5700607807068946e-05,
"loss": 0.7785,
"step": 590
},
{
"epoch": 0.6451965065502183,
"grad_norm": 0.9692730437519576,
"learning_rate": 1.5686195368676954e-05,
"loss": 0.8041,
"step": 591
},
{
"epoch": 0.6462882096069869,
"grad_norm": 0.9749041091602636,
"learning_rate": 1.5671765454149558e-05,
"loss": 0.8173,
"step": 592
},
{
"epoch": 0.6473799126637555,
"grad_norm": 1.0355433585794054,
"learning_rate": 1.5657318107836133e-05,
"loss": 0.8306,
"step": 593
},
{
"epoch": 0.648471615720524,
"grad_norm": 1.0032727193309083,
"learning_rate": 1.564285337413961e-05,
"loss": 0.8293,
"step": 594
},
{
"epoch": 0.6495633187772926,
"grad_norm": 0.989829163598842,
"learning_rate": 1.5628371297516364e-05,
"loss": 0.8961,
"step": 595
},
{
"epoch": 0.6506550218340611,
"grad_norm": 1.0382291038720708,
"learning_rate": 1.5613871922476082e-05,
"loss": 0.841,
"step": 596
},
{
"epoch": 0.6517467248908297,
"grad_norm": 0.9462720396951789,
"learning_rate": 1.5599355293581598e-05,
"loss": 0.8017,
"step": 597
},
{
"epoch": 0.6528384279475983,
"grad_norm": 1.0098732664768768,
"learning_rate": 1.558482145544879e-05,
"loss": 0.8493,
"step": 598
},
{
"epoch": 0.6539301310043668,
"grad_norm": 1.0161614656932214,
"learning_rate": 1.5570270452746426e-05,
"loss": 0.8675,
"step": 599
},
{
"epoch": 0.6550218340611353,
"grad_norm": 1.0205394808315873,
"learning_rate": 1.5555702330196024e-05,
"loss": 0.8065,
"step": 600
},
{
"epoch": 0.6561135371179039,
"grad_norm": 1.0215903373863715,
"learning_rate": 1.5541117132571718e-05,
"loss": 0.8406,
"step": 601
},
{
"epoch": 0.6572052401746725,
"grad_norm": 0.9730755603144055,
"learning_rate": 1.552651490470012e-05,
"loss": 0.8315,
"step": 602
},
{
"epoch": 0.6582969432314411,
"grad_norm": 0.9720694505211114,
"learning_rate": 1.5511895691460187e-05,
"loss": 0.82,
"step": 603
},
{
"epoch": 0.6593886462882096,
"grad_norm": 0.9647496450135179,
"learning_rate": 1.5497259537783084e-05,
"loss": 0.8889,
"step": 604
},
{
"epoch": 0.6604803493449781,
"grad_norm": 1.0148430437286953,
"learning_rate": 1.548260648865203e-05,
"loss": 0.8047,
"step": 605
},
{
"epoch": 0.6615720524017468,
"grad_norm": 0.9817255120479763,
"learning_rate": 1.546793658910218e-05,
"loss": 0.8596,
"step": 606
},
{
"epoch": 0.6626637554585153,
"grad_norm": 1.026775560362339,
"learning_rate": 1.5453249884220466e-05,
"loss": 0.8419,
"step": 607
},
{
"epoch": 0.6637554585152838,
"grad_norm": 1.1049484132912009,
"learning_rate": 1.543854641914549e-05,
"loss": 0.8032,
"step": 608
},
{
"epoch": 0.6648471615720524,
"grad_norm": 0.9987731135942097,
"learning_rate": 1.5423826239067342e-05,
"loss": 0.8059,
"step": 609
},
{
"epoch": 0.665938864628821,
"grad_norm": 1.00693504079384,
"learning_rate": 1.540908938922751e-05,
"loss": 0.807,
"step": 610
},
{
"epoch": 0.6670305676855895,
"grad_norm": 0.9953626555716258,
"learning_rate": 1.539433591491869e-05,
"loss": 0.7989,
"step": 611
},
{
"epoch": 0.6681222707423581,
"grad_norm": 1.0935727179499282,
"learning_rate": 1.537956586148469e-05,
"loss": 0.8398,
"step": 612
},
{
"epoch": 0.6692139737991266,
"grad_norm": 0.9861802448122945,
"learning_rate": 1.5364779274320255e-05,
"loss": 0.8345,
"step": 613
},
{
"epoch": 0.6703056768558951,
"grad_norm": 0.9973751361236348,
"learning_rate": 1.5349976198870974e-05,
"loss": 0.8042,
"step": 614
},
{
"epoch": 0.6713973799126638,
"grad_norm": 0.9648172303786421,
"learning_rate": 1.5335156680633082e-05,
"loss": 0.838,
"step": 615
},
{
"epoch": 0.6724890829694323,
"grad_norm": 0.9845493228154923,
"learning_rate": 1.5320320765153367e-05,
"loss": 0.7666,
"step": 616
},
{
"epoch": 0.6735807860262009,
"grad_norm": 1.0080145421206512,
"learning_rate": 1.5305468498029007e-05,
"loss": 0.8443,
"step": 617
},
{
"epoch": 0.6746724890829694,
"grad_norm": 0.9849749457287591,
"learning_rate": 1.5290599924907435e-05,
"loss": 0.7828,
"step": 618
},
{
"epoch": 0.675764192139738,
"grad_norm": 1.0107001619234208,
"learning_rate": 1.5275715091486204e-05,
"loss": 0.8223,
"step": 619
},
{
"epoch": 0.6768558951965066,
"grad_norm": 0.9984295630646346,
"learning_rate": 1.5260814043512838e-05,
"loss": 0.837,
"step": 620
},
{
"epoch": 0.6779475982532751,
"grad_norm": 1.0516480526589032,
"learning_rate": 1.5245896826784689e-05,
"loss": 0.8818,
"step": 621
},
{
"epoch": 0.6790393013100436,
"grad_norm": 0.9985462599845958,
"learning_rate": 1.5230963487148822e-05,
"loss": 0.8257,
"step": 622
},
{
"epoch": 0.6801310043668122,
"grad_norm": 0.9880543567496975,
"learning_rate": 1.5216014070501835e-05,
"loss": 0.8242,
"step": 623
},
{
"epoch": 0.6812227074235808,
"grad_norm": 0.9800412772988228,
"learning_rate": 1.5201048622789747e-05,
"loss": 0.7912,
"step": 624
},
{
"epoch": 0.6823144104803494,
"grad_norm": 1.0599364829335567,
"learning_rate": 1.5186067190007845e-05,
"loss": 0.8358,
"step": 625
},
{
"epoch": 0.6834061135371179,
"grad_norm": 1.0253996895351853,
"learning_rate": 1.5171069818200548e-05,
"loss": 0.8223,
"step": 626
},
{
"epoch": 0.6844978165938864,
"grad_norm": 1.0282350221027587,
"learning_rate": 1.5156056553461253e-05,
"loss": 0.8907,
"step": 627
},
{
"epoch": 0.6855895196506551,
"grad_norm": 1.0008290193674836,
"learning_rate": 1.5141027441932217e-05,
"loss": 0.8469,
"step": 628
},
{
"epoch": 0.6866812227074236,
"grad_norm": 1.0547981484639382,
"learning_rate": 1.5125982529804395e-05,
"loss": 0.8504,
"step": 629
},
{
"epoch": 0.6877729257641921,
"grad_norm": 1.0040270469408357,
"learning_rate": 1.5110921863317293e-05,
"loss": 0.8332,
"step": 630
},
{
"epoch": 0.6888646288209607,
"grad_norm": 1.00200157878387,
"learning_rate": 1.5095845488758856e-05,
"loss": 0.8376,
"step": 631
},
{
"epoch": 0.6899563318777293,
"grad_norm": 1.0293568144498755,
"learning_rate": 1.5080753452465296e-05,
"loss": 0.8075,
"step": 632
},
{
"epoch": 0.6910480349344978,
"grad_norm": 0.9591396587673909,
"learning_rate": 1.506564580082096e-05,
"loss": 0.7595,
"step": 633
},
{
"epoch": 0.6921397379912664,
"grad_norm": 0.9958045992810253,
"learning_rate": 1.5050522580258189e-05,
"loss": 0.8148,
"step": 634
},
{
"epoch": 0.6932314410480349,
"grad_norm": 0.9936759756949013,
"learning_rate": 1.5035383837257178e-05,
"loss": 0.8383,
"step": 635
},
{
"epoch": 0.6943231441048034,
"grad_norm": 1.0056214166764363,
"learning_rate": 1.502022961834582e-05,
"loss": 0.8701,
"step": 636
},
{
"epoch": 0.6954148471615721,
"grad_norm": 0.9796346831092025,
"learning_rate": 1.5005059970099585e-05,
"loss": 0.7997,
"step": 637
},
{
"epoch": 0.6965065502183406,
"grad_norm": 1.0131184460971026,
"learning_rate": 1.498987493914135e-05,
"loss": 0.8516,
"step": 638
},
{
"epoch": 0.6975982532751092,
"grad_norm": 1.0026902211337145,
"learning_rate": 1.4974674572141286e-05,
"loss": 0.8475,
"step": 639
},
{
"epoch": 0.6986899563318777,
"grad_norm": 0.9965791230499381,
"learning_rate": 1.4959458915816681e-05,
"loss": 0.8906,
"step": 640
},
{
"epoch": 0.6997816593886463,
"grad_norm": 0.9896333605024169,
"learning_rate": 1.494422801693182e-05,
"loss": 0.8038,
"step": 641
},
{
"epoch": 0.7008733624454149,
"grad_norm": 0.9841965630548845,
"learning_rate": 1.4928981922297842e-05,
"loss": 0.8416,
"step": 642
},
{
"epoch": 0.7019650655021834,
"grad_norm": 0.9632646796327432,
"learning_rate": 1.4913720678772584e-05,
"loss": 0.848,
"step": 643
},
{
"epoch": 0.7030567685589519,
"grad_norm": 0.9903855092333864,
"learning_rate": 1.4898444333260436e-05,
"loss": 0.8272,
"step": 644
},
{
"epoch": 0.7041484716157205,
"grad_norm": 3.8909126325326695,
"learning_rate": 1.4883152932712218e-05,
"loss": 0.8821,
"step": 645
},
{
"epoch": 0.7052401746724891,
"grad_norm": 1.0038664828829962,
"learning_rate": 1.4867846524125e-05,
"loss": 0.8273,
"step": 646
},
{
"epoch": 0.7063318777292577,
"grad_norm": 1.024727549894026,
"learning_rate": 1.4852525154541999e-05,
"loss": 0.8376,
"step": 647
},
{
"epoch": 0.7074235807860262,
"grad_norm": 0.9943881502376688,
"learning_rate": 1.4837188871052399e-05,
"loss": 0.8115,
"step": 648
},
{
"epoch": 0.7085152838427947,
"grad_norm": 1.0008529422042372,
"learning_rate": 1.482183772079123e-05,
"loss": 0.7906,
"step": 649
},
{
"epoch": 0.7096069868995634,
"grad_norm": 1.0074406981986863,
"learning_rate": 1.4806471750939206e-05,
"loss": 0.8472,
"step": 650
},
{
"epoch": 0.7106986899563319,
"grad_norm": 1.10207681776356,
"learning_rate": 1.4791091008722593e-05,
"loss": 0.8346,
"step": 651
},
{
"epoch": 0.7117903930131004,
"grad_norm": 0.9495604250061768,
"learning_rate": 1.4775695541413063e-05,
"loss": 0.7643,
"step": 652
},
{
"epoch": 0.712882096069869,
"grad_norm": 0.9705689287173537,
"learning_rate": 1.4760285396327531e-05,
"loss": 0.8515,
"step": 653
},
{
"epoch": 0.7139737991266376,
"grad_norm": 0.9943072279076429,
"learning_rate": 1.4744860620828034e-05,
"loss": 0.8581,
"step": 654
},
{
"epoch": 0.7150655021834061,
"grad_norm": 0.9500837414179008,
"learning_rate": 1.472942126232158e-05,
"loss": 0.8083,
"step": 655
},
{
"epoch": 0.7161572052401747,
"grad_norm": 1.4508128085431165,
"learning_rate": 1.4713967368259981e-05,
"loss": 0.8757,
"step": 656
},
{
"epoch": 0.7172489082969432,
"grad_norm": 1.041912264343576,
"learning_rate": 1.469849898613973e-05,
"loss": 0.818,
"step": 657
},
{
"epoch": 0.7183406113537117,
"grad_norm": 1.0169681097871424,
"learning_rate": 1.4683016163501855e-05,
"loss": 0.8569,
"step": 658
},
{
"epoch": 0.7194323144104804,
"grad_norm": 0.9732885384099658,
"learning_rate": 1.4667518947931757e-05,
"loss": 0.8028,
"step": 659
},
{
"epoch": 0.7205240174672489,
"grad_norm": 1.2877849023116021,
"learning_rate": 1.4652007387059077e-05,
"loss": 0.8654,
"step": 660
},
{
"epoch": 0.7216157205240175,
"grad_norm": 1.0208222523058725,
"learning_rate": 1.4636481528557545e-05,
"loss": 0.8167,
"step": 661
},
{
"epoch": 0.722707423580786,
"grad_norm": 1.0154911130890327,
"learning_rate": 1.4620941420144828e-05,
"loss": 0.836,
"step": 662
},
{
"epoch": 0.7237991266375546,
"grad_norm": 0.9613442159161312,
"learning_rate": 1.4605387109582401e-05,
"loss": 0.8037,
"step": 663
},
{
"epoch": 0.7248908296943232,
"grad_norm": 0.9931794657816619,
"learning_rate": 1.4589818644675378e-05,
"loss": 0.8241,
"step": 664
},
{
"epoch": 0.7259825327510917,
"grad_norm": 0.9625916354044245,
"learning_rate": 1.4574236073272379e-05,
"loss": 0.8558,
"step": 665
},
{
"epoch": 0.7270742358078602,
"grad_norm": 1.0561559338430706,
"learning_rate": 1.4558639443265379e-05,
"loss": 0.8742,
"step": 666
},
{
"epoch": 0.7281659388646288,
"grad_norm": 0.9754063417758851,
"learning_rate": 1.4543028802589563e-05,
"loss": 0.7826,
"step": 667
},
{
"epoch": 0.7292576419213974,
"grad_norm": 1.009620594905663,
"learning_rate": 1.4527404199223173e-05,
"loss": 0.8012,
"step": 668
},
{
"epoch": 0.730349344978166,
"grad_norm": 1.038359143146164,
"learning_rate": 1.4511765681187364e-05,
"loss": 0.8892,
"step": 669
},
{
"epoch": 0.7314410480349345,
"grad_norm": 0.9782108925488974,
"learning_rate": 1.4496113296546068e-05,
"loss": 0.8092,
"step": 670
},
{
"epoch": 0.732532751091703,
"grad_norm": 0.9715884651492042,
"learning_rate": 1.4480447093405818e-05,
"loss": 0.8014,
"step": 671
},
{
"epoch": 0.7336244541484717,
"grad_norm": 1.007135724554551,
"learning_rate": 1.446476711991563e-05,
"loss": 0.841,
"step": 672
},
{
"epoch": 0.7347161572052402,
"grad_norm": 1.023383297219398,
"learning_rate": 1.4449073424266838e-05,
"loss": 0.8613,
"step": 673
},
{
"epoch": 0.7358078602620087,
"grad_norm": 0.9493751859457114,
"learning_rate": 1.443336605469295e-05,
"loss": 0.7889,
"step": 674
},
{
"epoch": 0.7368995633187773,
"grad_norm": 0.9955853874545515,
"learning_rate": 1.4417645059469498e-05,
"loss": 0.7985,
"step": 675
},
{
"epoch": 0.7379912663755459,
"grad_norm": 1.0282733377323727,
"learning_rate": 1.4401910486913892e-05,
"loss": 0.9445,
"step": 676
},
{
"epoch": 0.7390829694323144,
"grad_norm": 0.980370554149024,
"learning_rate": 1.4386162385385279e-05,
"loss": 0.8207,
"step": 677
},
{
"epoch": 0.740174672489083,
"grad_norm": 0.9763214279020056,
"learning_rate": 1.4370400803284374e-05,
"loss": 0.8163,
"step": 678
},
{
"epoch": 0.7412663755458515,
"grad_norm": 0.9963883849643623,
"learning_rate": 1.4354625789053328e-05,
"loss": 0.8186,
"step": 679
},
{
"epoch": 0.74235807860262,
"grad_norm": 1.019333715035063,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.8846,
"step": 680
},
{
"epoch": 0.7434497816593887,
"grad_norm": 1.077615669208836,
"learning_rate": 1.4323035658175704e-05,
"loss": 0.8719,
"step": 681
},
{
"epoch": 0.7445414847161572,
"grad_norm": 0.9937937975792568,
"learning_rate": 1.4307220638619244e-05,
"loss": 0.8384,
"step": 682
},
{
"epoch": 0.7456331877729258,
"grad_norm": 0.9956760675459271,
"learning_rate": 1.429139238111259e-05,
"loss": 0.8223,
"step": 683
},
{
"epoch": 0.7467248908296943,
"grad_norm": 1.0534388480179164,
"learning_rate": 1.4275550934302822e-05,
"loss": 0.8904,
"step": 684
},
{
"epoch": 0.7478165938864629,
"grad_norm": 0.9891840917951243,
"learning_rate": 1.425969634687755e-05,
"loss": 0.8517,
"step": 685
},
{
"epoch": 0.7489082969432315,
"grad_norm": 0.9724996527026983,
"learning_rate": 1.4243828667564767e-05,
"loss": 0.8238,
"step": 686
},
{
"epoch": 0.75,
"grad_norm": 1.0032474069614845,
"learning_rate": 1.4227947945132713e-05,
"loss": 0.7726,
"step": 687
},
{
"epoch": 0.7510917030567685,
"grad_norm": 0.9752454481407052,
"learning_rate": 1.4212054228389712e-05,
"loss": 0.8083,
"step": 688
},
{
"epoch": 0.7521834061135371,
"grad_norm": 0.9852688832956603,
"learning_rate": 1.4196147566184015e-05,
"loss": 0.7867,
"step": 689
},
{
"epoch": 0.7532751091703057,
"grad_norm": 0.9922343292070561,
"learning_rate": 1.4180228007403676e-05,
"loss": 0.7682,
"step": 690
},
{
"epoch": 0.7543668122270742,
"grad_norm": 1.0006760093721123,
"learning_rate": 1.4164295600976375e-05,
"loss": 0.8416,
"step": 691
},
{
"epoch": 0.7554585152838428,
"grad_norm": 1.0692995514756525,
"learning_rate": 1.4148350395869279e-05,
"loss": 0.8199,
"step": 692
},
{
"epoch": 0.7565502183406113,
"grad_norm": 1.0115666503549918,
"learning_rate": 1.41323924410889e-05,
"loss": 0.8207,
"step": 693
},
{
"epoch": 0.75764192139738,
"grad_norm": 0.9985469312353209,
"learning_rate": 1.4116421785680923e-05,
"loss": 0.8245,
"step": 694
},
{
"epoch": 0.7587336244541485,
"grad_norm": 1.0816992231207874,
"learning_rate": 1.4100438478730074e-05,
"loss": 0.8371,
"step": 695
},
{
"epoch": 0.759825327510917,
"grad_norm": 1.0649304459633577,
"learning_rate": 1.4084442569359964e-05,
"loss": 0.8275,
"step": 696
},
{
"epoch": 0.7609170305676856,
"grad_norm": 1.003656248934757,
"learning_rate": 1.406843410673293e-05,
"loss": 0.8172,
"step": 697
},
{
"epoch": 0.7620087336244541,
"grad_norm": 1.0822301460173938,
"learning_rate": 1.4052413140049898e-05,
"loss": 0.8934,
"step": 698
},
{
"epoch": 0.7631004366812227,
"grad_norm": 1.055975845987622,
"learning_rate": 1.4036379718550225e-05,
"loss": 0.845,
"step": 699
},
{
"epoch": 0.7641921397379913,
"grad_norm": 1.0606490574993288,
"learning_rate": 1.4020333891511536e-05,
"loss": 0.7901,
"step": 700
},
{
"epoch": 0.7652838427947598,
"grad_norm": 1.0385150349981462,
"learning_rate": 1.4004275708249595e-05,
"loss": 0.8014,
"step": 701
},
{
"epoch": 0.7663755458515283,
"grad_norm": 0.9881125405362946,
"learning_rate": 1.3988205218118141e-05,
"loss": 0.789,
"step": 702
},
{
"epoch": 0.767467248908297,
"grad_norm": 1.0535630478031675,
"learning_rate": 1.3972122470508726e-05,
"loss": 0.7994,
"step": 703
},
{
"epoch": 0.7685589519650655,
"grad_norm": 1.0448192241712306,
"learning_rate": 1.395602751485059e-05,
"loss": 0.756,
"step": 704
},
{
"epoch": 0.769650655021834,
"grad_norm": 1.0813049250435902,
"learning_rate": 1.3939920400610483e-05,
"loss": 0.8495,
"step": 705
},
{
"epoch": 0.7707423580786026,
"grad_norm": 1.0177428362399865,
"learning_rate": 1.3923801177292529e-05,
"loss": 0.7897,
"step": 706
},
{
"epoch": 0.7718340611353712,
"grad_norm": 1.0840063917332026,
"learning_rate": 1.3907669894438064e-05,
"loss": 0.8305,
"step": 707
},
{
"epoch": 0.7729257641921398,
"grad_norm": 0.9788916341571317,
"learning_rate": 1.3891526601625492e-05,
"loss": 0.8272,
"step": 708
},
{
"epoch": 0.7740174672489083,
"grad_norm": 0.9777606860616397,
"learning_rate": 1.3875371348470129e-05,
"loss": 0.8552,
"step": 709
},
{
"epoch": 0.7751091703056768,
"grad_norm": 1.0397782609951218,
"learning_rate": 1.3859204184624047e-05,
"loss": 0.8109,
"step": 710
},
{
"epoch": 0.7762008733624454,
"grad_norm": 1.0649447413206765,
"learning_rate": 1.3843025159775924e-05,
"loss": 0.7934,
"step": 711
},
{
"epoch": 0.777292576419214,
"grad_norm": 1.7969971151980717,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.8874,
"step": 712
},
{
"epoch": 0.7783842794759825,
"grad_norm": 1.033106462716774,
"learning_rate": 1.3810631726010405e-05,
"loss": 0.8803,
"step": 713
},
{
"epoch": 0.7794759825327511,
"grad_norm": 0.9897248814519977,
"learning_rate": 1.3794417416652027e-05,
"loss": 0.7928,
"step": 714
},
{
"epoch": 0.7805676855895196,
"grad_norm": 0.9478712783580292,
"learning_rate": 1.3778191445409341e-05,
"loss": 0.7599,
"step": 715
},
{
"epoch": 0.7816593886462883,
"grad_norm": 0.9953969385209082,
"learning_rate": 1.3761953862151773e-05,
"loss": 0.7623,
"step": 716
},
{
"epoch": 0.7827510917030568,
"grad_norm": 1.0020597019618507,
"learning_rate": 1.3745704716784429e-05,
"loss": 0.8438,
"step": 717
},
{
"epoch": 0.7838427947598253,
"grad_norm": 0.9726051950466007,
"learning_rate": 1.3729444059247954e-05,
"loss": 0.8263,
"step": 718
},
{
"epoch": 0.7849344978165939,
"grad_norm": 0.9722458304302996,
"learning_rate": 1.3713171939518378e-05,
"loss": 0.834,
"step": 719
},
{
"epoch": 0.7860262008733624,
"grad_norm": 0.9944927083101299,
"learning_rate": 1.3696888407606952e-05,
"loss": 0.8046,
"step": 720
},
{
"epoch": 0.787117903930131,
"grad_norm": 0.9894755603020594,
"learning_rate": 1.3680593513560006e-05,
"loss": 0.8194,
"step": 721
},
{
"epoch": 0.7882096069868996,
"grad_norm": 0.9922031547192685,
"learning_rate": 1.3664287307458794e-05,
"loss": 0.7997,
"step": 722
},
{
"epoch": 0.7893013100436681,
"grad_norm": 0.9985883461042631,
"learning_rate": 1.3647969839419335e-05,
"loss": 0.7659,
"step": 723
},
{
"epoch": 0.7903930131004366,
"grad_norm": 0.986557355767747,
"learning_rate": 1.3631641159592253e-05,
"loss": 0.8137,
"step": 724
},
{
"epoch": 0.7914847161572053,
"grad_norm": 0.9830438907910151,
"learning_rate": 1.3615301318162635e-05,
"loss": 0.7841,
"step": 725
},
{
"epoch": 0.7925764192139738,
"grad_norm": 1.001586575147818,
"learning_rate": 1.3598950365349884e-05,
"loss": 0.8088,
"step": 726
},
{
"epoch": 0.7936681222707423,
"grad_norm": 1.0781375127128376,
"learning_rate": 1.3582588351407537e-05,
"loss": 0.8293,
"step": 727
},
{
"epoch": 0.7947598253275109,
"grad_norm": 1.1457870849091742,
"learning_rate": 1.3566215326623131e-05,
"loss": 0.8932,
"step": 728
},
{
"epoch": 0.7958515283842795,
"grad_norm": 1.0582361940933662,
"learning_rate": 1.3549831341318052e-05,
"loss": 0.8197,
"step": 729
},
{
"epoch": 0.7969432314410481,
"grad_norm": 1.0009241462708363,
"learning_rate": 1.353343644584736e-05,
"loss": 0.818,
"step": 730
},
{
"epoch": 0.7980349344978166,
"grad_norm": 1.0258247342217293,
"learning_rate": 1.3517030690599662e-05,
"loss": 0.8336,
"step": 731
},
{
"epoch": 0.7991266375545851,
"grad_norm": 1.0483528617887998,
"learning_rate": 1.3500614125996924e-05,
"loss": 0.8518,
"step": 732
},
{
"epoch": 0.8002183406113537,
"grad_norm": 1.9936218017729914,
"learning_rate": 1.3484186802494346e-05,
"loss": 1.0314,
"step": 733
},
{
"epoch": 0.8013100436681223,
"grad_norm": 1.0760885690226825,
"learning_rate": 1.3467748770580193e-05,
"loss": 0.8428,
"step": 734
},
{
"epoch": 0.8024017467248908,
"grad_norm": 1.0295274834541253,
"learning_rate": 1.3451300080775636e-05,
"loss": 0.8959,
"step": 735
},
{
"epoch": 0.8034934497816594,
"grad_norm": 1.0268868689300337,
"learning_rate": 1.3434840783634611e-05,
"loss": 0.8605,
"step": 736
},
{
"epoch": 0.8045851528384279,
"grad_norm": 0.9868758942660906,
"learning_rate": 1.341837092974365e-05,
"loss": 0.7949,
"step": 737
},
{
"epoch": 0.8056768558951966,
"grad_norm": 1.0273404591070299,
"learning_rate": 1.3401890569721725e-05,
"loss": 0.8387,
"step": 738
},
{
"epoch": 0.8067685589519651,
"grad_norm": 0.9852739340152835,
"learning_rate": 1.3385399754220108e-05,
"loss": 0.822,
"step": 739
},
{
"epoch": 0.8078602620087336,
"grad_norm": 1.0097657303789613,
"learning_rate": 1.3368898533922202e-05,
"loss": 0.8445,
"step": 740
},
{
"epoch": 0.8089519650655022,
"grad_norm": 1.0349678711665362,
"learning_rate": 1.3352386959543384e-05,
"loss": 0.7618,
"step": 741
},
{
"epoch": 0.8100436681222707,
"grad_norm": 0.9858077303095648,
"learning_rate": 1.3335865081830858e-05,
"loss": 0.8012,
"step": 742
},
{
"epoch": 0.8111353711790393,
"grad_norm": 1.027171394985511,
"learning_rate": 1.3319332951563495e-05,
"loss": 0.8858,
"step": 743
},
{
"epoch": 0.8122270742358079,
"grad_norm": 1.0435280920584542,
"learning_rate": 1.3302790619551673e-05,
"loss": 0.819,
"step": 744
},
{
"epoch": 0.8133187772925764,
"grad_norm": 1.0102996172054506,
"learning_rate": 1.3286238136637127e-05,
"loss": 0.8205,
"step": 745
},
{
"epoch": 0.8144104803493449,
"grad_norm": 0.9971006094968926,
"learning_rate": 1.3269675553692787e-05,
"loss": 0.8174,
"step": 746
},
{
"epoch": 0.8155021834061136,
"grad_norm": 0.9767390708268943,
"learning_rate": 1.3253102921622632e-05,
"loss": 0.7789,
"step": 747
},
{
"epoch": 0.8165938864628821,
"grad_norm": 1.0426302680146287,
"learning_rate": 1.3236520291361516e-05,
"loss": 0.8146,
"step": 748
},
{
"epoch": 0.8176855895196506,
"grad_norm": 1.0470627534691905,
"learning_rate": 1.3219927713875032e-05,
"loss": 0.8336,
"step": 749
},
{
"epoch": 0.8187772925764192,
"grad_norm": 0.9878917969348322,
"learning_rate": 1.3203325240159337e-05,
"loss": 0.7707,
"step": 750
},
{
"epoch": 0.8198689956331878,
"grad_norm": 1.0293950449242155,
"learning_rate": 1.3186712921241009e-05,
"loss": 0.8195,
"step": 751
},
{
"epoch": 0.8209606986899564,
"grad_norm": 0.9880792790798334,
"learning_rate": 1.3170090808176883e-05,
"loss": 0.8138,
"step": 752
},
{
"epoch": 0.8220524017467249,
"grad_norm": 0.9819090021005722,
"learning_rate": 1.315345895205389e-05,
"loss": 0.8292,
"step": 753
},
{
"epoch": 0.8231441048034934,
"grad_norm": 0.952640449487676,
"learning_rate": 1.3136817403988918e-05,
"loss": 0.7759,
"step": 754
},
{
"epoch": 0.824235807860262,
"grad_norm": 1.015292142708617,
"learning_rate": 1.3120166215128627e-05,
"loss": 0.8593,
"step": 755
},
{
"epoch": 0.8253275109170306,
"grad_norm": 1.0326420470098143,
"learning_rate": 1.310350543664932e-05,
"loss": 0.8247,
"step": 756
},
{
"epoch": 0.8264192139737991,
"grad_norm": 0.9627929915911005,
"learning_rate": 1.308683511975677e-05,
"loss": 0.7897,
"step": 757
},
{
"epoch": 0.8275109170305677,
"grad_norm": 0.9807703137353102,
"learning_rate": 1.307015531568606e-05,
"loss": 0.7854,
"step": 758
},
{
"epoch": 0.8286026200873362,
"grad_norm": 0.977008775189795,
"learning_rate": 1.305346607570144e-05,
"loss": 0.8258,
"step": 759
},
{
"epoch": 0.8296943231441049,
"grad_norm": 1.0039573606359118,
"learning_rate": 1.3036767451096148e-05,
"loss": 0.8637,
"step": 760
},
{
"epoch": 0.8307860262008734,
"grad_norm": 0.956921364153981,
"learning_rate": 1.3020059493192283e-05,
"loss": 0.7328,
"step": 761
},
{
"epoch": 0.8318777292576419,
"grad_norm": 0.9832469102700667,
"learning_rate": 1.3003342253340613e-05,
"loss": 0.8046,
"step": 762
},
{
"epoch": 0.8329694323144105,
"grad_norm": 0.9511120035637255,
"learning_rate": 1.298661578292044e-05,
"loss": 0.7811,
"step": 763
},
{
"epoch": 0.834061135371179,
"grad_norm": 1.0150460656045295,
"learning_rate": 1.2969880133339437e-05,
"loss": 0.8347,
"step": 764
},
{
"epoch": 0.8351528384279476,
"grad_norm": 4.232566704644336,
"learning_rate": 1.2953135356033486e-05,
"loss": 1.0034,
"step": 765
},
{
"epoch": 0.8362445414847162,
"grad_norm": 1.0186459299872372,
"learning_rate": 1.2936381502466524e-05,
"loss": 0.8563,
"step": 766
},
{
"epoch": 0.8373362445414847,
"grad_norm": 1.0010625201169578,
"learning_rate": 1.2919618624130381e-05,
"loss": 0.8281,
"step": 767
},
{
"epoch": 0.8384279475982532,
"grad_norm": 1.171817471878155,
"learning_rate": 1.2902846772544625e-05,
"loss": 0.8543,
"step": 768
},
{
"epoch": 0.8395196506550219,
"grad_norm": 1.0444982899549722,
"learning_rate": 1.2886065999256406e-05,
"loss": 0.8222,
"step": 769
},
{
"epoch": 0.8406113537117904,
"grad_norm": 1.0437254568671326,
"learning_rate": 1.2869276355840288e-05,
"loss": 0.8324,
"step": 770
},
{
"epoch": 0.8417030567685589,
"grad_norm": 0.996507947428093,
"learning_rate": 1.2852477893898101e-05,
"loss": 0.8251,
"step": 771
},
{
"epoch": 0.8427947598253275,
"grad_norm": 1.0479342824701492,
"learning_rate": 1.2835670665058779e-05,
"loss": 0.8522,
"step": 772
},
{
"epoch": 0.8438864628820961,
"grad_norm": 1.0318373901517308,
"learning_rate": 1.2818854720978198e-05,
"loss": 0.7771,
"step": 773
},
{
"epoch": 0.8449781659388647,
"grad_norm": 1.0112620173281517,
"learning_rate": 1.2802030113339016e-05,
"loss": 0.8526,
"step": 774
},
{
"epoch": 0.8460698689956332,
"grad_norm": 1.079577680085193,
"learning_rate": 1.2785196893850532e-05,
"loss": 0.8896,
"step": 775
},
{
"epoch": 0.8471615720524017,
"grad_norm": 0.9904092400676572,
"learning_rate": 1.2768355114248493e-05,
"loss": 0.7849,
"step": 776
},
{
"epoch": 0.8482532751091703,
"grad_norm": 0.991007035893447,
"learning_rate": 1.2751504826294971e-05,
"loss": 0.7736,
"step": 777
},
{
"epoch": 0.8493449781659389,
"grad_norm": 0.9944342662129599,
"learning_rate": 1.273464608177818e-05,
"loss": 0.7979,
"step": 778
},
{
"epoch": 0.8504366812227074,
"grad_norm": 1.018751149033696,
"learning_rate": 1.2717778932512333e-05,
"loss": 0.8291,
"step": 779
},
{
"epoch": 0.851528384279476,
"grad_norm": 0.992005577424338,
"learning_rate": 1.2700903430337456e-05,
"loss": 0.8151,
"step": 780
},
{
"epoch": 0.8526200873362445,
"grad_norm": 1.1485359617550743,
"learning_rate": 1.2684019627119267e-05,
"loss": 0.8273,
"step": 781
},
{
"epoch": 0.8537117903930131,
"grad_norm": 1.025382194009243,
"learning_rate": 1.2667127574748985e-05,
"loss": 0.773,
"step": 782
},
{
"epoch": 0.8548034934497817,
"grad_norm": 0.9822864486049122,
"learning_rate": 1.2650227325143192e-05,
"loss": 0.7843,
"step": 783
},
{
"epoch": 0.8558951965065502,
"grad_norm": 1.011756898798447,
"learning_rate": 1.2633318930243647e-05,
"loss": 0.8574,
"step": 784
},
{
"epoch": 0.8569868995633187,
"grad_norm": 0.9820886451357439,
"learning_rate": 1.2616402442017168e-05,
"loss": 0.7239,
"step": 785
},
{
"epoch": 0.8580786026200873,
"grad_norm": 0.9654465190740715,
"learning_rate": 1.2599477912455425e-05,
"loss": 0.8002,
"step": 786
},
{
"epoch": 0.8591703056768559,
"grad_norm": 0.9474348684874027,
"learning_rate": 1.258254539357481e-05,
"loss": 0.7862,
"step": 787
},
{
"epoch": 0.8602620087336245,
"grad_norm": 1.0597038987083574,
"learning_rate": 1.2565604937416267e-05,
"loss": 0.794,
"step": 788
},
{
"epoch": 0.861353711790393,
"grad_norm": 1.0134468504898015,
"learning_rate": 1.2548656596045147e-05,
"loss": 0.7747,
"step": 789
},
{
"epoch": 0.8624454148471615,
"grad_norm": 1.0170037116131896,
"learning_rate": 1.253170042155102e-05,
"loss": 0.8248,
"step": 790
},
{
"epoch": 0.8635371179039302,
"grad_norm": 1.0182553228876519,
"learning_rate": 1.2514736466047539e-05,
"loss": 0.8153,
"step": 791
},
{
"epoch": 0.8646288209606987,
"grad_norm": 0.9682427596515195,
"learning_rate": 1.249776478167227e-05,
"loss": 0.7605,
"step": 792
},
{
"epoch": 0.8657205240174672,
"grad_norm": 0.9766360517928412,
"learning_rate": 1.2480785420586532e-05,
"loss": 0.8433,
"step": 793
},
{
"epoch": 0.8668122270742358,
"grad_norm": 0.9668797075638123,
"learning_rate": 1.2463798434975239e-05,
"loss": 0.8318,
"step": 794
},
{
"epoch": 0.8679039301310044,
"grad_norm": 0.9860295829191986,
"learning_rate": 1.2446803877046734e-05,
"loss": 0.8268,
"step": 795
},
{
"epoch": 0.868995633187773,
"grad_norm": 1.0477866682225716,
"learning_rate": 1.242980179903264e-05,
"loss": 0.8658,
"step": 796
},
{
"epoch": 0.8700873362445415,
"grad_norm": 0.9789001248534538,
"learning_rate": 1.2412792253187693e-05,
"loss": 0.8116,
"step": 797
},
{
"epoch": 0.87117903930131,
"grad_norm": 1.030161845340474,
"learning_rate": 1.239577529178957e-05,
"loss": 0.7731,
"step": 798
},
{
"epoch": 0.8722707423580786,
"grad_norm": 1.026800420596272,
"learning_rate": 1.2378750967138752e-05,
"loss": 0.8211,
"step": 799
},
{
"epoch": 0.8733624454148472,
"grad_norm": 1.0021297856771973,
"learning_rate": 1.2361719331558346e-05,
"loss": 0.7555,
"step": 800
},
{
"epoch": 0.8744541484716157,
"grad_norm": 1.0105829808434217,
"learning_rate": 1.2344680437393923e-05,
"loss": 0.7984,
"step": 801
},
{
"epoch": 0.8755458515283843,
"grad_norm": 1.022887611025188,
"learning_rate": 1.2327634337013366e-05,
"loss": 0.8456,
"step": 802
},
{
"epoch": 0.8766375545851528,
"grad_norm": 1.0160214074392968,
"learning_rate": 1.2310581082806713e-05,
"loss": 0.8015,
"step": 803
},
{
"epoch": 0.8777292576419214,
"grad_norm": 1.0275765697851287,
"learning_rate": 1.229352072718598e-05,
"loss": 0.8558,
"step": 804
},
{
"epoch": 0.87882096069869,
"grad_norm": 0.9940591580282742,
"learning_rate": 1.2276453322585012e-05,
"loss": 0.791,
"step": 805
},
{
"epoch": 0.8799126637554585,
"grad_norm": 0.9889975349482799,
"learning_rate": 1.225937892145932e-05,
"loss": 0.7756,
"step": 806
},
{
"epoch": 0.881004366812227,
"grad_norm": 0.9807345351008925,
"learning_rate": 1.2242297576285911e-05,
"loss": 0.7979,
"step": 807
},
{
"epoch": 0.8820960698689956,
"grad_norm": 1.0503768144195273,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.8143,
"step": 808
},
{
"epoch": 0.8831877729257642,
"grad_norm": 1.046964193058164,
"learning_rate": 1.2208114263810555e-05,
"loss": 0.8826,
"step": 809
},
{
"epoch": 0.8842794759825328,
"grad_norm": 1.0150491417761265,
"learning_rate": 1.2191012401568698e-05,
"loss": 0.8423,
"step": 810
},
{
"epoch": 0.8853711790393013,
"grad_norm": 1.049591548870281,
"learning_rate": 1.2173903805398986e-05,
"loss": 0.8366,
"step": 811
},
{
"epoch": 0.8864628820960698,
"grad_norm": 1.0543208147085164,
"learning_rate": 1.2156788527883524e-05,
"loss": 0.8588,
"step": 812
},
{
"epoch": 0.8875545851528385,
"grad_norm": 0.9909638513378916,
"learning_rate": 1.213966662162496e-05,
"loss": 0.7952,
"step": 813
},
{
"epoch": 0.888646288209607,
"grad_norm": 0.9678733069500823,
"learning_rate": 1.2122538139246308e-05,
"loss": 0.8128,
"step": 814
},
{
"epoch": 0.8897379912663755,
"grad_norm": 1.0706679449248189,
"learning_rate": 1.2105403133390797e-05,
"loss": 0.8274,
"step": 815
},
{
"epoch": 0.8908296943231441,
"grad_norm": 1.053814874477726,
"learning_rate": 1.20882616567217e-05,
"loss": 0.802,
"step": 816
},
{
"epoch": 0.8919213973799127,
"grad_norm": 1.0364797166098356,
"learning_rate": 1.2071113761922187e-05,
"loss": 0.8035,
"step": 817
},
{
"epoch": 0.8930131004366813,
"grad_norm": 1.0240440492193246,
"learning_rate": 1.2053959501695144e-05,
"loss": 0.7751,
"step": 818
},
{
"epoch": 0.8941048034934498,
"grad_norm": 1.0194760561807408,
"learning_rate": 1.203679892876303e-05,
"loss": 0.8562,
"step": 819
},
{
"epoch": 0.8951965065502183,
"grad_norm": 1.0078471362915395,
"learning_rate": 1.2019632095867697e-05,
"loss": 0.8255,
"step": 820
},
{
"epoch": 0.8962882096069869,
"grad_norm": 0.9803764017984781,
"learning_rate": 1.2002459055770244e-05,
"loss": 0.8506,
"step": 821
},
{
"epoch": 0.8973799126637555,
"grad_norm": 1.01785410579334,
"learning_rate": 1.1985279861250839e-05,
"loss": 0.7918,
"step": 822
},
{
"epoch": 0.898471615720524,
"grad_norm": 0.9660083715427997,
"learning_rate": 1.1968094565108573e-05,
"loss": 0.7601,
"step": 823
},
{
"epoch": 0.8995633187772926,
"grad_norm": 0.9794396166041377,
"learning_rate": 1.1950903220161286e-05,
"loss": 0.8389,
"step": 824
},
{
"epoch": 0.9006550218340611,
"grad_norm": 0.9817519396770674,
"learning_rate": 1.1933705879245408e-05,
"loss": 0.8082,
"step": 825
},
{
"epoch": 0.9017467248908297,
"grad_norm": 0.984940309641793,
"learning_rate": 1.1916502595215799e-05,
"loss": 0.7597,
"step": 826
},
{
"epoch": 0.9028384279475983,
"grad_norm": 0.9481098363542785,
"learning_rate": 1.189929342094559e-05,
"loss": 0.7861,
"step": 827
},
{
"epoch": 0.9039301310043668,
"grad_norm": 0.9198743597023081,
"learning_rate": 1.1882078409326003e-05,
"loss": 0.7555,
"step": 828
},
{
"epoch": 0.9050218340611353,
"grad_norm": 0.9797553102750473,
"learning_rate": 1.1864857613266212e-05,
"loss": 0.7742,
"step": 829
},
{
"epoch": 0.9061135371179039,
"grad_norm": 0.9773485584878683,
"learning_rate": 1.1847631085693159e-05,
"loss": 0.8229,
"step": 830
},
{
"epoch": 0.9072052401746725,
"grad_norm": 0.9578374194364608,
"learning_rate": 1.1830398879551412e-05,
"loss": 0.8386,
"step": 831
},
{
"epoch": 0.9082969432314411,
"grad_norm": 0.9958895257948418,
"learning_rate": 1.1813161047802986e-05,
"loss": 0.8422,
"step": 832
},
{
"epoch": 0.9093886462882096,
"grad_norm": 0.9826362002668748,
"learning_rate": 1.1795917643427179e-05,
"loss": 0.8115,
"step": 833
},
{
"epoch": 0.9104803493449781,
"grad_norm": 0.9887746894138071,
"learning_rate": 1.1778668719420436e-05,
"loss": 0.8156,
"step": 834
},
{
"epoch": 0.9115720524017468,
"grad_norm": 1.0301054410493677,
"learning_rate": 1.1761414328796147e-05,
"loss": 0.8829,
"step": 835
},
{
"epoch": 0.9126637554585153,
"grad_norm": 0.9789625559772259,
"learning_rate": 1.174415452458451e-05,
"loss": 0.8057,
"step": 836
},
{
"epoch": 0.9137554585152838,
"grad_norm": 0.977980282560095,
"learning_rate": 1.172688935983236e-05,
"loss": 0.8235,
"step": 837
},
{
"epoch": 0.9148471615720524,
"grad_norm": 0.98941084345582,
"learning_rate": 1.1709618887603013e-05,
"loss": 0.8218,
"step": 838
},
{
"epoch": 0.915938864628821,
"grad_norm": 0.987254501811818,
"learning_rate": 1.1692343160976092e-05,
"loss": 0.8036,
"step": 839
},
{
"epoch": 0.9170305676855895,
"grad_norm": 1.13642252086255,
"learning_rate": 1.1675062233047365e-05,
"loss": 0.871,
"step": 840
},
{
"epoch": 0.9181222707423581,
"grad_norm": 0.9883352389103331,
"learning_rate": 1.1657776156928598e-05,
"loss": 0.8463,
"step": 841
},
{
"epoch": 0.9192139737991266,
"grad_norm": 1.0245691753057746,
"learning_rate": 1.1640484985747365e-05,
"loss": 0.826,
"step": 842
},
{
"epoch": 0.9203056768558951,
"grad_norm": 1.071710141972311,
"learning_rate": 1.162318877264691e-05,
"loss": 0.8027,
"step": 843
},
{
"epoch": 0.9213973799126638,
"grad_norm": 2.622690089986299,
"learning_rate": 1.1605887570785972e-05,
"loss": 0.8753,
"step": 844
},
{
"epoch": 0.9224890829694323,
"grad_norm": 1.03397508878459,
"learning_rate": 1.1588581433338614e-05,
"loss": 0.8267,
"step": 845
},
{
"epoch": 0.9235807860262009,
"grad_norm": 1.004890691522863,
"learning_rate": 1.1571270413494082e-05,
"loss": 0.8158,
"step": 846
},
{
"epoch": 0.9246724890829694,
"grad_norm": 2.3399569808932226,
"learning_rate": 1.1553954564456616e-05,
"loss": 0.8807,
"step": 847
},
{
"epoch": 0.925764192139738,
"grad_norm": 2.523967070686754,
"learning_rate": 1.1536633939445302e-05,
"loss": 0.8555,
"step": 848
},
{
"epoch": 0.9268558951965066,
"grad_norm": 1.1062411624065849,
"learning_rate": 1.1519308591693905e-05,
"loss": 0.8523,
"step": 849
},
{
"epoch": 0.9279475982532751,
"grad_norm": 0.9804487779384249,
"learning_rate": 1.150197857445071e-05,
"loss": 0.8005,
"step": 850
},
{
"epoch": 0.9290393013100436,
"grad_norm": 0.9696684979015404,
"learning_rate": 1.148464394097834e-05,
"loss": 0.7745,
"step": 851
},
{
"epoch": 0.9301310043668122,
"grad_norm": 0.9925354155786811,
"learning_rate": 1.1467304744553618e-05,
"loss": 0.7371,
"step": 852
},
{
"epoch": 0.9312227074235808,
"grad_norm": 1.0833318800332936,
"learning_rate": 1.144996103846739e-05,
"loss": 0.8728,
"step": 853
},
{
"epoch": 0.9323144104803494,
"grad_norm": 0.9732649990140317,
"learning_rate": 1.1432612876024351e-05,
"loss": 0.8192,
"step": 854
},
{
"epoch": 0.9334061135371179,
"grad_norm": 1.0096302801535977,
"learning_rate": 1.141526031054291e-05,
"loss": 0.7698,
"step": 855
},
{
"epoch": 0.9344978165938864,
"grad_norm": 1.0421365871395685,
"learning_rate": 1.1397903395354996e-05,
"loss": 0.7518,
"step": 856
},
{
"epoch": 0.9355895196506551,
"grad_norm": 0.9723305573161944,
"learning_rate": 1.1380542183805908e-05,
"loss": 0.8159,
"step": 857
},
{
"epoch": 0.9366812227074236,
"grad_norm": 0.9622830874165507,
"learning_rate": 1.1363176729254147e-05,
"loss": 0.8135,
"step": 858
},
{
"epoch": 0.9377729257641921,
"grad_norm": 1.0340895983705864,
"learning_rate": 1.1345807085071263e-05,
"loss": 0.8202,
"step": 859
},
{
"epoch": 0.9388646288209607,
"grad_norm": 0.9628961444901861,
"learning_rate": 1.132843330464168e-05,
"loss": 0.8002,
"step": 860
},
{
"epoch": 0.9399563318777293,
"grad_norm": 0.9569676804323826,
"learning_rate": 1.1311055441362525e-05,
"loss": 0.7781,
"step": 861
},
{
"epoch": 0.9410480349344978,
"grad_norm": 0.9796557229038654,
"learning_rate": 1.1293673548643492e-05,
"loss": 0.8113,
"step": 862
},
{
"epoch": 0.9421397379912664,
"grad_norm": 1.070015316978831,
"learning_rate": 1.1276287679906638e-05,
"loss": 0.8349,
"step": 863
},
{
"epoch": 0.9432314410480349,
"grad_norm": 0.9965597942630992,
"learning_rate": 1.1258897888586256e-05,
"loss": 0.7928,
"step": 864
},
{
"epoch": 0.9443231441048034,
"grad_norm": 0.982620165765039,
"learning_rate": 1.1241504228128685e-05,
"loss": 0.8491,
"step": 865
},
{
"epoch": 0.9454148471615721,
"grad_norm": 1.024082714252077,
"learning_rate": 1.1224106751992164e-05,
"loss": 0.8295,
"step": 866
},
{
"epoch": 0.9465065502183406,
"grad_norm": 1.0165057189321018,
"learning_rate": 1.1206705513646652e-05,
"loss": 0.8505,
"step": 867
},
{
"epoch": 0.9475982532751092,
"grad_norm": 0.9931196700818016,
"learning_rate": 1.118930056657367e-05,
"loss": 0.821,
"step": 868
},
{
"epoch": 0.9486899563318777,
"grad_norm": 1.0761172910671928,
"learning_rate": 1.1171891964266149e-05,
"loss": 0.8206,
"step": 869
},
{
"epoch": 0.9497816593886463,
"grad_norm": 1.0222944988488392,
"learning_rate": 1.1154479760228242e-05,
"loss": 0.8145,
"step": 870
},
{
"epoch": 0.9508733624454149,
"grad_norm": 0.9996391931238009,
"learning_rate": 1.1137064007975176e-05,
"loss": 0.8087,
"step": 871
},
{
"epoch": 0.9519650655021834,
"grad_norm": 1.0759040754869575,
"learning_rate": 1.1119644761033079e-05,
"loss": 0.8263,
"step": 872
},
{
"epoch": 0.9530567685589519,
"grad_norm": 1.0149901896888367,
"learning_rate": 1.1102222072938832e-05,
"loss": 0.84,
"step": 873
},
{
"epoch": 0.9541484716157205,
"grad_norm": 0.9758678103754201,
"learning_rate": 1.108479599723988e-05,
"loss": 0.7788,
"step": 874
},
{
"epoch": 0.9552401746724891,
"grad_norm": 0.9779263401794293,
"learning_rate": 1.1067366587494082e-05,
"loss": 0.8187,
"step": 875
},
{
"epoch": 0.9563318777292577,
"grad_norm": 0.9496722534810037,
"learning_rate": 1.1049933897269547e-05,
"loss": 0.7601,
"step": 876
},
{
"epoch": 0.9574235807860262,
"grad_norm": 1.057087300105753,
"learning_rate": 1.1032497980144465e-05,
"loss": 0.8634,
"step": 877
},
{
"epoch": 0.9585152838427947,
"grad_norm": 0.9601016652262959,
"learning_rate": 1.1015058889706942e-05,
"loss": 0.7608,
"step": 878
},
{
"epoch": 0.9596069868995634,
"grad_norm": 1.0298126653534947,
"learning_rate": 1.0997616679554842e-05,
"loss": 0.8092,
"step": 879
},
{
"epoch": 0.9606986899563319,
"grad_norm": 0.9596604740539253,
"learning_rate": 1.098017140329561e-05,
"loss": 0.7979,
"step": 880
},
{
"epoch": 0.9617903930131004,
"grad_norm": 1.079089092837934,
"learning_rate": 1.0962723114546116e-05,
"loss": 0.8252,
"step": 881
},
{
"epoch": 0.962882096069869,
"grad_norm": 1.0239999052193924,
"learning_rate": 1.0945271866932496e-05,
"loss": 0.7865,
"step": 882
},
{
"epoch": 0.9639737991266376,
"grad_norm": 1.0125502527542158,
"learning_rate": 1.0927817714089975e-05,
"loss": 0.8376,
"step": 883
},
{
"epoch": 0.9650655021834061,
"grad_norm": 0.951675880376376,
"learning_rate": 1.0910360709662701e-05,
"loss": 0.8116,
"step": 884
},
{
"epoch": 0.9661572052401747,
"grad_norm": 0.9463982277911809,
"learning_rate": 1.08929009073036e-05,
"loss": 0.7657,
"step": 885
},
{
"epoch": 0.9672489082969432,
"grad_norm": 0.9581965887711609,
"learning_rate": 1.087543836067418e-05,
"loss": 0.7924,
"step": 886
},
{
"epoch": 0.9683406113537117,
"grad_norm": 0.9689688481698168,
"learning_rate": 1.0857973123444401e-05,
"loss": 0.8093,
"step": 887
},
{
"epoch": 0.9694323144104804,
"grad_norm": 0.973362093900798,
"learning_rate": 1.0840505249292477e-05,
"loss": 0.8037,
"step": 888
},
{
"epoch": 0.9705240174672489,
"grad_norm": 1.0400643723504706,
"learning_rate": 1.0823034791904734e-05,
"loss": 0.7994,
"step": 889
},
{
"epoch": 0.9716157205240175,
"grad_norm": 1.0109394051155618,
"learning_rate": 1.0805561804975443e-05,
"loss": 0.841,
"step": 890
},
{
"epoch": 0.972707423580786,
"grad_norm": 1.0049314560152705,
"learning_rate": 1.0788086342206636e-05,
"loss": 0.8279,
"step": 891
},
{
"epoch": 0.9737991266375546,
"grad_norm": 1.0140994197877482,
"learning_rate": 1.0770608457307965e-05,
"loss": 0.7933,
"step": 892
},
{
"epoch": 0.9748908296943232,
"grad_norm": 1.024185021737908,
"learning_rate": 1.0753128203996519e-05,
"loss": 0.826,
"step": 893
},
{
"epoch": 0.9759825327510917,
"grad_norm": 1.0075028915925779,
"learning_rate": 1.0735645635996676e-05,
"loss": 0.7969,
"step": 894
},
{
"epoch": 0.9770742358078602,
"grad_norm": 0.9685780901544102,
"learning_rate": 1.0718160807039916e-05,
"loss": 0.7937,
"step": 895
},
{
"epoch": 0.9781659388646288,
"grad_norm": 0.9491355156142823,
"learning_rate": 1.0700673770864673e-05,
"loss": 0.805,
"step": 896
},
{
"epoch": 0.9792576419213974,
"grad_norm": 0.9616449107261509,
"learning_rate": 1.068318458121617e-05,
"loss": 0.7931,
"step": 897
},
{
"epoch": 0.980349344978166,
"grad_norm": 0.9705835266119947,
"learning_rate": 1.0665693291846245e-05,
"loss": 0.8081,
"step": 898
},
{
"epoch": 0.9814410480349345,
"grad_norm": 0.9739991330817586,
"learning_rate": 1.064819995651318e-05,
"loss": 0.8269,
"step": 899
},
{
"epoch": 0.982532751091703,
"grad_norm": 0.9772132359047561,
"learning_rate": 1.0630704628981561e-05,
"loss": 0.84,
"step": 900
},
{
"epoch": 0.9836244541484717,
"grad_norm": 1.013102781346369,
"learning_rate": 1.0613207363022086e-05,
"loss": 0.8002,
"step": 901
},
{
"epoch": 0.9847161572052402,
"grad_norm": 0.964046204980614,
"learning_rate": 1.0595708212411417e-05,
"loss": 0.7956,
"step": 902
},
{
"epoch": 0.9858078602620087,
"grad_norm": 1.022239204691434,
"learning_rate": 1.0578207230932e-05,
"loss": 0.9032,
"step": 903
},
{
"epoch": 0.9868995633187773,
"grad_norm": 1.0302684010352272,
"learning_rate": 1.0560704472371919e-05,
"loss": 0.784,
"step": 904
},
{
"epoch": 0.9879912663755459,
"grad_norm": 0.9521512924621471,
"learning_rate": 1.0543199990524711e-05,
"loss": 0.8158,
"step": 905
},
{
"epoch": 0.9890829694323144,
"grad_norm": 0.9862783654579674,
"learning_rate": 1.0525693839189215e-05,
"loss": 0.814,
"step": 906
},
{
"epoch": 0.990174672489083,
"grad_norm": 0.9441851015215522,
"learning_rate": 1.0508186072169391e-05,
"loss": 0.7525,
"step": 907
},
{
"epoch": 0.9912663755458515,
"grad_norm": 0.9629624125081184,
"learning_rate": 1.0490676743274181e-05,
"loss": 0.7342,
"step": 908
},
{
"epoch": 0.99235807860262,
"grad_norm": 0.9778492824734855,
"learning_rate": 1.0473165906317318e-05,
"loss": 0.8002,
"step": 909
},
{
"epoch": 0.9934497816593887,
"grad_norm": 0.9418112859440163,
"learning_rate": 1.0455653615117163e-05,
"loss": 0.7826,
"step": 910
},
{
"epoch": 0.9945414847161572,
"grad_norm": 1.0353237258763315,
"learning_rate": 1.0438139923496562e-05,
"loss": 0.8274,
"step": 911
},
{
"epoch": 0.9956331877729258,
"grad_norm": 1.0143499500261535,
"learning_rate": 1.0420624885282653e-05,
"loss": 0.8211,
"step": 912
},
{
"epoch": 0.9967248908296943,
"grad_norm": 0.9414306983920787,
"learning_rate": 1.0403108554306718e-05,
"loss": 0.7435,
"step": 913
},
{
"epoch": 0.9978165938864629,
"grad_norm": 0.9834124075925206,
"learning_rate": 1.0385590984404009e-05,
"loss": 0.76,
"step": 914
},
{
"epoch": 0.9989082969432315,
"grad_norm": 0.9224083288811961,
"learning_rate": 1.036807222941359e-05,
"loss": 0.7612,
"step": 915
},
{
"epoch": 1.0,
"grad_norm": 0.9438757968505777,
"learning_rate": 1.0350552343178164e-05,
"loss": 0.7441,
"step": 916
}
],
"logging_steps": 1,
"max_steps": 1832,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 458,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4205786854208307e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}