{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983193277310924,
"eval_steps": 500,
"global_step": 594,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005042016806722689,
"grad_norm": 9.45597365399993,
"learning_rate": 0.0,
"loss": 1.7242,
"step": 1
},
{
"epoch": 0.010084033613445379,
"grad_norm": 9.218921810032594,
"learning_rate": 1.6666666666666668e-07,
"loss": 1.9603,
"step": 2
},
{
"epoch": 0.015126050420168067,
"grad_norm": 9.19364568473009,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.7815,
"step": 3
},
{
"epoch": 0.020168067226890758,
"grad_norm": 9.753359655679406,
"learning_rate": 5.000000000000001e-07,
"loss": 1.8671,
"step": 4
},
{
"epoch": 0.025210084033613446,
"grad_norm": 10.188684139684757,
"learning_rate": 6.666666666666667e-07,
"loss": 1.8868,
"step": 5
},
{
"epoch": 0.030252100840336135,
"grad_norm": 9.253535763532076,
"learning_rate": 8.333333333333333e-07,
"loss": 1.8821,
"step": 6
},
{
"epoch": 0.03529411764705882,
"grad_norm": 9.452472463389428,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.8398,
"step": 7
},
{
"epoch": 0.040336134453781515,
"grad_norm": 8.338459992866273,
"learning_rate": 1.1666666666666668e-06,
"loss": 1.7522,
"step": 8
},
{
"epoch": 0.0453781512605042,
"grad_norm": 8.599040436901118,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.7879,
"step": 9
},
{
"epoch": 0.05042016806722689,
"grad_norm": 9.204139051227466,
"learning_rate": 1.5e-06,
"loss": 1.8949,
"step": 10
},
{
"epoch": 0.05546218487394958,
"grad_norm": 8.383986517840034,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.7568,
"step": 11
},
{
"epoch": 0.06050420168067227,
"grad_norm": 6.14215523192106,
"learning_rate": 1.8333333333333333e-06,
"loss": 1.6243,
"step": 12
},
{
"epoch": 0.06554621848739496,
"grad_norm": 5.998914335428499,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5973,
"step": 13
},
{
"epoch": 0.07058823529411765,
"grad_norm": 5.047474738743573,
"learning_rate": 2.166666666666667e-06,
"loss": 1.3774,
"step": 14
},
{
"epoch": 0.07563025210084033,
"grad_norm": 5.330740621399064,
"learning_rate": 2.3333333333333336e-06,
"loss": 1.5953,
"step": 15
},
{
"epoch": 0.08067226890756303,
"grad_norm": 3.3659526026887012,
"learning_rate": 2.5e-06,
"loss": 1.3746,
"step": 16
},
{
"epoch": 0.08571428571428572,
"grad_norm": 3.639732034816691,
"learning_rate": 2.666666666666667e-06,
"loss": 1.4698,
"step": 17
},
{
"epoch": 0.0907563025210084,
"grad_norm": 3.461514147091586,
"learning_rate": 2.8333333333333335e-06,
"loss": 1.4229,
"step": 18
},
{
"epoch": 0.0957983193277311,
"grad_norm": 3.765309579932919,
"learning_rate": 3e-06,
"loss": 1.3948,
"step": 19
},
{
"epoch": 0.10084033613445378,
"grad_norm": 2.825230202760748,
"learning_rate": 3.1666666666666667e-06,
"loss": 1.3286,
"step": 20
},
{
"epoch": 0.10588235294117647,
"grad_norm": 2.387015147619193,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.2574,
"step": 21
},
{
"epoch": 0.11092436974789915,
"grad_norm": 2.6592293064240176,
"learning_rate": 3.5e-06,
"loss": 1.2994,
"step": 22
},
{
"epoch": 0.11596638655462185,
"grad_norm": 2.9338685422018163,
"learning_rate": 3.6666666666666666e-06,
"loss": 1.271,
"step": 23
},
{
"epoch": 0.12100840336134454,
"grad_norm": 2.8053283243940923,
"learning_rate": 3.833333333333334e-06,
"loss": 1.239,
"step": 24
},
{
"epoch": 0.12605042016806722,
"grad_norm": 2.4764651014882673,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2632,
"step": 25
},
{
"epoch": 0.13109243697478992,
"grad_norm": 4.193230652323676,
"learning_rate": 4.166666666666667e-06,
"loss": 1.2494,
"step": 26
},
{
"epoch": 0.1361344537815126,
"grad_norm": 2.166632601601999,
"learning_rate": 4.333333333333334e-06,
"loss": 1.1772,
"step": 27
},
{
"epoch": 0.1411764705882353,
"grad_norm": 2.0456983888545133,
"learning_rate": 4.5e-06,
"loss": 1.3323,
"step": 28
},
{
"epoch": 0.146218487394958,
"grad_norm": 1.9041534025850353,
"learning_rate": 4.666666666666667e-06,
"loss": 1.123,
"step": 29
},
{
"epoch": 0.15126050420168066,
"grad_norm": 1.7473372136225975,
"learning_rate": 4.833333333333333e-06,
"loss": 1.1116,
"step": 30
},
{
"epoch": 0.15630252100840336,
"grad_norm": 1.9237786068741898,
"learning_rate": 5e-06,
"loss": 1.2038,
"step": 31
},
{
"epoch": 0.16134453781512606,
"grad_norm": 1.9862371515679214,
"learning_rate": 5.1666666666666675e-06,
"loss": 1.2171,
"step": 32
},
{
"epoch": 0.16638655462184873,
"grad_norm": 1.5922593116941988,
"learning_rate": 5.333333333333334e-06,
"loss": 1.0193,
"step": 33
},
{
"epoch": 0.17142857142857143,
"grad_norm": 1.6830455258736572,
"learning_rate": 5.500000000000001e-06,
"loss": 1.0761,
"step": 34
},
{
"epoch": 0.17647058823529413,
"grad_norm": 1.594143028453368,
"learning_rate": 5.666666666666667e-06,
"loss": 1.1126,
"step": 35
},
{
"epoch": 0.1815126050420168,
"grad_norm": 1.9420003685481775,
"learning_rate": 5.833333333333334e-06,
"loss": 1.1203,
"step": 36
},
{
"epoch": 0.1865546218487395,
"grad_norm": 1.5815112240806883,
"learning_rate": 6e-06,
"loss": 1.0293,
"step": 37
},
{
"epoch": 0.1915966386554622,
"grad_norm": 1.4697006996217221,
"learning_rate": 6.166666666666667e-06,
"loss": 0.995,
"step": 38
},
{
"epoch": 0.19663865546218487,
"grad_norm": 1.5886739084366435,
"learning_rate": 6.333333333333333e-06,
"loss": 1.1051,
"step": 39
},
{
"epoch": 0.20168067226890757,
"grad_norm": 1.3717225438634324,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.0817,
"step": 40
},
{
"epoch": 0.20672268907563024,
"grad_norm": 1.4586233032739204,
"learning_rate": 6.666666666666667e-06,
"loss": 0.9949,
"step": 41
},
{
"epoch": 0.21176470588235294,
"grad_norm": 1.4404526895251804,
"learning_rate": 6.833333333333334e-06,
"loss": 1.0369,
"step": 42
},
{
"epoch": 0.21680672268907564,
"grad_norm": 1.5011071614715905,
"learning_rate": 7e-06,
"loss": 1.0126,
"step": 43
},
{
"epoch": 0.2218487394957983,
"grad_norm": 1.446801500279163,
"learning_rate": 7.166666666666667e-06,
"loss": 0.9829,
"step": 44
},
{
"epoch": 0.226890756302521,
"grad_norm": 1.3157845464395648,
"learning_rate": 7.333333333333333e-06,
"loss": 0.9432,
"step": 45
},
{
"epoch": 0.2319327731092437,
"grad_norm": 1.3291092123967403,
"learning_rate": 7.500000000000001e-06,
"loss": 0.9518,
"step": 46
},
{
"epoch": 0.23697478991596638,
"grad_norm": 1.5105509029003468,
"learning_rate": 7.666666666666667e-06,
"loss": 1.0235,
"step": 47
},
{
"epoch": 0.24201680672268908,
"grad_norm": 1.420355667391472,
"learning_rate": 7.833333333333333e-06,
"loss": 0.9567,
"step": 48
},
{
"epoch": 0.24705882352941178,
"grad_norm": 1.463732709856337,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0417,
"step": 49
},
{
"epoch": 0.25210084033613445,
"grad_norm": 1.4275241446789713,
"learning_rate": 8.166666666666668e-06,
"loss": 1.0347,
"step": 50
},
{
"epoch": 0.2571428571428571,
"grad_norm": 1.309592587931707,
"learning_rate": 8.333333333333334e-06,
"loss": 0.9524,
"step": 51
},
{
"epoch": 0.26218487394957984,
"grad_norm": 1.3344872488030621,
"learning_rate": 8.5e-06,
"loss": 1.0684,
"step": 52
},
{
"epoch": 0.2672268907563025,
"grad_norm": 1.3533956797177575,
"learning_rate": 8.666666666666668e-06,
"loss": 0.9501,
"step": 53
},
{
"epoch": 0.2722689075630252,
"grad_norm": 1.4422509166091777,
"learning_rate": 8.833333333333334e-06,
"loss": 0.9452,
"step": 54
},
{
"epoch": 0.2773109243697479,
"grad_norm": 1.3534627088209181,
"learning_rate": 9e-06,
"loss": 0.9243,
"step": 55
},
{
"epoch": 0.2823529411764706,
"grad_norm": 1.370929089587996,
"learning_rate": 9.166666666666666e-06,
"loss": 0.9577,
"step": 56
},
{
"epoch": 0.28739495798319326,
"grad_norm": 1.34141912977082,
"learning_rate": 9.333333333333334e-06,
"loss": 0.9216,
"step": 57
},
{
"epoch": 0.292436974789916,
"grad_norm": 1.437190020022949,
"learning_rate": 9.5e-06,
"loss": 0.986,
"step": 58
},
{
"epoch": 0.29747899159663865,
"grad_norm": 1.3190591357074484,
"learning_rate": 9.666666666666667e-06,
"loss": 1.0163,
"step": 59
},
{
"epoch": 0.3025210084033613,
"grad_norm": 1.3230400720636633,
"learning_rate": 9.833333333333333e-06,
"loss": 0.9071,
"step": 60
},
{
"epoch": 0.30756302521008405,
"grad_norm": 1.570821042981294,
"learning_rate": 1e-05,
"loss": 1.0532,
"step": 61
},
{
"epoch": 0.3126050420168067,
"grad_norm": 1.3817712282096664,
"learning_rate": 9.999913472135126e-06,
"loss": 0.9497,
"step": 62
},
{
"epoch": 0.3176470588235294,
"grad_norm": 1.3461235016869455,
"learning_rate": 9.99965389153533e-06,
"loss": 0.9656,
"step": 63
},
{
"epoch": 0.3226890756302521,
"grad_norm": 1.2703045215015534,
"learning_rate": 9.999221267184993e-06,
"loss": 0.8563,
"step": 64
},
{
"epoch": 0.3277310924369748,
"grad_norm": 1.4463044763025328,
"learning_rate": 9.998615614057743e-06,
"loss": 0.9743,
"step": 65
},
{
"epoch": 0.33277310924369746,
"grad_norm": 1.2126520135581191,
"learning_rate": 9.997836953115927e-06,
"loss": 0.8256,
"step": 66
},
{
"epoch": 0.3378151260504202,
"grad_norm": 1.465456256707118,
"learning_rate": 9.996885311309892e-06,
"loss": 0.9112,
"step": 67
},
{
"epoch": 0.34285714285714286,
"grad_norm": 1.3774012861831768,
"learning_rate": 9.995760721577053e-06,
"loss": 1.0031,
"step": 68
},
{
"epoch": 0.34789915966386553,
"grad_norm": 1.214727510886685,
"learning_rate": 9.994463222840748e-06,
"loss": 0.8777,
"step": 69
},
{
"epoch": 0.35294117647058826,
"grad_norm": 1.3372556283226344,
"learning_rate": 9.992992860008893e-06,
"loss": 0.9503,
"step": 70
},
{
"epoch": 0.35798319327731093,
"grad_norm": 1.2629663699758409,
"learning_rate": 9.991349683972435e-06,
"loss": 0.9707,
"step": 71
},
{
"epoch": 0.3630252100840336,
"grad_norm": 1.2961666438854509,
"learning_rate": 9.989533751603578e-06,
"loss": 0.8987,
"step": 72
},
{
"epoch": 0.3680672268907563,
"grad_norm": 1.3451690514655665,
"learning_rate": 9.987545125753818e-06,
"loss": 0.9614,
"step": 73
},
{
"epoch": 0.373109243697479,
"grad_norm": 1.3824819884360038,
"learning_rate": 9.985383875251783e-06,
"loss": 0.9101,
"step": 74
},
{
"epoch": 0.37815126050420167,
"grad_norm": 1.290324816657544,
"learning_rate": 9.983050074900824e-06,
"loss": 0.8901,
"step": 75
},
{
"epoch": 0.3831932773109244,
"grad_norm": 1.3785449206810632,
"learning_rate": 9.980543805476447e-06,
"loss": 0.9305,
"step": 76
},
{
"epoch": 0.38823529411764707,
"grad_norm": 1.2723741333137952,
"learning_rate": 9.977865153723508e-06,
"loss": 0.9145,
"step": 77
},
{
"epoch": 0.39327731092436974,
"grad_norm": 1.3277787150964286,
"learning_rate": 9.975014212353212e-06,
"loss": 0.9386,
"step": 78
},
{
"epoch": 0.3983193277310924,
"grad_norm": 1.300378629259356,
"learning_rate": 9.971991080039912e-06,
"loss": 0.9072,
"step": 79
},
{
"epoch": 0.40336134453781514,
"grad_norm": 1.3180887220440103,
"learning_rate": 9.968795861417676e-06,
"loss": 0.8538,
"step": 80
},
{
"epoch": 0.4084033613445378,
"grad_norm": 1.2852565908527667,
"learning_rate": 9.965428667076687e-06,
"loss": 0.8625,
"step": 81
},
{
"epoch": 0.4134453781512605,
"grad_norm": 1.22082061679436,
"learning_rate": 9.961889613559396e-06,
"loss": 0.8002,
"step": 82
},
{
"epoch": 0.4184873949579832,
"grad_norm": 1.3948047447367582,
"learning_rate": 9.958178823356503e-06,
"loss": 0.9563,
"step": 83
},
{
"epoch": 0.4235294117647059,
"grad_norm": 1.32125427246041,
"learning_rate": 9.954296424902709e-06,
"loss": 0.9009,
"step": 84
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.2664915782700163,
"learning_rate": 9.950242552572272e-06,
"loss": 0.8489,
"step": 85
},
{
"epoch": 0.4336134453781513,
"grad_norm": 1.273298827077617,
"learning_rate": 9.946017346674362e-06,
"loss": 0.847,
"step": 86
},
{
"epoch": 0.43865546218487395,
"grad_norm": 1.328680054216705,
"learning_rate": 9.941620953448195e-06,
"loss": 0.9382,
"step": 87
},
{
"epoch": 0.4436974789915966,
"grad_norm": 1.263646905073375,
"learning_rate": 9.937053525057977e-06,
"loss": 0.8991,
"step": 88
},
{
"epoch": 0.44873949579831934,
"grad_norm": 1.209796673070386,
"learning_rate": 9.932315219587641e-06,
"loss": 0.8611,
"step": 89
},
{
"epoch": 0.453781512605042,
"grad_norm": 1.1317133515894529,
"learning_rate": 9.927406201035368e-06,
"loss": 0.8254,
"step": 90
},
{
"epoch": 0.4588235294117647,
"grad_norm": 1.2581352252268798,
"learning_rate": 9.922326639307918e-06,
"loss": 0.8186,
"step": 91
},
{
"epoch": 0.4638655462184874,
"grad_norm": 1.1615726675287243,
"learning_rate": 9.917076710214739e-06,
"loss": 0.8217,
"step": 92
},
{
"epoch": 0.4689075630252101,
"grad_norm": 1.3906544125113194,
"learning_rate": 9.911656595461899e-06,
"loss": 0.9606,
"step": 93
},
{
"epoch": 0.47394957983193275,
"grad_norm": 1.3491688269700184,
"learning_rate": 9.906066482645774e-06,
"loss": 0.8865,
"step": 94
},
{
"epoch": 0.4789915966386555,
"grad_norm": 1.2884319333617182,
"learning_rate": 9.900306565246579e-06,
"loss": 0.8608,
"step": 95
},
{
"epoch": 0.48403361344537815,
"grad_norm": 1.332999472417029,
"learning_rate": 9.894377042621654e-06,
"loss": 0.8476,
"step": 96
},
{
"epoch": 0.4890756302521008,
"grad_norm": 1.3206768360556793,
"learning_rate": 9.888278119998573e-06,
"loss": 0.898,
"step": 97
},
{
"epoch": 0.49411764705882355,
"grad_norm": 1.3732673184556148,
"learning_rate": 9.882010008468038e-06,
"loss": 0.9482,
"step": 98
},
{
"epoch": 0.4991596638655462,
"grad_norm": 1.4284063475101123,
"learning_rate": 9.875572924976568e-06,
"loss": 0.8932,
"step": 99
},
{
"epoch": 0.5042016806722689,
"grad_norm": 1.249757410129038,
"learning_rate": 9.868967092319003e-06,
"loss": 0.9113,
"step": 100
},
{
"epoch": 0.5092436974789916,
"grad_norm": 1.2033755235104269,
"learning_rate": 9.86219273913078e-06,
"loss": 0.8373,
"step": 101
},
{
"epoch": 0.5142857142857142,
"grad_norm": 1.3285676372655046,
"learning_rate": 9.855250099880026e-06,
"loss": 0.82,
"step": 102
},
{
"epoch": 0.519327731092437,
"grad_norm": 1.280372963776325,
"learning_rate": 9.848139414859441e-06,
"loss": 0.9269,
"step": 103
},
{
"epoch": 0.5243697478991597,
"grad_norm": 1.3597201294098022,
"learning_rate": 9.840860930177984e-06,
"loss": 0.8917,
"step": 104
},
{
"epoch": 0.5294117647058824,
"grad_norm": 1.3044841757394627,
"learning_rate": 9.833414897752346e-06,
"loss": 0.8242,
"step": 105
},
{
"epoch": 0.534453781512605,
"grad_norm": 1.2237707733265701,
"learning_rate": 9.825801575298248e-06,
"loss": 0.8369,
"step": 106
},
{
"epoch": 0.5394957983193277,
"grad_norm": 1.2984723776565605,
"learning_rate": 9.818021226321502e-06,
"loss": 0.8687,
"step": 107
},
{
"epoch": 0.5445378151260504,
"grad_norm": 1.3966505679016854,
"learning_rate": 9.8100741201089e-06,
"loss": 0.8698,
"step": 108
},
{
"epoch": 0.5495798319327732,
"grad_norm": 1.3695596995593027,
"learning_rate": 9.801960531718898e-06,
"loss": 0.9224,
"step": 109
},
{
"epoch": 0.5546218487394958,
"grad_norm": 1.2219956732497297,
"learning_rate": 9.793680741972084e-06,
"loss": 0.7909,
"step": 110
},
{
"epoch": 0.5596638655462185,
"grad_norm": 1.1958717679101365,
"learning_rate": 9.785235037441473e-06,
"loss": 0.8222,
"step": 111
},
{
"epoch": 0.5647058823529412,
"grad_norm": 1.3284406137942217,
"learning_rate": 9.77662371044258e-06,
"loss": 0.9698,
"step": 112
},
{
"epoch": 0.5697478991596638,
"grad_norm": 1.4005342916908725,
"learning_rate": 9.767847059023292e-06,
"loss": 0.8141,
"step": 113
},
{
"epoch": 0.5747899159663865,
"grad_norm": 1.3280058867861344,
"learning_rate": 9.75890538695358e-06,
"loss": 0.8281,
"step": 114
},
{
"epoch": 0.5798319327731093,
"grad_norm": 1.348332178712391,
"learning_rate": 9.749799003714954e-06,
"loss": 0.8174,
"step": 115
},
{
"epoch": 0.584873949579832,
"grad_norm": 1.345901958116435,
"learning_rate": 9.74052822448978e-06,
"loss": 0.8662,
"step": 116
},
{
"epoch": 0.5899159663865546,
"grad_norm": 1.4938772005815362,
"learning_rate": 9.731093370150349e-06,
"loss": 0.9227,
"step": 117
},
{
"epoch": 0.5949579831932773,
"grad_norm": 1.5782055001938107,
"learning_rate": 9.721494767247779e-06,
"loss": 0.9292,
"step": 118
},
{
"epoch": 0.6,
"grad_norm": 1.2813061736782214,
"learning_rate": 9.71173274800072e-06,
"loss": 0.808,
"step": 119
},
{
"epoch": 0.6050420168067226,
"grad_norm": 1.3387521092808896,
"learning_rate": 9.70180765028384e-06,
"loss": 0.8052,
"step": 120
},
{
"epoch": 0.6100840336134454,
"grad_norm": 1.1971567112258479,
"learning_rate": 9.691719817616148e-06,
"loss": 0.8321,
"step": 121
},
{
"epoch": 0.6151260504201681,
"grad_norm": 1.4022847044925355,
"learning_rate": 9.681469599149093e-06,
"loss": 0.8362,
"step": 122
},
{
"epoch": 0.6201680672268908,
"grad_norm": 1.4458562904255674,
"learning_rate": 9.671057349654481e-06,
"loss": 0.8753,
"step": 123
},
{
"epoch": 0.6252100840336134,
"grad_norm": 1.3489812277335955,
"learning_rate": 9.660483429512198e-06,
"loss": 0.8406,
"step": 124
},
{
"epoch": 0.6302521008403361,
"grad_norm": 1.2541520148654464,
"learning_rate": 9.649748204697741e-06,
"loss": 0.8096,
"step": 125
},
{
"epoch": 0.6352941176470588,
"grad_norm": 1.4166136476450861,
"learning_rate": 9.63885204676954e-06,
"loss": 0.9279,
"step": 126
},
{
"epoch": 0.6403361344537815,
"grad_norm": 1.2096305649684784,
"learning_rate": 9.627795332856107e-06,
"loss": 0.8668,
"step": 127
},
{
"epoch": 0.6453781512605042,
"grad_norm": 1.0817129947497557,
"learning_rate": 9.616578445642982e-06,
"loss": 0.8021,
"step": 128
},
{
"epoch": 0.6504201680672269,
"grad_norm": 1.2857282530529068,
"learning_rate": 9.605201773359485e-06,
"loss": 0.9031,
"step": 129
},
{
"epoch": 0.6554621848739496,
"grad_norm": 1.2909981390159206,
"learning_rate": 9.59366570976528e-06,
"loss": 0.9028,
"step": 130
},
{
"epoch": 0.6605042016806723,
"grad_norm": 1.277642300275485,
"learning_rate": 9.581970654136752e-06,
"loss": 0.8206,
"step": 131
},
{
"epoch": 0.6655462184873949,
"grad_norm": 1.2618202348884826,
"learning_rate": 9.570117011253173e-06,
"loss": 0.8038,
"step": 132
},
{
"epoch": 0.6705882352941176,
"grad_norm": 1.3158796346136465,
"learning_rate": 9.55810519138271e-06,
"loss": 0.8594,
"step": 133
},
{
"epoch": 0.6756302521008404,
"grad_norm": 1.464049668724664,
"learning_rate": 9.545935610268213e-06,
"loss": 0.8946,
"step": 134
},
{
"epoch": 0.680672268907563,
"grad_norm": 1.3568598282729065,
"learning_rate": 9.533608689112827e-06,
"loss": 0.8747,
"step": 135
},
{
"epoch": 0.6857142857142857,
"grad_norm": 1.459842199207566,
"learning_rate": 9.521124854565425e-06,
"loss": 0.8665,
"step": 136
},
{
"epoch": 0.6907563025210084,
"grad_norm": 1.2651754016717647,
"learning_rate": 9.508484538705823e-06,
"loss": 0.8172,
"step": 137
},
{
"epoch": 0.6957983193277311,
"grad_norm": 1.3148283789857567,
"learning_rate": 9.495688179029838e-06,
"loss": 0.8159,
"step": 138
},
{
"epoch": 0.7008403361344537,
"grad_norm": 1.3062514406684878,
"learning_rate": 9.482736218434144e-06,
"loss": 0.772,
"step": 139
},
{
"epoch": 0.7058823529411765,
"grad_norm": 1.233357901449911,
"learning_rate": 9.469629105200937e-06,
"loss": 0.812,
"step": 140
},
{
"epoch": 0.7109243697478992,
"grad_norm": 1.4036092051385856,
"learning_rate": 9.45636729298243e-06,
"loss": 0.9176,
"step": 141
},
{
"epoch": 0.7159663865546219,
"grad_norm": 1.2475986918890871,
"learning_rate": 9.442951240785135e-06,
"loss": 0.9227,
"step": 142
},
{
"epoch": 0.7210084033613445,
"grad_norm": 1.33327258291273,
"learning_rate": 9.429381412954e-06,
"loss": 0.8406,
"step": 143
},
{
"epoch": 0.7260504201680672,
"grad_norm": 1.2457766641422836,
"learning_rate": 9.415658279156312e-06,
"loss": 0.7944,
"step": 144
},
{
"epoch": 0.7310924369747899,
"grad_norm": 1.214604972950531,
"learning_rate": 9.401782314365458e-06,
"loss": 0.7889,
"step": 145
},
{
"epoch": 0.7361344537815127,
"grad_norm": 1.4091496584822034,
"learning_rate": 9.387753998844482e-06,
"loss": 0.8542,
"step": 146
},
{
"epoch": 0.7411764705882353,
"grad_norm": 1.336371637577696,
"learning_rate": 9.37357381812946e-06,
"loss": 0.8713,
"step": 147
},
{
"epoch": 0.746218487394958,
"grad_norm": 1.2559095107113698,
"learning_rate": 9.359242263012693e-06,
"loss": 0.8405,
"step": 148
},
{
"epoch": 0.7512605042016807,
"grad_norm": 1.371982879040437,
"learning_rate": 9.344759829525734e-06,
"loss": 0.8666,
"step": 149
},
{
"epoch": 0.7563025210084033,
"grad_norm": 1.23974913873784,
"learning_rate": 9.330127018922195e-06,
"loss": 0.7429,
"step": 150
},
{
"epoch": 0.761344537815126,
"grad_norm": 1.3741045518217379,
"learning_rate": 9.315344337660422e-06,
"loss": 0.8649,
"step": 151
},
{
"epoch": 0.7663865546218488,
"grad_norm": 1.348659089360585,
"learning_rate": 9.300412297385954e-06,
"loss": 0.8614,
"step": 152
},
{
"epoch": 0.7714285714285715,
"grad_norm": 1.199362811459465,
"learning_rate": 9.285331414913816e-06,
"loss": 0.837,
"step": 153
},
{
"epoch": 0.7764705882352941,
"grad_norm": 1.2184218309322916,
"learning_rate": 9.270102212210632e-06,
"loss": 0.8404,
"step": 154
},
{
"epoch": 0.7815126050420168,
"grad_norm": 1.386612554465055,
"learning_rate": 9.254725216376562e-06,
"loss": 0.9221,
"step": 155
},
{
"epoch": 0.7865546218487395,
"grad_norm": 1.3380478699356555,
"learning_rate": 9.239200959627048e-06,
"loss": 0.8627,
"step": 156
},
{
"epoch": 0.7915966386554621,
"grad_norm": 1.4014570562834296,
"learning_rate": 9.223529979274411e-06,
"loss": 0.8525,
"step": 157
},
{
"epoch": 0.7966386554621848,
"grad_norm": 1.3172489244042282,
"learning_rate": 9.207712817709237e-06,
"loss": 0.7901,
"step": 158
},
{
"epoch": 0.8016806722689076,
"grad_norm": 1.354483035270781,
"learning_rate": 9.191750022381613e-06,
"loss": 0.865,
"step": 159
},
{
"epoch": 0.8067226890756303,
"grad_norm": 1.2415343975219086,
"learning_rate": 9.175642145782179e-06,
"loss": 0.7898,
"step": 160
},
{
"epoch": 0.8117647058823529,
"grad_norm": 1.2532359973917484,
"learning_rate": 9.159389745423003e-06,
"loss": 0.8372,
"step": 161
},
{
"epoch": 0.8168067226890756,
"grad_norm": 1.2390725118364732,
"learning_rate": 9.142993383818284e-06,
"loss": 0.8383,
"step": 162
},
{
"epoch": 0.8218487394957983,
"grad_norm": 1.3766117307822159,
"learning_rate": 9.126453628464889e-06,
"loss": 0.8151,
"step": 163
},
{
"epoch": 0.826890756302521,
"grad_norm": 1.3256804846243377,
"learning_rate": 9.109771051822702e-06,
"loss": 0.8444,
"step": 164
},
{
"epoch": 0.8319327731092437,
"grad_norm": 1.3520618668694473,
"learning_rate": 9.09294623129482e-06,
"loss": 0.8672,
"step": 165
},
{
"epoch": 0.8369747899159664,
"grad_norm": 1.329653882039925,
"learning_rate": 9.07597974920756e-06,
"loss": 0.8168,
"step": 166
},
{
"epoch": 0.8420168067226891,
"grad_norm": 1.3543281390803807,
"learning_rate": 9.058872192790314e-06,
"loss": 0.9118,
"step": 167
},
{
"epoch": 0.8470588235294118,
"grad_norm": 1.3456977881970305,
"learning_rate": 9.041624154155208e-06,
"loss": 0.8515,
"step": 168
},
{
"epoch": 0.8521008403361344,
"grad_norm": 1.297767613562501,
"learning_rate": 9.02423623027663e-06,
"loss": 0.7417,
"step": 169
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.2894576740180352,
"learning_rate": 9.006709022970547e-06,
"loss": 0.8408,
"step": 170
},
{
"epoch": 0.8621848739495799,
"grad_norm": 1.2240598626483896,
"learning_rate": 8.98904313887369e-06,
"loss": 0.7358,
"step": 171
},
{
"epoch": 0.8672268907563025,
"grad_norm": 1.1890744366393113,
"learning_rate": 8.971239189422555e-06,
"loss": 0.8322,
"step": 172
},
{
"epoch": 0.8722689075630252,
"grad_norm": 1.3386067991043302,
"learning_rate": 8.953297790832231e-06,
"loss": 0.8411,
"step": 173
},
{
"epoch": 0.8773109243697479,
"grad_norm": 1.408000314117784,
"learning_rate": 8.935219564075087e-06,
"loss": 0.8036,
"step": 174
},
{
"epoch": 0.8823529411764706,
"grad_norm": 1.3426412490545896,
"learning_rate": 8.917005134859263e-06,
"loss": 0.8035,
"step": 175
},
{
"epoch": 0.8873949579831932,
"grad_norm": 1.4645291848377162,
"learning_rate": 8.89865513360703e-06,
"loss": 0.8392,
"step": 176
},
{
"epoch": 0.892436974789916,
"grad_norm": 1.2117719390717796,
"learning_rate": 8.88017019543296e-06,
"loss": 0.8328,
"step": 177
},
{
"epoch": 0.8974789915966387,
"grad_norm": 1.3344830085574295,
"learning_rate": 8.861550960121946e-06,
"loss": 0.8543,
"step": 178
},
{
"epoch": 0.9025210084033614,
"grad_norm": 1.4853304361578643,
"learning_rate": 8.842798072107055e-06,
"loss": 0.8512,
"step": 179
},
{
"epoch": 0.907563025210084,
"grad_norm": 1.2284352653979531,
"learning_rate": 8.823912180447237e-06,
"loss": 0.8598,
"step": 180
},
{
"epoch": 0.9126050420168067,
"grad_norm": 1.37221802812512,
"learning_rate": 8.804893938804839e-06,
"loss": 0.8613,
"step": 181
},
{
"epoch": 0.9176470588235294,
"grad_norm": 1.4397712752139291,
"learning_rate": 8.785744005423003e-06,
"loss": 0.8192,
"step": 182
},
{
"epoch": 0.9226890756302522,
"grad_norm": 1.4307484306743805,
"learning_rate": 8.766463043102864e-06,
"loss": 0.8114,
"step": 183
},
{
"epoch": 0.9277310924369748,
"grad_norm": 1.4036453214728524,
"learning_rate": 8.747051719180626e-06,
"loss": 0.8922,
"step": 184
},
{
"epoch": 0.9327731092436975,
"grad_norm": 1.4752551479904314,
"learning_rate": 8.727510705504453e-06,
"loss": 0.8932,
"step": 185
},
{
"epoch": 0.9378151260504202,
"grad_norm": 1.322337640774981,
"learning_rate": 8.707840678411223e-06,
"loss": 0.7998,
"step": 186
},
{
"epoch": 0.9428571428571428,
"grad_norm": 1.2136277321616975,
"learning_rate": 8.688042318703111e-06,
"loss": 0.7416,
"step": 187
},
{
"epoch": 0.9478991596638655,
"grad_norm": 1.342849040104635,
"learning_rate": 8.66811631162404e-06,
"loss": 0.8685,
"step": 188
},
{
"epoch": 0.9529411764705882,
"grad_norm": 1.5250386207067939,
"learning_rate": 8.648063346835943e-06,
"loss": 0.8485,
"step": 189
},
{
"epoch": 0.957983193277311,
"grad_norm": 1.3173191874193797,
"learning_rate": 8.627884118394913e-06,
"loss": 0.8286,
"step": 190
},
{
"epoch": 0.9630252100840336,
"grad_norm": 1.32796081599915,
"learning_rate": 8.607579324727175e-06,
"loss": 0.8544,
"step": 191
},
{
"epoch": 0.9680672268907563,
"grad_norm": 1.350363153783161,
"learning_rate": 8.5871496686049e-06,
"loss": 0.8102,
"step": 192
},
{
"epoch": 0.973109243697479,
"grad_norm": 1.3655669107662696,
"learning_rate": 8.566595857121902e-06,
"loss": 0.8122,
"step": 193
},
{
"epoch": 0.9781512605042016,
"grad_norm": 1.3452211499259599,
"learning_rate": 8.545918601669147e-06,
"loss": 0.8834,
"step": 194
},
{
"epoch": 0.9831932773109243,
"grad_norm": 1.3376410418915317,
"learning_rate": 8.525118617910144e-06,
"loss": 0.8148,
"step": 195
},
{
"epoch": 0.9882352941176471,
"grad_norm": 1.2489273918302621,
"learning_rate": 8.504196625756166e-06,
"loss": 0.8271,
"step": 196
},
{
"epoch": 0.9932773109243698,
"grad_norm": 1.4139088289405872,
"learning_rate": 8.483153349341336e-06,
"loss": 0.845,
"step": 197
},
{
"epoch": 0.9983193277310924,
"grad_norm": 1.384588034693747,
"learning_rate": 8.461989516997565e-06,
"loss": 0.8312,
"step": 198
},
{
"epoch": 1.0050420168067227,
"grad_norm": 2.499955060770187,
"learning_rate": 8.440705861229344e-06,
"loss": 1.4381,
"step": 199
},
{
"epoch": 1.0100840336134453,
"grad_norm": 1.413536932523174,
"learning_rate": 8.41930311868839e-06,
"loss": 0.713,
"step": 200
},
{
"epoch": 1.015126050420168,
"grad_norm": 1.3570359586304308,
"learning_rate": 8.397782030148147e-06,
"loss": 0.716,
"step": 201
},
{
"epoch": 1.0201680672268907,
"grad_norm": 1.187974845871534,
"learning_rate": 8.376143340478153e-06,
"loss": 0.6197,
"step": 202
},
{
"epoch": 1.0252100840336134,
"grad_norm": 1.1805636492053666,
"learning_rate": 8.354387798618254e-06,
"loss": 0.6082,
"step": 203
},
{
"epoch": 1.030252100840336,
"grad_norm": 1.3319326327566277,
"learning_rate": 8.332516157552684e-06,
"loss": 0.6667,
"step": 204
},
{
"epoch": 1.035294117647059,
"grad_norm": 1.3080442340316867,
"learning_rate": 8.310529174284004e-06,
"loss": 0.6438,
"step": 205
},
{
"epoch": 1.0403361344537816,
"grad_norm": 1.360919752940988,
"learning_rate": 8.288427609806899e-06,
"loss": 0.6931,
"step": 206
},
{
"epoch": 1.0453781512605043,
"grad_norm": 1.2928882019326107,
"learning_rate": 8.266212229081846e-06,
"loss": 0.6571,
"step": 207
},
{
"epoch": 1.050420168067227,
"grad_norm": 1.279346131512037,
"learning_rate": 8.243883801008632e-06,
"loss": 0.6105,
"step": 208
},
{
"epoch": 1.0554621848739496,
"grad_norm": 1.3976246828088796,
"learning_rate": 8.221443098399733e-06,
"loss": 0.633,
"step": 209
},
{
"epoch": 1.0605042016806723,
"grad_norm": 1.4051676037106482,
"learning_rate": 8.198890897953586e-06,
"loss": 0.631,
"step": 210
},
{
"epoch": 1.065546218487395,
"grad_norm": 1.4026478680925658,
"learning_rate": 8.176227980227693e-06,
"loss": 0.646,
"step": 211
},
{
"epoch": 1.0705882352941176,
"grad_norm": 1.4783461586544826,
"learning_rate": 8.153455129611605e-06,
"loss": 0.6341,
"step": 212
},
{
"epoch": 1.0756302521008403,
"grad_norm": 1.2992917788523406,
"learning_rate": 8.130573134299782e-06,
"loss": 0.7027,
"step": 213
},
{
"epoch": 1.080672268907563,
"grad_norm": 1.4403523864907255,
"learning_rate": 8.107582786264299e-06,
"loss": 0.6745,
"step": 214
},
{
"epoch": 1.0857142857142856,
"grad_norm": 1.2904789259135272,
"learning_rate": 8.084484881227449e-06,
"loss": 0.6278,
"step": 215
},
{
"epoch": 1.0907563025210083,
"grad_norm": 1.3928383691850674,
"learning_rate": 8.061280218634192e-06,
"loss": 0.665,
"step": 216
},
{
"epoch": 1.0957983193277312,
"grad_norm": 1.3355440702392616,
"learning_rate": 8.037969601624495e-06,
"loss": 0.6095,
"step": 217
},
{
"epoch": 1.1008403361344539,
"grad_norm": 1.3135802297885384,
"learning_rate": 8.014553837005527e-06,
"loss": 0.7134,
"step": 218
},
{
"epoch": 1.1058823529411765,
"grad_norm": 1.3334358438044307,
"learning_rate": 7.99103373522373e-06,
"loss": 0.6149,
"step": 219
},
{
"epoch": 1.1109243697478992,
"grad_norm": 1.3855125872698653,
"learning_rate": 7.967410110336782e-06,
"loss": 0.6709,
"step": 220
},
{
"epoch": 1.1159663865546219,
"grad_norm": 1.4082439279428,
"learning_rate": 7.943683779985412e-06,
"loss": 0.6665,
"step": 221
},
{
"epoch": 1.1210084033613446,
"grad_norm": 1.3849413150174785,
"learning_rate": 7.919855565365102e-06,
"loss": 0.6698,
"step": 222
},
{
"epoch": 1.1260504201680672,
"grad_norm": 1.3025006342892487,
"learning_rate": 7.895926291197667e-06,
"loss": 0.6726,
"step": 223
},
{
"epoch": 1.13109243697479,
"grad_norm": 1.3438499346918609,
"learning_rate": 7.871896785702707e-06,
"loss": 0.6361,
"step": 224
},
{
"epoch": 1.1361344537815126,
"grad_norm": 1.252763414951386,
"learning_rate": 7.847767880568944e-06,
"loss": 0.6534,
"step": 225
},
{
"epoch": 1.1411764705882352,
"grad_norm": 1.4594024040073388,
"learning_rate": 7.823540410925434e-06,
"loss": 0.7176,
"step": 226
},
{
"epoch": 1.146218487394958,
"grad_norm": 1.3020082357416656,
"learning_rate": 7.799215215312667e-06,
"loss": 0.6117,
"step": 227
},
{
"epoch": 1.1512605042016806,
"grad_norm": 1.3344891922181583,
"learning_rate": 7.774793135653537e-06,
"loss": 0.6502,
"step": 228
},
{
"epoch": 1.1563025210084033,
"grad_norm": 1.1931020476239522,
"learning_rate": 7.750275017224208e-06,
"loss": 0.5864,
"step": 229
},
{
"epoch": 1.1613445378151261,
"grad_norm": 1.3817137725123274,
"learning_rate": 7.725661708624855e-06,
"loss": 0.6845,
"step": 230
},
{
"epoch": 1.1663865546218488,
"grad_norm": 1.3718851116188664,
"learning_rate": 7.700954061750295e-06,
"loss": 0.6666,
"step": 231
},
{
"epoch": 1.1714285714285715,
"grad_norm": 1.3538961263237106,
"learning_rate": 7.676152931760496e-06,
"loss": 0.6815,
"step": 232
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.3576998269549865,
"learning_rate": 7.651259177050996e-06,
"loss": 0.6169,
"step": 233
},
{
"epoch": 1.1815126050420168,
"grad_norm": 1.3317040137841496,
"learning_rate": 7.626273659223166e-06,
"loss": 0.8546,
"step": 234
},
{
"epoch": 1.1865546218487395,
"grad_norm": 1.368524911957153,
"learning_rate": 7.601197243054411e-06,
"loss": 0.6168,
"step": 235
},
{
"epoch": 1.1915966386554622,
"grad_norm": 1.3058914037226665,
"learning_rate": 7.576030796468233e-06,
"loss": 0.7452,
"step": 236
},
{
"epoch": 1.1966386554621848,
"grad_norm": 1.5392470830352827,
"learning_rate": 7.5507751905041885e-06,
"loss": 0.6195,
"step": 237
},
{
"epoch": 1.2016806722689075,
"grad_norm": 1.4102673119306182,
"learning_rate": 7.525431299287737e-06,
"loss": 0.6523,
"step": 238
},
{
"epoch": 1.2067226890756302,
"grad_norm": 1.4511322902886419,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6862,
"step": 239
},
{
"epoch": 1.2117647058823529,
"grad_norm": 1.2661930310847365,
"learning_rate": 7.474482172847391e-06,
"loss": 0.6528,
"step": 240
},
{
"epoch": 1.2168067226890757,
"grad_norm": 1.3307860380456358,
"learning_rate": 7.4488787010311425e-06,
"loss": 0.6602,
"step": 241
},
{
"epoch": 1.2218487394957984,
"grad_norm": 1.3750585055686875,
"learning_rate": 7.423190470716761e-06,
"loss": 0.6432,
"step": 242
},
{
"epoch": 1.226890756302521,
"grad_norm": 1.2979245099980825,
"learning_rate": 7.3974183710033334e-06,
"loss": 0.6288,
"step": 243
},
{
"epoch": 1.2319327731092438,
"grad_norm": 1.2999814021886877,
"learning_rate": 7.371563293892761e-06,
"loss": 0.6119,
"step": 244
},
{
"epoch": 1.2369747899159664,
"grad_norm": 1.2917976929827104,
"learning_rate": 7.345626134258897e-06,
"loss": 0.6657,
"step": 245
},
{
"epoch": 1.242016806722689,
"grad_norm": 1.4010288472470998,
"learning_rate": 7.319607789816555e-06,
"loss": 0.6586,
"step": 246
},
{
"epoch": 1.2470588235294118,
"grad_norm": 1.4146400942510136,
"learning_rate": 7.293509161090453e-06,
"loss": 0.6595,
"step": 247
},
{
"epoch": 1.2521008403361344,
"grad_norm": 1.2728109027093242,
"learning_rate": 7.2673311513840395e-06,
"loss": 0.6353,
"step": 248
},
{
"epoch": 1.2571428571428571,
"grad_norm": 1.3471043709018875,
"learning_rate": 7.241074666748228e-06,
"loss": 0.6713,
"step": 249
},
{
"epoch": 1.2621848739495798,
"grad_norm": 1.353231427350053,
"learning_rate": 7.214740615950041e-06,
"loss": 0.6102,
"step": 250
},
{
"epoch": 1.2672268907563025,
"grad_norm": 1.337514944324046,
"learning_rate": 7.188329910441154e-06,
"loss": 0.6282,
"step": 251
},
{
"epoch": 1.2722689075630251,
"grad_norm": 1.362404295247445,
"learning_rate": 7.161843464326349e-06,
"loss": 0.6072,
"step": 252
},
{
"epoch": 1.2773109243697478,
"grad_norm": 1.1818447088372563,
"learning_rate": 7.135282194331881e-06,
"loss": 0.6057,
"step": 253
},
{
"epoch": 1.2823529411764705,
"grad_norm": 1.4982822435126113,
"learning_rate": 7.1086470197737405e-06,
"loss": 0.6803,
"step": 254
},
{
"epoch": 1.2873949579831931,
"grad_norm": 1.4344811997979932,
"learning_rate": 7.0819388625258385e-06,
"loss": 0.8567,
"step": 255
},
{
"epoch": 1.292436974789916,
"grad_norm": 1.3859091438882214,
"learning_rate": 7.05515864698811e-06,
"loss": 0.7355,
"step": 256
},
{
"epoch": 1.2974789915966387,
"grad_norm": 1.1626254136263392,
"learning_rate": 7.028307300054499e-06,
"loss": 0.5839,
"step": 257
},
{
"epoch": 1.3025210084033614,
"grad_norm": 1.3552944579781003,
"learning_rate": 7.0013857510808934e-06,
"loss": 0.6836,
"step": 258
},
{
"epoch": 1.307563025210084,
"grad_norm": 1.3028817545835125,
"learning_rate": 6.974394931852957e-06,
"loss": 0.6284,
"step": 259
},
{
"epoch": 1.3126050420168067,
"grad_norm": 1.5434124541373508,
"learning_rate": 6.94733577655387e-06,
"loss": 0.7012,
"step": 260
},
{
"epoch": 1.3176470588235294,
"grad_norm": 1.303474015679206,
"learning_rate": 6.920209221732007e-06,
"loss": 0.5703,
"step": 261
},
{
"epoch": 1.322689075630252,
"grad_norm": 1.3348450903633984,
"learning_rate": 6.893016206268518e-06,
"loss": 0.5917,
"step": 262
},
{
"epoch": 1.3277310924369747,
"grad_norm": 1.3433706513738732,
"learning_rate": 6.865757671344827e-06,
"loss": 0.6672,
"step": 263
},
{
"epoch": 1.3327731092436974,
"grad_norm": 1.2935787672149481,
"learning_rate": 6.838434560410064e-06,
"loss": 0.6701,
"step": 264
},
{
"epoch": 1.3378151260504203,
"grad_norm": 1.3458569492608534,
"learning_rate": 6.811047819148413e-06,
"loss": 0.6647,
"step": 265
},
{
"epoch": 1.342857142857143,
"grad_norm": 1.3814097147596185,
"learning_rate": 6.783598395446371e-06,
"loss": 0.6866,
"step": 266
},
{
"epoch": 1.3478991596638656,
"grad_norm": 1.384769236934002,
"learning_rate": 6.756087239359948e-06,
"loss": 0.6058,
"step": 267
},
{
"epoch": 1.3529411764705883,
"grad_norm": 1.4299755108319103,
"learning_rate": 6.728515303081782e-06,
"loss": 0.6608,
"step": 268
},
{
"epoch": 1.357983193277311,
"grad_norm": 1.6844501725850975,
"learning_rate": 6.700883540908185e-06,
"loss": 0.6902,
"step": 269
},
{
"epoch": 1.3630252100840337,
"grad_norm": 1.490837215727114,
"learning_rate": 6.673192909206109e-06,
"loss": 0.6622,
"step": 270
},
{
"epoch": 1.3680672268907563,
"grad_norm": 1.5025542365103597,
"learning_rate": 6.64544436638005e-06,
"loss": 0.7318,
"step": 271
},
{
"epoch": 1.373109243697479,
"grad_norm": 1.368007843570876,
"learning_rate": 6.617638872838874e-06,
"loss": 0.6616,
"step": 272
},
{
"epoch": 1.3781512605042017,
"grad_norm": 1.3302784390410516,
"learning_rate": 6.589777390962575e-06,
"loss": 0.5837,
"step": 273
},
{
"epoch": 1.3831932773109243,
"grad_norm": 1.3818583989196362,
"learning_rate": 6.561860885068972e-06,
"loss": 0.7319,
"step": 274
},
{
"epoch": 1.388235294117647,
"grad_norm": 1.3678970576063487,
"learning_rate": 6.53389032138032e-06,
"loss": 0.6479,
"step": 275
},
{
"epoch": 1.3932773109243697,
"grad_norm": 1.3918528373329961,
"learning_rate": 6.505866667989884e-06,
"loss": 0.6657,
"step": 276
},
{
"epoch": 1.3983193277310924,
"grad_norm": 1.3578596611461975,
"learning_rate": 6.477790894828422e-06,
"loss": 0.6227,
"step": 277
},
{
"epoch": 1.403361344537815,
"grad_norm": 1.37442116613121,
"learning_rate": 6.449663973630613e-06,
"loss": 0.668,
"step": 278
},
{
"epoch": 1.4084033613445377,
"grad_norm": 1.251535744853749,
"learning_rate": 6.421486877901436e-06,
"loss": 0.6394,
"step": 279
},
{
"epoch": 1.4134453781512604,
"grad_norm": 1.3817098557899696,
"learning_rate": 6.393260582882462e-06,
"loss": 0.7289,
"step": 280
},
{
"epoch": 1.4184873949579833,
"grad_norm": 1.3924770743130575,
"learning_rate": 6.364986065518106e-06,
"loss": 0.6632,
"step": 281
},
{
"epoch": 1.423529411764706,
"grad_norm": 1.3388647960669742,
"learning_rate": 6.336664304421818e-06,
"loss": 0.6445,
"step": 282
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.3627824010774807,
"learning_rate": 6.308296279842204e-06,
"loss": 0.6785,
"step": 283
},
{
"epoch": 1.4336134453781513,
"grad_norm": 1.2353887841733255,
"learning_rate": 6.279882973629101e-06,
"loss": 0.5987,
"step": 284
},
{
"epoch": 1.438655462184874,
"grad_norm": 1.2803646798399686,
"learning_rate": 6.2514253691996e-06,
"loss": 0.6593,
"step": 285
},
{
"epoch": 1.4436974789915966,
"grad_norm": 1.3106097252223476,
"learning_rate": 6.222924451504001e-06,
"loss": 0.6612,
"step": 286
},
{
"epoch": 1.4487394957983193,
"grad_norm": 1.491149138722541,
"learning_rate": 6.194381206991723e-06,
"loss": 0.6603,
"step": 287
},
{
"epoch": 1.453781512605042,
"grad_norm": 1.4729722170121724,
"learning_rate": 6.165796623577171e-06,
"loss": 0.6458,
"step": 288
},
{
"epoch": 1.4588235294117646,
"grad_norm": 1.2583772868484708,
"learning_rate": 6.1371716906055336e-06,
"loss": 0.6571,
"step": 289
},
{
"epoch": 1.4638655462184875,
"grad_norm": 1.6484902113991295,
"learning_rate": 6.10850739881854e-06,
"loss": 0.8048,
"step": 290
},
{
"epoch": 1.4689075630252102,
"grad_norm": 1.1293948636395863,
"learning_rate": 6.079804740320181e-06,
"loss": 0.631,
"step": 291
},
{
"epoch": 1.4739495798319329,
"grad_norm": 1.357543211738453,
"learning_rate": 6.051064708542357e-06,
"loss": 0.6834,
"step": 292
},
{
"epoch": 1.4789915966386555,
"grad_norm": 1.422094283192291,
"learning_rate": 6.022288298210502e-06,
"loss": 0.7688,
"step": 293
},
{
"epoch": 1.4840336134453782,
"grad_norm": 1.3320687626409005,
"learning_rate": 5.993476505309154e-06,
"loss": 0.6438,
"step": 294
},
{
"epoch": 1.4890756302521009,
"grad_norm": 1.479155880731166,
"learning_rate": 5.964630327047485e-06,
"loss": 0.6983,
"step": 295
},
{
"epoch": 1.4941176470588236,
"grad_norm": 1.4751670026359378,
"learning_rate": 5.935750761824777e-06,
"loss": 0.6784,
"step": 296
},
{
"epoch": 1.4991596638655462,
"grad_norm": 1.3971166152312533,
"learning_rate": 5.906838809195879e-06,
"loss": 0.7934,
"step": 297
},
{
"epoch": 1.504201680672269,
"grad_norm": 1.486282793941636,
"learning_rate": 5.877895469836604e-06,
"loss": 0.7149,
"step": 298
},
{
"epoch": 1.5092436974789916,
"grad_norm": 1.3831360984251488,
"learning_rate": 5.848921745509094e-06,
"loss": 0.6853,
"step": 299
},
{
"epoch": 1.5142857142857142,
"grad_norm": 1.373255418518971,
"learning_rate": 5.819918639027149e-06,
"loss": 0.6262,
"step": 300
},
{
"epoch": 1.519327731092437,
"grad_norm": 1.398139776725886,
"learning_rate": 5.790887154221521e-06,
"loss": 0.6682,
"step": 301
},
{
"epoch": 1.5243697478991596,
"grad_norm": 1.459786025141565,
"learning_rate": 5.7618282959051685e-06,
"loss": 0.6596,
"step": 302
},
{
"epoch": 1.5294117647058822,
"grad_norm": 1.386843554966046,
"learning_rate": 5.7327430698384775e-06,
"loss": 0.662,
"step": 303
},
{
"epoch": 1.534453781512605,
"grad_norm": 1.334093052658649,
"learning_rate": 5.703632482694453e-06,
"loss": 0.5642,
"step": 304
},
{
"epoch": 1.5394957983193276,
"grad_norm": 1.394936799748242,
"learning_rate": 5.674497542023875e-06,
"loss": 0.6785,
"step": 305
},
{
"epoch": 1.5445378151260503,
"grad_norm": 1.2487045092120568,
"learning_rate": 5.645339256220427e-06,
"loss": 0.6405,
"step": 306
},
{
"epoch": 1.5495798319327732,
"grad_norm": 1.449626002944486,
"learning_rate": 5.616158634485793e-06,
"loss": 0.7186,
"step": 307
},
{
"epoch": 1.5546218487394958,
"grad_norm": 1.3148115913009149,
"learning_rate": 5.5869566867947344e-06,
"loss": 0.6689,
"step": 308
},
{
"epoch": 1.5596638655462185,
"grad_norm": 1.3031066852612374,
"learning_rate": 5.557734423860122e-06,
"loss": 0.6865,
"step": 309
},
{
"epoch": 1.5647058823529412,
"grad_norm": 1.4070190634154978,
"learning_rate": 5.528492857097966e-06,
"loss": 0.692,
"step": 310
},
{
"epoch": 1.5697478991596638,
"grad_norm": 1.424416347019562,
"learning_rate": 5.499232998592399e-06,
"loss": 0.6712,
"step": 311
},
{
"epoch": 1.5747899159663865,
"grad_norm": 1.4045930546601455,
"learning_rate": 5.469955861060653e-06,
"loss": 0.692,
"step": 312
},
{
"epoch": 1.5798319327731094,
"grad_norm": 1.4633924161825607,
"learning_rate": 5.44066245781801e-06,
"loss": 0.6972,
"step": 313
},
{
"epoch": 1.584873949579832,
"grad_norm": 1.3419059215183884,
"learning_rate": 5.4113538027427245e-06,
"loss": 0.5832,
"step": 314
},
{
"epoch": 1.5899159663865547,
"grad_norm": 1.4651690425379238,
"learning_rate": 5.382030910240936e-06,
"loss": 0.7263,
"step": 315
},
{
"epoch": 1.5949579831932774,
"grad_norm": 1.3544416080791692,
"learning_rate": 5.352694795211555e-06,
"loss": 0.6693,
"step": 316
},
{
"epoch": 1.6,
"grad_norm": 1.3796831843734638,
"learning_rate": 5.3233464730111426e-06,
"loss": 0.6843,
"step": 317
},
{
"epoch": 1.6050420168067228,
"grad_norm": 1.3756368583869594,
"learning_rate": 5.29398695941876e-06,
"loss": 0.6956,
"step": 318
},
{
"epoch": 1.6100840336134454,
"grad_norm": 1.354906917799083,
"learning_rate": 5.2646172706008154e-06,
"loss": 0.5865,
"step": 319
},
{
"epoch": 1.615126050420168,
"grad_norm": 1.283604806155226,
"learning_rate": 5.235238423075899e-06,
"loss": 0.6476,
"step": 320
},
{
"epoch": 1.6201680672268908,
"grad_norm": 1.3323430668544856,
"learning_rate": 5.20585143367959e-06,
"loss": 0.5978,
"step": 321
},
{
"epoch": 1.6252100840336134,
"grad_norm": 1.4432636768429228,
"learning_rate": 5.176457319529264e-06,
"loss": 0.7229,
"step": 322
},
{
"epoch": 1.6302521008403361,
"grad_norm": 1.3389659599587687,
"learning_rate": 5.147057097988898e-06,
"loss": 0.7036,
"step": 323
},
{
"epoch": 1.6352941176470588,
"grad_norm": 1.40224689957347,
"learning_rate": 5.1176517866338495e-06,
"loss": 0.6524,
"step": 324
},
{
"epoch": 1.6403361344537815,
"grad_norm": 1.448948508673923,
"learning_rate": 5.088242403215644e-06,
"loss": 0.6574,
"step": 325
},
{
"epoch": 1.6453781512605041,
"grad_norm": 1.4336192786572701,
"learning_rate": 5.058829965626742e-06,
"loss": 0.6649,
"step": 326
},
{
"epoch": 1.6504201680672268,
"grad_norm": 1.1551398885920936,
"learning_rate": 5.029415491865311e-06,
"loss": 0.6607,
"step": 327
},
{
"epoch": 1.6554621848739495,
"grad_norm": 1.4081755117550179,
"learning_rate": 5e-06,
"loss": 0.6308,
"step": 328
},
{
"epoch": 1.6605042016806721,
"grad_norm": 1.2962293823552042,
"learning_rate": 4.97058450813469e-06,
"loss": 0.6315,
"step": 329
},
{
"epoch": 1.6655462184873948,
"grad_norm": 1.2609233329938516,
"learning_rate": 4.94117003437326e-06,
"loss": 0.6453,
"step": 330
},
{
"epoch": 1.6705882352941175,
"grad_norm": 1.4395586718171531,
"learning_rate": 4.911757596784358e-06,
"loss": 0.7056,
"step": 331
},
{
"epoch": 1.6756302521008404,
"grad_norm": 1.490647265803814,
"learning_rate": 4.882348213366152e-06,
"loss": 0.7463,
"step": 332
},
{
"epoch": 1.680672268907563,
"grad_norm": 1.4744084173114673,
"learning_rate": 4.8529429020111035e-06,
"loss": 0.6518,
"step": 333
},
{
"epoch": 1.6857142857142857,
"grad_norm": 1.3256051086606053,
"learning_rate": 4.823542680470738e-06,
"loss": 0.6322,
"step": 334
},
{
"epoch": 1.6907563025210084,
"grad_norm": 1.4043201154667322,
"learning_rate": 4.794148566320412e-06,
"loss": 0.6623,
"step": 335
},
{
"epoch": 1.695798319327731,
"grad_norm": 1.3058283187944708,
"learning_rate": 4.7647615769241e-06,
"loss": 0.7233,
"step": 336
},
{
"epoch": 1.7008403361344537,
"grad_norm": 1.3709304051984876,
"learning_rate": 4.7353827293991845e-06,
"loss": 0.7237,
"step": 337
},
{
"epoch": 1.7058823529411766,
"grad_norm": 1.3476441152074792,
"learning_rate": 4.706013040581242e-06,
"loss": 0.6408,
"step": 338
},
{
"epoch": 1.7109243697478993,
"grad_norm": 1.4435937624188804,
"learning_rate": 4.676653526988858e-06,
"loss": 0.6647,
"step": 339
},
{
"epoch": 1.715966386554622,
"grad_norm": 1.3226553142476545,
"learning_rate": 4.647305204788445e-06,
"loss": 0.6489,
"step": 340
},
{
"epoch": 1.7210084033613446,
"grad_norm": 1.3388051536697478,
"learning_rate": 4.617969089759066e-06,
"loss": 0.6414,
"step": 341
},
{
"epoch": 1.7260504201680673,
"grad_norm": 1.369018029455846,
"learning_rate": 4.588646197257278e-06,
"loss": 0.6535,
"step": 342
},
{
"epoch": 1.73109243697479,
"grad_norm": 1.4137443784434733,
"learning_rate": 4.559337542181993e-06,
"loss": 0.6446,
"step": 343
},
{
"epoch": 1.7361344537815127,
"grad_norm": 1.3718987426836817,
"learning_rate": 4.53004413893935e-06,
"loss": 0.6477,
"step": 344
},
{
"epoch": 1.7411764705882353,
"grad_norm": 1.262236928246166,
"learning_rate": 4.500767001407604e-06,
"loss": 0.6059,
"step": 345
},
{
"epoch": 1.746218487394958,
"grad_norm": 1.3613528737566392,
"learning_rate": 4.471507142902036e-06,
"loss": 0.6545,
"step": 346
},
{
"epoch": 1.7512605042016807,
"grad_norm": 1.303211681985445,
"learning_rate": 4.4422655761398785e-06,
"loss": 0.633,
"step": 347
},
{
"epoch": 1.7563025210084033,
"grad_norm": 1.3262900181605304,
"learning_rate": 4.413043313205266e-06,
"loss": 0.6873,
"step": 348
},
{
"epoch": 1.761344537815126,
"grad_norm": 1.5014706286550592,
"learning_rate": 4.383841365514208e-06,
"loss": 0.6715,
"step": 349
},
{
"epoch": 1.7663865546218487,
"grad_norm": 1.3748458240376293,
"learning_rate": 4.354660743779575e-06,
"loss": 0.6322,
"step": 350
},
{
"epoch": 1.7714285714285714,
"grad_norm": 1.3200606309946945,
"learning_rate": 4.325502457976126e-06,
"loss": 0.6468,
"step": 351
},
{
"epoch": 1.776470588235294,
"grad_norm": 1.4363798100469027,
"learning_rate": 4.296367517305548e-06,
"loss": 0.6424,
"step": 352
},
{
"epoch": 1.7815126050420167,
"grad_norm": 1.3665833844005753,
"learning_rate": 4.267256930161523e-06,
"loss": 0.6895,
"step": 353
},
{
"epoch": 1.7865546218487394,
"grad_norm": 1.3126702843544444,
"learning_rate": 4.238171704094833e-06,
"loss": 0.6766,
"step": 354
},
{
"epoch": 1.791596638655462,
"grad_norm": 1.3931998076257006,
"learning_rate": 4.209112845778481e-06,
"loss": 0.7165,
"step": 355
},
{
"epoch": 1.7966386554621847,
"grad_norm": 1.4120182498478362,
"learning_rate": 4.180081360972852e-06,
"loss": 0.6909,
"step": 356
},
{
"epoch": 1.8016806722689076,
"grad_norm": 1.3825157448385343,
"learning_rate": 4.151078254490908e-06,
"loss": 0.6634,
"step": 357
},
{
"epoch": 1.8067226890756303,
"grad_norm": 1.2976324503271779,
"learning_rate": 4.122104530163397e-06,
"loss": 0.6482,
"step": 358
},
{
"epoch": 1.811764705882353,
"grad_norm": 1.3371821093594873,
"learning_rate": 4.09316119080412e-06,
"loss": 0.5939,
"step": 359
},
{
"epoch": 1.8168067226890756,
"grad_norm": 1.2815723486743216,
"learning_rate": 4.064249238175223e-06,
"loss": 0.5873,
"step": 360
},
{
"epoch": 1.8218487394957983,
"grad_norm": 1.2598876616725718,
"learning_rate": 4.035369672952516e-06,
"loss": 0.6211,
"step": 361
},
{
"epoch": 1.826890756302521,
"grad_norm": 1.3775558524100238,
"learning_rate": 4.0065234946908456e-06,
"loss": 0.6362,
"step": 362
},
{
"epoch": 1.8319327731092439,
"grad_norm": 1.3605455122282684,
"learning_rate": 3.977711701789499e-06,
"loss": 0.6173,
"step": 363
},
{
"epoch": 1.8369747899159665,
"grad_norm": 1.2800072707024852,
"learning_rate": 3.948935291457645e-06,
"loss": 0.6325,
"step": 364
},
{
"epoch": 1.8420168067226892,
"grad_norm": 1.3258336050686086,
"learning_rate": 3.920195259679822e-06,
"loss": 0.653,
"step": 365
},
{
"epoch": 1.8470588235294119,
"grad_norm": 1.3413446326047822,
"learning_rate": 3.891492601181462e-06,
"loss": 0.651,
"step": 366
},
{
"epoch": 1.8521008403361345,
"grad_norm": 1.41115994835795,
"learning_rate": 3.862828309394469e-06,
"loss": 0.6292,
"step": 367
},
{
"epoch": 1.8571428571428572,
"grad_norm": 1.3205359045412157,
"learning_rate": 3.834203376422831e-06,
"loss": 0.6064,
"step": 368
},
{
"epoch": 1.8621848739495799,
"grad_norm": 1.271016774529,
"learning_rate": 3.805618793008279e-06,
"loss": 0.6503,
"step": 369
},
{
"epoch": 1.8672268907563025,
"grad_norm": 1.38208148943542,
"learning_rate": 3.777075548496001e-06,
"loss": 0.673,
"step": 370
},
{
"epoch": 1.8722689075630252,
"grad_norm": 1.4627608316199674,
"learning_rate": 3.7485746308004013e-06,
"loss": 0.6853,
"step": 371
},
{
"epoch": 1.877310924369748,
"grad_norm": 1.2952312321525565,
"learning_rate": 3.7201170263709004e-06,
"loss": 0.6164,
"step": 372
},
{
"epoch": 1.8823529411764706,
"grad_norm": 1.4840833764786416,
"learning_rate": 3.6917037201577977e-06,
"loss": 0.6935,
"step": 373
},
{
"epoch": 1.8873949579831932,
"grad_norm": 1.371096887673559,
"learning_rate": 3.6633356955781827e-06,
"loss": 0.6571,
"step": 374
},
{
"epoch": 1.892436974789916,
"grad_norm": 1.1787569156110669,
"learning_rate": 3.635013934481895e-06,
"loss": 0.5976,
"step": 375
},
{
"epoch": 1.8974789915966386,
"grad_norm": 1.292415912438797,
"learning_rate": 3.6067394171175397e-06,
"loss": 0.662,
"step": 376
},
{
"epoch": 1.9025210084033612,
"grad_norm": 1.4004270726912136,
"learning_rate": 3.578513122098566e-06,
"loss": 0.6902,
"step": 377
},
{
"epoch": 1.907563025210084,
"grad_norm": 1.3676893820953542,
"learning_rate": 3.5503360263693887e-06,
"loss": 0.6736,
"step": 378
},
{
"epoch": 1.9126050420168066,
"grad_norm": 1.5497019666472422,
"learning_rate": 3.5222091051715803e-06,
"loss": 0.6474,
"step": 379
},
{
"epoch": 1.9176470588235293,
"grad_norm": 1.4107058784966016,
"learning_rate": 3.4941333320101173e-06,
"loss": 0.6214,
"step": 380
},
{
"epoch": 1.9226890756302522,
"grad_norm": 1.3074693513299003,
"learning_rate": 3.466109678619681e-06,
"loss": 0.5863,
"step": 381
},
{
"epoch": 1.9277310924369748,
"grad_norm": 1.2533065740051568,
"learning_rate": 3.4381391149310294e-06,
"loss": 0.6145,
"step": 382
},
{
"epoch": 1.9327731092436975,
"grad_norm": 1.279932965905714,
"learning_rate": 3.4102226090374246e-06,
"loss": 0.6138,
"step": 383
},
{
"epoch": 1.9378151260504202,
"grad_norm": 1.279194036152673,
"learning_rate": 3.3823611271611266e-06,
"loss": 0.6051,
"step": 384
},
{
"epoch": 1.9428571428571428,
"grad_norm": 1.4523883672700335,
"learning_rate": 3.35455563361995e-06,
"loss": 0.6475,
"step": 385
},
{
"epoch": 1.9478991596638655,
"grad_norm": 1.319917640705539,
"learning_rate": 3.3268070907938915e-06,
"loss": 0.575,
"step": 386
},
{
"epoch": 1.9529411764705882,
"grad_norm": 1.356219744351625,
"learning_rate": 3.2991164590918162e-06,
"loss": 0.6707,
"step": 387
},
{
"epoch": 1.957983193277311,
"grad_norm": 1.3980927144998019,
"learning_rate": 3.271484696918218e-06,
"loss": 0.62,
"step": 388
},
{
"epoch": 1.9630252100840337,
"grad_norm": 1.3412194145756722,
"learning_rate": 3.2439127606400546e-06,
"loss": 0.6249,
"step": 389
},
{
"epoch": 1.9680672268907564,
"grad_norm": 1.231905550971943,
"learning_rate": 3.2164016045536306e-06,
"loss": 0.6542,
"step": 390
},
{
"epoch": 1.973109243697479,
"grad_norm": 1.3549695794420435,
"learning_rate": 3.1889521808515888e-06,
"loss": 0.6176,
"step": 391
},
{
"epoch": 1.9781512605042018,
"grad_norm": 1.415166811994311,
"learning_rate": 3.1615654395899377e-06,
"loss": 0.6593,
"step": 392
},
{
"epoch": 1.9831932773109244,
"grad_norm": 1.3126591809141124,
"learning_rate": 3.1342423286551756e-06,
"loss": 0.6891,
"step": 393
},
{
"epoch": 1.988235294117647,
"grad_norm": 1.3842054436860431,
"learning_rate": 3.1069837937314846e-06,
"loss": 0.6342,
"step": 394
},
{
"epoch": 1.9932773109243698,
"grad_norm": 1.4424046044230687,
"learning_rate": 3.0797907782679944e-06,
"loss": 0.6461,
"step": 395
},
{
"epoch": 1.9983193277310924,
"grad_norm": 1.3718751038472339,
"learning_rate": 3.0526642234461313e-06,
"loss": 0.6338,
"step": 396
},
{
"epoch": 2.0050420168067227,
"grad_norm": 3.363833604785768,
"learning_rate": 3.0256050681470446e-06,
"loss": 1.2006,
"step": 397
},
{
"epoch": 2.0100840336134453,
"grad_norm": 1.410375521884215,
"learning_rate": 2.9986142489191074e-06,
"loss": 0.5121,
"step": 398
},
{
"epoch": 2.015126050420168,
"grad_norm": 1.463355598251907,
"learning_rate": 2.971692699945502e-06,
"loss": 0.4394,
"step": 399
},
{
"epoch": 2.0201680672268907,
"grad_norm": 1.2914998337098158,
"learning_rate": 2.9448413530118912e-06,
"loss": 0.4978,
"step": 400
},
{
"epoch": 2.0252100840336134,
"grad_norm": 1.3604150815997402,
"learning_rate": 2.9180611374741623e-06,
"loss": 0.4689,
"step": 401
},
{
"epoch": 2.030252100840336,
"grad_norm": 1.1964953052023972,
"learning_rate": 2.891352980226262e-06,
"loss": 0.5015,
"step": 402
},
{
"epoch": 2.0352941176470587,
"grad_norm": 1.1694739760631343,
"learning_rate": 2.8647178056681197e-06,
"loss": 0.447,
"step": 403
},
{
"epoch": 2.0403361344537814,
"grad_norm": 1.3174590682003549,
"learning_rate": 2.838156535673652e-06,
"loss": 0.414,
"step": 404
},
{
"epoch": 2.045378151260504,
"grad_norm": 1.2140198128144435,
"learning_rate": 2.8116700895588473e-06,
"loss": 0.4505,
"step": 405
},
{
"epoch": 2.0504201680672267,
"grad_norm": 1.3398119898455612,
"learning_rate": 2.785259384049959e-06,
"loss": 0.4532,
"step": 406
},
{
"epoch": 2.0554621848739494,
"grad_norm": 1.4229930176202614,
"learning_rate": 2.7589253332517736e-06,
"loss": 0.5546,
"step": 407
},
{
"epoch": 2.060504201680672,
"grad_norm": 1.4684509907326317,
"learning_rate": 2.7326688486159613e-06,
"loss": 0.5254,
"step": 408
},
{
"epoch": 2.065546218487395,
"grad_norm": 1.4962520925453975,
"learning_rate": 2.706490838909547e-06,
"loss": 0.4673,
"step": 409
},
{
"epoch": 2.070588235294118,
"grad_norm": 1.3630229586386085,
"learning_rate": 2.680392210183446e-06,
"loss": 0.4473,
"step": 410
},
{
"epoch": 2.0756302521008405,
"grad_norm": 1.38978907137299,
"learning_rate": 2.6543738657411033e-06,
"loss": 0.5159,
"step": 411
},
{
"epoch": 2.080672268907563,
"grad_norm": 1.429662885547244,
"learning_rate": 2.628436706107238e-06,
"loss": 0.5161,
"step": 412
},
{
"epoch": 2.085714285714286,
"grad_norm": 1.394356185017467,
"learning_rate": 2.6025816289966703e-06,
"loss": 0.5032,
"step": 413
},
{
"epoch": 2.0907563025210085,
"grad_norm": 1.480088664868798,
"learning_rate": 2.5768095292832412e-06,
"loss": 0.4802,
"step": 414
},
{
"epoch": 2.095798319327731,
"grad_norm": 1.3859048551297604,
"learning_rate": 2.5511212989688587e-06,
"loss": 0.4993,
"step": 415
},
{
"epoch": 2.100840336134454,
"grad_norm": 1.440430022618694,
"learning_rate": 2.525517827152614e-06,
"loss": 0.4551,
"step": 416
},
{
"epoch": 2.1058823529411765,
"grad_norm": 1.4332550806993916,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.5611,
"step": 417
},
{
"epoch": 2.110924369747899,
"grad_norm": 1.3161188350792523,
"learning_rate": 2.4745687007122636e-06,
"loss": 0.4602,
"step": 418
},
{
"epoch": 2.115966386554622,
"grad_norm": 1.4145836319136063,
"learning_rate": 2.449224809495815e-06,
"loss": 0.4464,
"step": 419
},
{
"epoch": 2.1210084033613446,
"grad_norm": 1.3638972016864883,
"learning_rate": 2.423969203531768e-06,
"loss": 0.4625,
"step": 420
},
{
"epoch": 2.1260504201680672,
"grad_norm": 1.4282920146552893,
"learning_rate": 2.3988027569455895e-06,
"loss": 0.4809,
"step": 421
},
{
"epoch": 2.13109243697479,
"grad_norm": 1.452704091304085,
"learning_rate": 2.373726340776837e-06,
"loss": 0.4959,
"step": 422
},
{
"epoch": 2.1361344537815126,
"grad_norm": 1.4474065940760683,
"learning_rate": 2.348740822949006e-06,
"loss": 0.4557,
"step": 423
},
{
"epoch": 2.1411764705882352,
"grad_norm": 1.406883162238408,
"learning_rate": 2.323847068239504e-06,
"loss": 0.5069,
"step": 424
},
{
"epoch": 2.146218487394958,
"grad_norm": 1.4713827636564831,
"learning_rate": 2.2990459382497086e-06,
"loss": 0.4813,
"step": 425
},
{
"epoch": 2.1512605042016806,
"grad_norm": 1.4582227343532888,
"learning_rate": 2.274338291375147e-06,
"loss": 0.462,
"step": 426
},
{
"epoch": 2.1563025210084033,
"grad_norm": 1.353197229608169,
"learning_rate": 2.2497249827757933e-06,
"loss": 0.4658,
"step": 427
},
{
"epoch": 2.161344537815126,
"grad_norm": 1.3550947330778897,
"learning_rate": 2.225206864346465e-06,
"loss": 0.5794,
"step": 428
},
{
"epoch": 2.1663865546218486,
"grad_norm": 1.4137143069445475,
"learning_rate": 2.2007847846873342e-06,
"loss": 0.4722,
"step": 429
},
{
"epoch": 2.1714285714285713,
"grad_norm": 1.2932234077066185,
"learning_rate": 2.176459589074566e-06,
"loss": 0.4369,
"step": 430
},
{
"epoch": 2.176470588235294,
"grad_norm": 1.3725308971047603,
"learning_rate": 2.1522321194310577e-06,
"loss": 0.4958,
"step": 431
},
{
"epoch": 2.1815126050420166,
"grad_norm": 1.4324324040918073,
"learning_rate": 2.1281032142972933e-06,
"loss": 0.4954,
"step": 432
},
{
"epoch": 2.1865546218487397,
"grad_norm": 1.4153168395436235,
"learning_rate": 2.1040737088023323e-06,
"loss": 0.4457,
"step": 433
},
{
"epoch": 2.1915966386554624,
"grad_norm": 1.3341155055487035,
"learning_rate": 2.080144434634898e-06,
"loss": 0.5017,
"step": 434
},
{
"epoch": 2.196638655462185,
"grad_norm": 1.352939614197411,
"learning_rate": 2.056316220014588e-06,
"loss": 0.4553,
"step": 435
},
{
"epoch": 2.2016806722689077,
"grad_norm": 1.393182470026338,
"learning_rate": 2.0325898896632178e-06,
"loss": 0.4448,
"step": 436
},
{
"epoch": 2.2067226890756304,
"grad_norm": 1.4033955608191793,
"learning_rate": 2.0089662647762716e-06,
"loss": 0.441,
"step": 437
},
{
"epoch": 2.211764705882353,
"grad_norm": 1.41226298350313,
"learning_rate": 1.9854461629944764e-06,
"loss": 0.4656,
"step": 438
},
{
"epoch": 2.2168067226890757,
"grad_norm": 1.3512621478929514,
"learning_rate": 1.962030398375506e-06,
"loss": 0.5245,
"step": 439
},
{
"epoch": 2.2218487394957984,
"grad_norm": 1.3932479184910864,
"learning_rate": 1.9387197813658092e-06,
"loss": 0.456,
"step": 440
},
{
"epoch": 2.226890756302521,
"grad_norm": 1.3400595100259751,
"learning_rate": 1.915515118772555e-06,
"loss": 0.4622,
"step": 441
},
{
"epoch": 2.2319327731092438,
"grad_norm": 1.3239101426319217,
"learning_rate": 1.8924172137357038e-06,
"loss": 0.4821,
"step": 442
},
{
"epoch": 2.2369747899159664,
"grad_norm": 1.4028557110251756,
"learning_rate": 1.8694268657002197e-06,
"loss": 0.4592,
"step": 443
},
{
"epoch": 2.242016806722689,
"grad_norm": 1.4043326661254716,
"learning_rate": 1.8465448703883959e-06,
"loss": 0.4642,
"step": 444
},
{
"epoch": 2.2470588235294118,
"grad_norm": 1.4748018123002309,
"learning_rate": 1.8237720197723075e-06,
"loss": 0.5244,
"step": 445
},
{
"epoch": 2.2521008403361344,
"grad_norm": 1.3653204295657917,
"learning_rate": 1.8011091020464138e-06,
"loss": 0.5117,
"step": 446
},
{
"epoch": 2.257142857142857,
"grad_norm": 1.4578979263769525,
"learning_rate": 1.7785569016002686e-06,
"loss": 0.4622,
"step": 447
},
{
"epoch": 2.26218487394958,
"grad_norm": 1.4739147697577966,
"learning_rate": 1.75611619899137e-06,
"loss": 0.4524,
"step": 448
},
{
"epoch": 2.2672268907563025,
"grad_norm": 1.3465934593186815,
"learning_rate": 1.7337877709181527e-06,
"loss": 0.4616,
"step": 449
},
{
"epoch": 2.272268907563025,
"grad_norm": 1.4287084373091115,
"learning_rate": 1.711572390193102e-06,
"loss": 0.6594,
"step": 450
},
{
"epoch": 2.277310924369748,
"grad_norm": 1.3274840093520053,
"learning_rate": 1.689470825715998e-06,
"loss": 0.4529,
"step": 451
},
{
"epoch": 2.2823529411764705,
"grad_norm": 1.4216422105253623,
"learning_rate": 1.6674838424473172e-06,
"loss": 0.4655,
"step": 452
},
{
"epoch": 2.287394957983193,
"grad_norm": 1.452303728671861,
"learning_rate": 1.6456122013817477e-06,
"loss": 0.4625,
"step": 453
},
{
"epoch": 2.292436974789916,
"grad_norm": 1.4369743256615972,
"learning_rate": 1.6238566595218475e-06,
"loss": 0.4761,
"step": 454
},
{
"epoch": 2.2974789915966385,
"grad_norm": 1.407023006658543,
"learning_rate": 1.6022179698518525e-06,
"loss": 0.4505,
"step": 455
},
{
"epoch": 2.302521008403361,
"grad_norm": 1.391039540718536,
"learning_rate": 1.580696881311611e-06,
"loss": 0.4894,
"step": 456
},
{
"epoch": 2.307563025210084,
"grad_norm": 1.3557281771597436,
"learning_rate": 1.5592941387706562e-06,
"loss": 0.4108,
"step": 457
},
{
"epoch": 2.3126050420168065,
"grad_norm": 1.3010131467886796,
"learning_rate": 1.538010483002435e-06,
"loss": 0.425,
"step": 458
},
{
"epoch": 2.317647058823529,
"grad_norm": 1.3625069219769537,
"learning_rate": 1.5168466506586654e-06,
"loss": 0.4431,
"step": 459
},
{
"epoch": 2.3226890756302523,
"grad_norm": 1.2997097389936179,
"learning_rate": 1.4958033742438348e-06,
"loss": 0.4058,
"step": 460
},
{
"epoch": 2.327731092436975,
"grad_norm": 1.3546221586310845,
"learning_rate": 1.4748813820898554e-06,
"loss": 0.5043,
"step": 461
},
{
"epoch": 2.3327731092436976,
"grad_norm": 1.3503940282999218,
"learning_rate": 1.454081398330855e-06,
"loss": 0.5015,
"step": 462
},
{
"epoch": 2.3378151260504203,
"grad_norm": 1.2879127697899735,
"learning_rate": 1.4334041428781003e-06,
"loss": 0.4219,
"step": 463
},
{
"epoch": 2.342857142857143,
"grad_norm": 1.5900890446730591,
"learning_rate": 1.4128503313951008e-06,
"loss": 0.5508,
"step": 464
},
{
"epoch": 2.3478991596638656,
"grad_norm": 1.4693275041182954,
"learning_rate": 1.3924206752728282e-06,
"loss": 0.5196,
"step": 465
},
{
"epoch": 2.3529411764705883,
"grad_norm": 1.3739526563603481,
"learning_rate": 1.3721158816050872e-06,
"loss": 0.5223,
"step": 466
},
{
"epoch": 2.357983193277311,
"grad_norm": 1.2888756368302696,
"learning_rate": 1.3519366531640589e-06,
"loss": 0.4745,
"step": 467
},
{
"epoch": 2.3630252100840337,
"grad_norm": 1.3646861171520672,
"learning_rate": 1.3318836883759634e-06,
"loss": 0.4765,
"step": 468
},
{
"epoch": 2.3680672268907563,
"grad_norm": 1.3876282049959663,
"learning_rate": 1.3119576812968893e-06,
"loss": 0.4552,
"step": 469
},
{
"epoch": 2.373109243697479,
"grad_norm": 1.3212811305037033,
"learning_rate": 1.292159321588778e-06,
"loss": 0.4444,
"step": 470
},
{
"epoch": 2.3781512605042017,
"grad_norm": 1.4025656868262555,
"learning_rate": 1.272489294495548e-06,
"loss": 0.5373,
"step": 471
},
{
"epoch": 2.3831932773109243,
"grad_norm": 1.3992039142572703,
"learning_rate": 1.252948280819375e-06,
"loss": 0.4297,
"step": 472
},
{
"epoch": 2.388235294117647,
"grad_norm": 1.438194701698973,
"learning_rate": 1.2335369568971362e-06,
"loss": 0.4577,
"step": 473
},
{
"epoch": 2.3932773109243697,
"grad_norm": 1.3560235059252677,
"learning_rate": 1.2142559945769995e-06,
"loss": 0.4576,
"step": 474
},
{
"epoch": 2.3983193277310924,
"grad_norm": 1.357949004614199,
"learning_rate": 1.1951060611951615e-06,
"loss": 0.5944,
"step": 475
},
{
"epoch": 2.403361344537815,
"grad_norm": 1.2895013043643404,
"learning_rate": 1.1760878195527642e-06,
"loss": 0.4192,
"step": 476
},
{
"epoch": 2.4084033613445377,
"grad_norm": 1.2608640104913673,
"learning_rate": 1.1572019278929457e-06,
"loss": 0.4431,
"step": 477
},
{
"epoch": 2.4134453781512604,
"grad_norm": 1.4235058216914491,
"learning_rate": 1.1384490398780563e-06,
"loss": 0.4835,
"step": 478
},
{
"epoch": 2.418487394957983,
"grad_norm": 1.3849158950764375,
"learning_rate": 1.1198298045670402e-06,
"loss": 0.4497,
"step": 479
},
{
"epoch": 2.4235294117647057,
"grad_norm": 1.4243621054419897,
"learning_rate": 1.1013448663929704e-06,
"loss": 0.5031,
"step": 480
},
{
"epoch": 2.4285714285714284,
"grad_norm": 1.2997464135987702,
"learning_rate": 1.0829948651407374e-06,
"loss": 0.483,
"step": 481
},
{
"epoch": 2.4336134453781515,
"grad_norm": 1.2887117802326669,
"learning_rate": 1.0647804359249143e-06,
"loss": 0.4424,
"step": 482
},
{
"epoch": 2.438655462184874,
"grad_norm": 1.2955280324064098,
"learning_rate": 1.0467022091677692e-06,
"loss": 0.4963,
"step": 483
},
{
"epoch": 2.443697478991597,
"grad_norm": 1.5695989821047664,
"learning_rate": 1.0287608105774456e-06,
"loss": 0.512,
"step": 484
},
{
"epoch": 2.4487394957983195,
"grad_norm": 1.3900121464168351,
"learning_rate": 1.0109568611263094e-06,
"loss": 0.4418,
"step": 485
},
{
"epoch": 2.453781512605042,
"grad_norm": 1.443290081700745,
"learning_rate": 9.932909770294542e-07,
"loss": 0.4439,
"step": 486
},
{
"epoch": 2.458823529411765,
"grad_norm": 1.3476484251272791,
"learning_rate": 9.757637697233723e-07,
"loss": 0.4885,
"step": 487
},
{
"epoch": 2.4638655462184875,
"grad_norm": 1.3389474168899225,
"learning_rate": 9.58375845844793e-07,
"loss": 0.4486,
"step": 488
},
{
"epoch": 2.46890756302521,
"grad_norm": 1.2353966317116258,
"learning_rate": 9.41127807209688e-07,
"loss": 0.4321,
"step": 489
},
{
"epoch": 2.473949579831933,
"grad_norm": 1.2849383161233021,
"learning_rate": 9.240202507924412e-07,
"loss": 0.433,
"step": 490
},
{
"epoch": 2.4789915966386555,
"grad_norm": 1.3336087651970685,
"learning_rate": 9.070537687051817e-07,
"loss": 0.4516,
"step": 491
},
{
"epoch": 2.484033613445378,
"grad_norm": 1.3550057200939567,
"learning_rate": 8.902289481772996e-07,
"loss": 0.4616,
"step": 492
},
{
"epoch": 2.489075630252101,
"grad_norm": 1.3590095983206505,
"learning_rate": 8.735463715351139e-07,
"loss": 0.4203,
"step": 493
},
{
"epoch": 2.4941176470588236,
"grad_norm": 1.2915320514796769,
"learning_rate": 8.570066161817176e-07,
"loss": 0.4503,
"step": 494
},
{
"epoch": 2.499159663865546,
"grad_norm": 1.2679676777389248,
"learning_rate": 8.406102545769989e-07,
"loss": 0.4566,
"step": 495
},
{
"epoch": 2.504201680672269,
"grad_norm": 1.426642729326135,
"learning_rate": 8.243578542178227e-07,
"loss": 0.4707,
"step": 496
},
{
"epoch": 2.5092436974789916,
"grad_norm": 1.4592108582229681,
"learning_rate": 8.082499776183883e-07,
"loss": 0.4845,
"step": 497
},
{
"epoch": 2.5142857142857142,
"grad_norm": 1.5266839034291377,
"learning_rate": 7.922871822907641e-07,
"loss": 0.5228,
"step": 498
},
{
"epoch": 2.519327731092437,
"grad_norm": 1.471645595600825,
"learning_rate": 7.764700207255904e-07,
"loss": 0.4173,
"step": 499
},
{
"epoch": 2.5243697478991596,
"grad_norm": 1.3871858021840573,
"learning_rate": 7.607990403729526e-07,
"loss": 0.4601,
"step": 500
},
{
"epoch": 2.5294117647058822,
"grad_norm": 1.3138350820905274,
"learning_rate": 7.452747836234392e-07,
"loss": 0.4504,
"step": 501
},
{
"epoch": 2.534453781512605,
"grad_norm": 1.2975304324598231,
"learning_rate": 7.298977877893688e-07,
"loss": 0.4265,
"step": 502
},
{
"epoch": 2.5394957983193276,
"grad_norm": 1.3447001192643702,
"learning_rate": 7.146685850861851e-07,
"loss": 0.466,
"step": 503
},
{
"epoch": 2.5445378151260503,
"grad_norm": 1.3862420743153665,
"learning_rate": 6.995877026140468e-07,
"loss": 0.4884,
"step": 504
},
{
"epoch": 2.549579831932773,
"grad_norm": 1.4032983423284162,
"learning_rate": 6.846556623395795e-07,
"loss": 0.4948,
"step": 505
},
{
"epoch": 2.5546218487394956,
"grad_norm": 1.362120295068725,
"learning_rate": 6.698729810778065e-07,
"loss": 0.4702,
"step": 506
},
{
"epoch": 2.5596638655462183,
"grad_norm": 1.389808913275814,
"learning_rate": 6.552401704742678e-07,
"loss": 0.4825,
"step": 507
},
{
"epoch": 2.564705882352941,
"grad_norm": 1.2860994495581453,
"learning_rate": 6.40757736987307e-07,
"loss": 0.4321,
"step": 508
},
{
"epoch": 2.5697478991596636,
"grad_norm": 1.212606448511892,
"learning_rate": 6.26426181870542e-07,
"loss": 0.3868,
"step": 509
},
{
"epoch": 2.5747899159663863,
"grad_norm": 1.2670489383748516,
"learning_rate": 6.122460011555187e-07,
"loss": 0.4532,
"step": 510
},
{
"epoch": 2.5798319327731094,
"grad_norm": 1.3801554590726837,
"learning_rate": 5.982176856345445e-07,
"loss": 0.4263,
"step": 511
},
{
"epoch": 2.584873949579832,
"grad_norm": 1.3394504151016333,
"learning_rate": 5.843417208436908e-07,
"loss": 0.496,
"step": 512
},
{
"epoch": 2.5899159663865547,
"grad_norm": 1.2955707760211432,
"learning_rate": 5.706185870460018e-07,
"loss": 0.4253,
"step": 513
},
{
"epoch": 2.5949579831932774,
"grad_norm": 1.289481906227215,
"learning_rate": 5.570487592148666e-07,
"loss": 0.4035,
"step": 514
},
{
"epoch": 2.6,
"grad_norm": 1.3376266312340062,
"learning_rate": 5.436327070175729e-07,
"loss": 0.4545,
"step": 515
},
{
"epoch": 2.6050420168067228,
"grad_norm": 1.4001675009701846,
"learning_rate": 5.303708947990638e-07,
"loss": 0.4684,
"step": 516
},
{
"epoch": 2.6100840336134454,
"grad_norm": 1.4896915805848956,
"learning_rate": 5.172637815658583e-07,
"loss": 0.4704,
"step": 517
},
{
"epoch": 2.615126050420168,
"grad_norm": 1.430686916061002,
"learning_rate": 5.04311820970163e-07,
"loss": 0.4782,
"step": 518
},
{
"epoch": 2.6201680672268908,
"grad_norm": 1.3676105828350056,
"learning_rate": 4.915154612941781e-07,
"loss": 0.5979,
"step": 519
},
{
"epoch": 2.6252100840336134,
"grad_norm": 1.3552413071380474,
"learning_rate": 4.788751454345763e-07,
"loss": 0.4405,
"step": 520
},
{
"epoch": 2.630252100840336,
"grad_norm": 1.320913107468769,
"learning_rate": 4.663913108871726e-07,
"loss": 0.4105,
"step": 521
},
{
"epoch": 2.635294117647059,
"grad_norm": 1.2848967010536776,
"learning_rate": 4.540643897317887e-07,
"loss": 0.3934,
"step": 522
},
{
"epoch": 2.6403361344537815,
"grad_norm": 1.3500509189164658,
"learning_rate": 4.4189480861729137e-07,
"loss": 0.4339,
"step": 523
},
{
"epoch": 2.645378151260504,
"grad_norm": 1.3387080610453355,
"learning_rate": 4.2988298874682754e-07,
"loss": 0.4552,
"step": 524
},
{
"epoch": 2.650420168067227,
"grad_norm": 1.3397812410356982,
"learning_rate": 4.1802934586324897e-07,
"loss": 0.5401,
"step": 525
},
{
"epoch": 2.6554621848739495,
"grad_norm": 1.446011629760243,
"learning_rate": 4.0633429023472004e-07,
"loss": 0.5409,
"step": 526
},
{
"epoch": 2.660504201680672,
"grad_norm": 1.3710949034220614,
"learning_rate": 3.947982266405159e-07,
"loss": 0.501,
"step": 527
},
{
"epoch": 2.665546218487395,
"grad_norm": 1.5073033115483478,
"learning_rate": 3.834215543570191e-07,
"loss": 0.5156,
"step": 528
},
{
"epoch": 2.6705882352941175,
"grad_norm": 1.3549599833015573,
"learning_rate": 3.72204667143895e-07,
"loss": 0.4667,
"step": 529
},
{
"epoch": 2.6756302521008406,
"grad_norm": 1.368632751852017,
"learning_rate": 3.611479532304618e-07,
"loss": 0.4596,
"step": 530
},
{
"epoch": 2.6806722689075633,
"grad_norm": 1.3310734620781681,
"learning_rate": 3.5025179530225995e-07,
"loss": 0.4248,
"step": 531
},
{
"epoch": 2.685714285714286,
"grad_norm": 1.429961991715737,
"learning_rate": 3.395165704878023e-07,
"loss": 0.4921,
"step": 532
},
{
"epoch": 2.6907563025210086,
"grad_norm": 1.3220689464603654,
"learning_rate": 3.289426503455201e-07,
"loss": 0.4686,
"step": 533
},
{
"epoch": 2.6957983193277313,
"grad_norm": 1.3596446823078556,
"learning_rate": 3.185304008509077e-07,
"loss": 0.4692,
"step": 534
},
{
"epoch": 2.700840336134454,
"grad_norm": 1.2664017870580138,
"learning_rate": 3.082801823838527e-07,
"loss": 0.4792,
"step": 535
},
{
"epoch": 2.7058823529411766,
"grad_norm": 1.277008676617942,
"learning_rate": 2.9819234971616154e-07,
"loss": 0.4496,
"step": 536
},
{
"epoch": 2.7109243697478993,
"grad_norm": 1.3031675483473417,
"learning_rate": 2.882672519992824e-07,
"loss": 0.4599,
"step": 537
},
{
"epoch": 2.715966386554622,
"grad_norm": 1.475285425023621,
"learning_rate": 2.785052327522214e-07,
"loss": 0.5562,
"step": 538
},
{
"epoch": 2.7210084033613446,
"grad_norm": 1.2387397112349467,
"learning_rate": 2.6890662984965234e-07,
"loss": 0.4508,
"step": 539
},
{
"epoch": 2.7260504201680673,
"grad_norm": 1.2769755883493084,
"learning_rate": 2.594717755102205e-07,
"loss": 0.4497,
"step": 540
},
{
"epoch": 2.73109243697479,
"grad_norm": 1.4117553058680856,
"learning_rate": 2.5020099628504603e-07,
"loss": 0.4176,
"step": 541
},
{
"epoch": 2.7361344537815127,
"grad_norm": 1.3430474164461437,
"learning_rate": 2.4109461304642254e-07,
"loss": 0.61,
"step": 542
},
{
"epoch": 2.7411764705882353,
"grad_norm": 1.319429861827343,
"learning_rate": 2.3215294097670927e-07,
"loss": 0.4451,
"step": 543
},
{
"epoch": 2.746218487394958,
"grad_norm": 1.436920605125832,
"learning_rate": 2.2337628955742263e-07,
"loss": 0.4874,
"step": 544
},
{
"epoch": 2.7512605042016807,
"grad_norm": 1.3812471581213166,
"learning_rate": 2.1476496255852685e-07,
"loss": 0.382,
"step": 545
},
{
"epoch": 2.7563025210084033,
"grad_norm": 1.205494792014491,
"learning_rate": 2.0631925802791608e-07,
"loss": 0.5224,
"step": 546
},
{
"epoch": 2.761344537815126,
"grad_norm": 1.3083334014447827,
"learning_rate": 1.9803946828110376e-07,
"loss": 0.5117,
"step": 547
},
{
"epoch": 2.7663865546218487,
"grad_norm": 1.3758887119834913,
"learning_rate": 1.8992587989110133e-07,
"loss": 0.4898,
"step": 548
},
{
"epoch": 2.7714285714285714,
"grad_norm": 1.3436017213466456,
"learning_rate": 1.8197877367849948e-07,
"loss": 0.5596,
"step": 549
},
{
"epoch": 2.776470588235294,
"grad_norm": 1.4507659924194913,
"learning_rate": 1.7419842470175196e-07,
"loss": 0.4889,
"step": 550
},
{
"epoch": 2.7815126050420167,
"grad_norm": 1.5070411133243147,
"learning_rate": 1.6658510224765333e-07,
"loss": 0.47,
"step": 551
},
{
"epoch": 2.7865546218487394,
"grad_norm": 1.3934953281445221,
"learning_rate": 1.5913906982201744e-07,
"loss": 0.4626,
"step": 552
},
{
"epoch": 2.791596638655462,
"grad_norm": 1.4300047982632422,
"learning_rate": 1.5186058514055912e-07,
"loss": 0.4808,
"step": 553
},
{
"epoch": 2.7966386554621847,
"grad_norm": 1.3007207174809041,
"learning_rate": 1.447499001199748e-07,
"loss": 0.5228,
"step": 554
},
{
"epoch": 2.8016806722689074,
"grad_norm": 1.335166451449638,
"learning_rate": 1.3780726086922103e-07,
"loss": 0.5314,
"step": 555
},
{
"epoch": 2.80672268907563,
"grad_norm": 1.2727049723883297,
"learning_rate": 1.3103290768099796e-07,
"loss": 0.4538,
"step": 556
},
{
"epoch": 2.8117647058823527,
"grad_norm": 1.4233653924829766,
"learning_rate": 1.244270750234333e-07,
"loss": 0.4768,
"step": 557
},
{
"epoch": 2.8168067226890754,
"grad_norm": 1.4089563114452142,
"learning_rate": 1.1798999153196433e-07,
"loss": 0.4543,
"step": 558
},
{
"epoch": 2.821848739495798,
"grad_norm": 1.3596745441590257,
"learning_rate": 1.1172188000142803e-07,
"loss": 0.5016,
"step": 559
},
{
"epoch": 2.8268907563025207,
"grad_norm": 1.3375081145484837,
"learning_rate": 1.0562295737834738e-07,
"loss": 0.47,
"step": 560
},
{
"epoch": 2.831932773109244,
"grad_norm": 1.3797076618818533,
"learning_rate": 9.969343475342285e-08,
"loss": 0.4762,
"step": 561
},
{
"epoch": 2.8369747899159665,
"grad_norm": 1.4014527371585839,
"learning_rate": 9.393351735422773e-08,
"loss": 0.4606,
"step": 562
},
{
"epoch": 2.842016806722689,
"grad_norm": 1.317969883356561,
"learning_rate": 8.834340453810375e-08,
"loss": 0.4353,
"step": 563
},
{
"epoch": 2.847058823529412,
"grad_norm": 1.3062183016322855,
"learning_rate": 8.29232897852611e-08,
"loss": 0.3857,
"step": 564
},
{
"epoch": 2.8521008403361345,
"grad_norm": 1.3280320137002732,
"learning_rate": 7.76733606920832e-08,
"loss": 0.4572,
"step": 565
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.4128418670110612,
"learning_rate": 7.259379896463248e-08,
"loss": 0.4476,
"step": 566
},
{
"epoch": 2.86218487394958,
"grad_norm": 1.3977595292294513,
"learning_rate": 6.768478041236037e-08,
"loss": 0.4436,
"step": 567
},
{
"epoch": 2.8672268907563025,
"grad_norm": 1.3855652086248782,
"learning_rate": 6.294647494202444e-08,
"loss": 0.4346,
"step": 568
},
{
"epoch": 2.872268907563025,
"grad_norm": 1.3251986287781006,
"learning_rate": 5.8379046551807486e-08,
"loss": 0.493,
"step": 569
},
{
"epoch": 2.877310924369748,
"grad_norm": 1.32087943884219,
"learning_rate": 5.398265332563935e-08,
"loss": 0.4551,
"step": 570
},
{
"epoch": 2.8823529411764706,
"grad_norm": 1.2437729277991256,
"learning_rate": 4.975744742772848e-08,
"loss": 0.4098,
"step": 571
},
{
"epoch": 2.8873949579831932,
"grad_norm": 1.340919476266603,
"learning_rate": 4.5703575097292286e-08,
"loss": 0.4726,
"step": 572
},
{
"epoch": 2.892436974789916,
"grad_norm": 1.2461844948007363,
"learning_rate": 4.182117664349783e-08,
"loss": 0.449,
"step": 573
},
{
"epoch": 2.8974789915966386,
"grad_norm": 1.3240662502351237,
"learning_rate": 3.8110386440605164e-08,
"loss": 0.4603,
"step": 574
},
{
"epoch": 2.9025210084033612,
"grad_norm": 1.3494315545656852,
"learning_rate": 3.457133292331494e-08,
"loss": 0.5058,
"step": 575
},
{
"epoch": 2.907563025210084,
"grad_norm": 1.3389143724686245,
"learning_rate": 3.120413858232474e-08,
"loss": 0.4578,
"step": 576
},
{
"epoch": 2.9126050420168066,
"grad_norm": 1.344475790060752,
"learning_rate": 2.8008919960090253e-08,
"loss": 0.5347,
"step": 577
},
{
"epoch": 2.9176470588235293,
"grad_norm": 1.388286539991785,
"learning_rate": 2.4985787646788497e-08,
"loss": 0.4792,
"step": 578
},
{
"epoch": 2.9226890756302524,
"grad_norm": 1.4667343155241181,
"learning_rate": 2.2134846276494205e-08,
"loss": 0.4854,
"step": 579
},
{
"epoch": 2.927731092436975,
"grad_norm": 1.393293250138424,
"learning_rate": 1.9456194523554404e-08,
"loss": 0.4796,
"step": 580
},
{
"epoch": 2.9327731092436977,
"grad_norm": 1.3210976282362301,
"learning_rate": 1.69499250991767e-08,
"loss": 0.4465,
"step": 581
},
{
"epoch": 2.9378151260504204,
"grad_norm": 1.3544687735071852,
"learning_rate": 1.4616124748217387e-08,
"loss": 0.5223,
"step": 582
},
{
"epoch": 2.942857142857143,
"grad_norm": 1.467595755846224,
"learning_rate": 1.2454874246181081e-08,
"loss": 0.6671,
"step": 583
},
{
"epoch": 2.9478991596638657,
"grad_norm": 1.3671723526105932,
"learning_rate": 1.0466248396424072e-08,
"loss": 0.4499,
"step": 584
},
{
"epoch": 2.9529411764705884,
"grad_norm": 1.4167636187504142,
"learning_rate": 8.650316027566386e-09,
"loss": 0.4873,
"step": 585
},
{
"epoch": 2.957983193277311,
"grad_norm": 1.220474765102595,
"learning_rate": 7.007139991108136e-09,
"loss": 0.4043,
"step": 586
},
{
"epoch": 2.9630252100840337,
"grad_norm": 1.3733660106334655,
"learning_rate": 5.536777159254603e-09,
"loss": 0.4793,
"step": 587
},
{
"epoch": 2.9680672268907564,
"grad_norm": 1.3544611708705747,
"learning_rate": 4.239278422948911e-09,
"loss": 0.4953,
"step": 588
},
{
"epoch": 2.973109243697479,
"grad_norm": 1.4589364978859505,
"learning_rate": 3.1146886901090024e-09,
"loss": 0.4547,
"step": 589
},
{
"epoch": 2.9781512605042018,
"grad_norm": 1.3938123480231057,
"learning_rate": 2.1630468840738716e-09,
"loss": 0.4115,
"step": 590
},
{
"epoch": 2.9831932773109244,
"grad_norm": 1.3511479563562372,
"learning_rate": 1.3843859422574269e-09,
"loss": 0.4926,
"step": 591
},
{
"epoch": 2.988235294117647,
"grad_norm": 1.445464043641677,
"learning_rate": 7.787328150071771e-10,
"loss": 0.5346,
"step": 592
},
{
"epoch": 2.9932773109243698,
"grad_norm": 1.5785257738352532,
"learning_rate": 3.4610846467109106e-10,
"loss": 0.5032,
"step": 593
},
{
"epoch": 2.9983193277310924,
"grad_norm": 1.305339383484568,
"learning_rate": 8.652786487484133e-11,
"loss": 0.4666,
"step": 594
},
{
"epoch": 2.9983193277310924,
"step": 594,
"total_flos": 4.726427205490442e+17,
"train_loss": 0.7082312573688199,
"train_runtime": 63951.2458,
"train_samples_per_second": 0.447,
"train_steps_per_second": 0.009
}
],
"logging_steps": 1,
"max_steps": 594,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.726427205490442e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}