{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 921,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003257328990228013,
"grad_norm": 8.991729736328125,
"learning_rate": 5e-06,
"loss": 4.95,
"step": 1
},
{
"epoch": 0.006514657980456026,
"grad_norm": 8.309410095214844,
"learning_rate": 4.999985455791999e-06,
"loss": 4.8245,
"step": 2
},
{
"epoch": 0.009771986970684038,
"grad_norm": 8.790390968322754,
"learning_rate": 4.999941823337222e-06,
"loss": 5.0692,
"step": 3
},
{
"epoch": 0.013029315960912053,
"grad_norm": 6.615326881408691,
"learning_rate": 4.99986910314335e-06,
"loss": 4.7744,
"step": 4
},
{
"epoch": 0.016286644951140065,
"grad_norm": 6.837432384490967,
"learning_rate": 4.999767296056508e-06,
"loss": 4.6511,
"step": 5
},
{
"epoch": 0.019543973941368076,
"grad_norm": 8.298238754272461,
"learning_rate": 4.999636403261258e-06,
"loss": 4.7926,
"step": 6
},
{
"epoch": 0.02280130293159609,
"grad_norm": 6.846872806549072,
"learning_rate": 4.999476426280588e-06,
"loss": 4.6072,
"step": 7
},
{
"epoch": 0.026058631921824105,
"grad_norm": 6.849335193634033,
"learning_rate": 4.999287366975888e-06,
"loss": 4.5674,
"step": 8
},
{
"epoch": 0.029315960912052116,
"grad_norm": 5.802831172943115,
"learning_rate": 4.99906922754693e-06,
"loss": 4.3631,
"step": 9
},
{
"epoch": 0.03257328990228013,
"grad_norm": 8.730732917785645,
"learning_rate": 4.998822010531849e-06,
"loss": 4.8049,
"step": 10
},
{
"epoch": 0.035830618892508145,
"grad_norm": 5.898006916046143,
"learning_rate": 4.998545718807104e-06,
"loss": 4.8731,
"step": 11
},
{
"epoch": 0.03908794788273615,
"grad_norm": 8.241886138916016,
"learning_rate": 4.998240355587452e-06,
"loss": 4.4484,
"step": 12
},
{
"epoch": 0.04234527687296417,
"grad_norm": 5.800337314605713,
"learning_rate": 4.997905924425903e-06,
"loss": 4.5168,
"step": 13
},
{
"epoch": 0.04560260586319218,
"grad_norm": 4.856197357177734,
"learning_rate": 4.99754242921369e-06,
"loss": 4.6223,
"step": 14
},
{
"epoch": 0.048859934853420196,
"grad_norm": 6.2050628662109375,
"learning_rate": 4.997149874180209e-06,
"loss": 4.3281,
"step": 15
},
{
"epoch": 0.05211726384364821,
"grad_norm": 5.954516410827637,
"learning_rate": 4.996728263892985e-06,
"loss": 4.5456,
"step": 16
},
{
"epoch": 0.05537459283387622,
"grad_norm": 5.490358352661133,
"learning_rate": 4.9962776032576065e-06,
"loss": 4.599,
"step": 17
},
{
"epoch": 0.05863192182410423,
"grad_norm": 5.2625885009765625,
"learning_rate": 4.995797897517676e-06,
"loss": 4.3262,
"step": 18
},
{
"epoch": 0.06188925081433225,
"grad_norm": 5.2583394050598145,
"learning_rate": 4.995289152254744e-06,
"loss": 4.3131,
"step": 19
},
{
"epoch": 0.06514657980456026,
"grad_norm": 8.310872077941895,
"learning_rate": 4.994751373388249e-06,
"loss": 4.3256,
"step": 20
},
{
"epoch": 0.06840390879478828,
"grad_norm": 4.853321552276611,
"learning_rate": 4.994184567175446e-06,
"loss": 4.1977,
"step": 21
},
{
"epoch": 0.07166123778501629,
"grad_norm": 7.600931644439697,
"learning_rate": 4.9935887402113315e-06,
"loss": 4.0224,
"step": 22
},
{
"epoch": 0.0749185667752443,
"grad_norm": 5.884659767150879,
"learning_rate": 4.9929638994285715e-06,
"loss": 4.2029,
"step": 23
},
{
"epoch": 0.0781758957654723,
"grad_norm": 5.0048699378967285,
"learning_rate": 4.9923100520974165e-06,
"loss": 4.434,
"step": 24
},
{
"epoch": 0.08143322475570032,
"grad_norm": 4.726573467254639,
"learning_rate": 4.991627205825621e-06,
"loss": 4.0566,
"step": 25
},
{
"epoch": 0.08469055374592833,
"grad_norm": 6.583544731140137,
"learning_rate": 4.99091536855835e-06,
"loss": 4.1312,
"step": 26
},
{
"epoch": 0.08794788273615635,
"grad_norm": 6.678943157196045,
"learning_rate": 4.990174548578093e-06,
"loss": 4.1762,
"step": 27
},
{
"epoch": 0.09120521172638436,
"grad_norm": 6.158233642578125,
"learning_rate": 4.989404754504561e-06,
"loss": 4.6873,
"step": 28
},
{
"epoch": 0.09446254071661238,
"grad_norm": 6.6103291511535645,
"learning_rate": 4.9886059952945885e-06,
"loss": 4.0162,
"step": 29
},
{
"epoch": 0.09771986970684039,
"grad_norm": 8.516252517700195,
"learning_rate": 4.987778280242034e-06,
"loss": 4.0497,
"step": 30
},
{
"epoch": 0.10097719869706841,
"grad_norm": 6.198530673980713,
"learning_rate": 4.986921618977664e-06,
"loss": 4.2576,
"step": 31
},
{
"epoch": 0.10423452768729642,
"grad_norm": 6.267817974090576,
"learning_rate": 4.9860360214690465e-06,
"loss": 4.3573,
"step": 32
},
{
"epoch": 0.10749185667752444,
"grad_norm": 4.247263431549072,
"learning_rate": 4.985121498020433e-06,
"loss": 4.4098,
"step": 33
},
{
"epoch": 0.11074918566775244,
"grad_norm": 6.026848793029785,
"learning_rate": 4.9841780592726385e-06,
"loss": 4.3846,
"step": 34
},
{
"epoch": 0.11400651465798045,
"grad_norm": 4.245572566986084,
"learning_rate": 4.9832057162029194e-06,
"loss": 4.1725,
"step": 35
},
{
"epoch": 0.11726384364820847,
"grad_norm": 6.846841812133789,
"learning_rate": 4.982204480124844e-06,
"loss": 4.5872,
"step": 36
},
{
"epoch": 0.12052117263843648,
"grad_norm": 7.242786407470703,
"learning_rate": 4.981174362688159e-06,
"loss": 4.0026,
"step": 37
},
{
"epoch": 0.1237785016286645,
"grad_norm": 5.995569705963135,
"learning_rate": 4.980115375878659e-06,
"loss": 4.2044,
"step": 38
},
{
"epoch": 0.1270358306188925,
"grad_norm": 6.272200107574463,
"learning_rate": 4.979027532018045e-06,
"loss": 4.0437,
"step": 39
},
{
"epoch": 0.13029315960912052,
"grad_norm": 6.379438877105713,
"learning_rate": 4.977910843763777e-06,
"loss": 4.4689,
"step": 40
},
{
"epoch": 0.13355048859934854,
"grad_norm": 5.513742923736572,
"learning_rate": 4.976765324108932e-06,
"loss": 4.2111,
"step": 41
},
{
"epoch": 0.13680781758957655,
"grad_norm": 7.665502548217773,
"learning_rate": 4.975590986382053e-06,
"loss": 3.5765,
"step": 42
},
{
"epoch": 0.14006514657980457,
"grad_norm": 4.788407802581787,
"learning_rate": 4.974387844246987e-06,
"loss": 4.3693,
"step": 43
},
{
"epoch": 0.14332247557003258,
"grad_norm": 5.028674602508545,
"learning_rate": 4.973155911702736e-06,
"loss": 4.1687,
"step": 44
},
{
"epoch": 0.1465798045602606,
"grad_norm": 8.316819190979004,
"learning_rate": 4.971895203083285e-06,
"loss": 4.0222,
"step": 45
},
{
"epoch": 0.1498371335504886,
"grad_norm": 5.167773723602295,
"learning_rate": 4.970605733057441e-06,
"loss": 4.1067,
"step": 46
},
{
"epoch": 0.15309446254071662,
"grad_norm": 9.906169891357422,
"learning_rate": 4.969287516628661e-06,
"loss": 3.4177,
"step": 47
},
{
"epoch": 0.1563517915309446,
"grad_norm": 5.868465423583984,
"learning_rate": 4.967940569134875e-06,
"loss": 4.2688,
"step": 48
},
{
"epoch": 0.15960912052117263,
"grad_norm": 4.171844005584717,
"learning_rate": 4.9665649062483115e-06,
"loss": 4.2632,
"step": 49
},
{
"epoch": 0.16286644951140064,
"grad_norm": 7.338898181915283,
"learning_rate": 4.965160543975312e-06,
"loss": 4.2469,
"step": 50
},
{
"epoch": 0.16612377850162866,
"grad_norm": 4.580145359039307,
"learning_rate": 4.963727498656146e-06,
"loss": 4.3323,
"step": 51
},
{
"epoch": 0.16938110749185667,
"grad_norm": 7.303488254547119,
"learning_rate": 4.962265786964821e-06,
"loss": 3.9707,
"step": 52
},
{
"epoch": 0.17263843648208468,
"grad_norm": 6.1601080894470215,
"learning_rate": 4.960775425908887e-06,
"loss": 4.0127,
"step": 53
},
{
"epoch": 0.1758957654723127,
"grad_norm": 4.484189510345459,
"learning_rate": 4.959256432829242e-06,
"loss": 4.2545,
"step": 54
},
{
"epoch": 0.1791530944625407,
"grad_norm": 5.867091178894043,
"learning_rate": 4.957708825399928e-06,
"loss": 4.178,
"step": 55
},
{
"epoch": 0.18241042345276873,
"grad_norm": 6.303260326385498,
"learning_rate": 4.956132621627922e-06,
"loss": 4.0996,
"step": 56
},
{
"epoch": 0.18566775244299674,
"grad_norm": 4.000598907470703,
"learning_rate": 4.954527839852935e-06,
"loss": 4.1641,
"step": 57
},
{
"epoch": 0.18892508143322476,
"grad_norm": 6.654462814331055,
"learning_rate": 4.952894498747189e-06,
"loss": 4.0859,
"step": 58
},
{
"epoch": 0.19218241042345277,
"grad_norm": 6.11088228225708,
"learning_rate": 4.951232617315207e-06,
"loss": 4.5088,
"step": 59
},
{
"epoch": 0.19543973941368079,
"grad_norm": 4.228384494781494,
"learning_rate": 4.949542214893589e-06,
"loss": 4.2673,
"step": 60
},
{
"epoch": 0.1986970684039088,
"grad_norm": 7.4116950035095215,
"learning_rate": 4.947823311150785e-06,
"loss": 3.9379,
"step": 61
},
{
"epoch": 0.20195439739413681,
"grad_norm": 6.180227279663086,
"learning_rate": 4.946075926086872e-06,
"loss": 4.0369,
"step": 62
},
{
"epoch": 0.20521172638436483,
"grad_norm": 4.931899547576904,
"learning_rate": 4.9443000800333135e-06,
"loss": 4.2796,
"step": 63
},
{
"epoch": 0.20846905537459284,
"grad_norm": 4.462183475494385,
"learning_rate": 4.9424957936527295e-06,
"loss": 4.2172,
"step": 64
},
{
"epoch": 0.21172638436482086,
"grad_norm": 8.078901290893555,
"learning_rate": 4.940663087938654e-06,
"loss": 3.7498,
"step": 65
},
{
"epoch": 0.21498371335504887,
"grad_norm": 5.682638645172119,
"learning_rate": 4.938801984215289e-06,
"loss": 3.915,
"step": 66
},
{
"epoch": 0.2182410423452769,
"grad_norm": 5.182820796966553,
"learning_rate": 4.936912504137257e-06,
"loss": 4.0408,
"step": 67
},
{
"epoch": 0.22149837133550487,
"grad_norm": 5.436476230621338,
"learning_rate": 4.934994669689353e-06,
"loss": 4.3085,
"step": 68
},
{
"epoch": 0.2247557003257329,
"grad_norm": 5.0985212326049805,
"learning_rate": 4.933048503186282e-06,
"loss": 4.348,
"step": 69
},
{
"epoch": 0.2280130293159609,
"grad_norm": 6.351657390594482,
"learning_rate": 4.931074027272406e-06,
"loss": 4.2169,
"step": 70
},
{
"epoch": 0.23127035830618892,
"grad_norm": 5.03814697265625,
"learning_rate": 4.9290712649214735e-06,
"loss": 4.0666,
"step": 71
},
{
"epoch": 0.23452768729641693,
"grad_norm": 4.342626094818115,
"learning_rate": 4.9270402394363604e-06,
"loss": 4.1099,
"step": 72
},
{
"epoch": 0.23778501628664495,
"grad_norm": 4.489912033081055,
"learning_rate": 4.924980974448791e-06,
"loss": 3.9711,
"step": 73
},
{
"epoch": 0.24104234527687296,
"grad_norm": 6.246860980987549,
"learning_rate": 4.922893493919068e-06,
"loss": 3.7721,
"step": 74
},
{
"epoch": 0.24429967426710097,
"grad_norm": 6.77864933013916,
"learning_rate": 4.920777822135793e-06,
"loss": 4.072,
"step": 75
},
{
"epoch": 0.247557003257329,
"grad_norm": 5.387760162353516,
"learning_rate": 4.918633983715582e-06,
"loss": 4.0807,
"step": 76
},
{
"epoch": 0.250814332247557,
"grad_norm": 5.91160249710083,
"learning_rate": 4.91646200360278e-06,
"loss": 4.1652,
"step": 77
},
{
"epoch": 0.254071661237785,
"grad_norm": 6.564038276672363,
"learning_rate": 4.914261907069172e-06,
"loss": 4.2095,
"step": 78
},
{
"epoch": 0.25732899022801303,
"grad_norm": 5.434807777404785,
"learning_rate": 4.912033719713687e-06,
"loss": 3.8395,
"step": 79
},
{
"epoch": 0.26058631921824105,
"grad_norm": 4.123232841491699,
"learning_rate": 4.909777467462103e-06,
"loss": 4.0243,
"step": 80
},
{
"epoch": 0.26384364820846906,
"grad_norm": 4.283027172088623,
"learning_rate": 4.9074931765667386e-06,
"loss": 4.1818,
"step": 81
},
{
"epoch": 0.2671009771986971,
"grad_norm": 7.180938243865967,
"learning_rate": 4.905180873606157e-06,
"loss": 4.3134,
"step": 82
},
{
"epoch": 0.2703583061889251,
"grad_norm": 7.892354488372803,
"learning_rate": 4.90284058548485e-06,
"loss": 3.7998,
"step": 83
},
{
"epoch": 0.2736156351791531,
"grad_norm": 5.744028568267822,
"learning_rate": 4.900472339432928e-06,
"loss": 4.0191,
"step": 84
},
{
"epoch": 0.2768729641693811,
"grad_norm": 7.475683212280273,
"learning_rate": 4.898076163005802e-06,
"loss": 4.2697,
"step": 85
},
{
"epoch": 0.28013029315960913,
"grad_norm": 7.0953240394592285,
"learning_rate": 4.89565208408386e-06,
"loss": 4.143,
"step": 86
},
{
"epoch": 0.28338762214983715,
"grad_norm": 5.530087947845459,
"learning_rate": 4.893200130872152e-06,
"loss": 4.055,
"step": 87
},
{
"epoch": 0.28664495114006516,
"grad_norm": 7.326042175292969,
"learning_rate": 4.890720331900049e-06,
"loss": 4.0776,
"step": 88
},
{
"epoch": 0.2899022801302932,
"grad_norm": 6.444993495941162,
"learning_rate": 4.8882127160209234e-06,
"loss": 4.3407,
"step": 89
},
{
"epoch": 0.2931596091205212,
"grad_norm": 5.226675033569336,
"learning_rate": 4.885677312411802e-06,
"loss": 3.9822,
"step": 90
},
{
"epoch": 0.2964169381107492,
"grad_norm": 4.015732765197754,
"learning_rate": 4.883114150573037e-06,
"loss": 4.2537,
"step": 91
},
{
"epoch": 0.2996742671009772,
"grad_norm": 7.535611152648926,
"learning_rate": 4.880523260327954e-06,
"loss": 4.0133,
"step": 92
},
{
"epoch": 0.30293159609120524,
"grad_norm": 6.678111553192139,
"learning_rate": 4.8779046718225105e-06,
"loss": 3.8224,
"step": 93
},
{
"epoch": 0.30618892508143325,
"grad_norm": 5.473388671875,
"learning_rate": 4.875258415524945e-06,
"loss": 4.2571,
"step": 94
},
{
"epoch": 0.30944625407166126,
"grad_norm": 5.16727876663208,
"learning_rate": 4.872584522225417e-06,
"loss": 4.1357,
"step": 95
},
{
"epoch": 0.3127035830618892,
"grad_norm": 5.79735803604126,
"learning_rate": 4.8698830230356555e-06,
"loss": 4.2043,
"step": 96
},
{
"epoch": 0.31596091205211724,
"grad_norm": 6.028461933135986,
"learning_rate": 4.867153949388594e-06,
"loss": 3.9955,
"step": 97
},
{
"epoch": 0.31921824104234525,
"grad_norm": 4.4654340744018555,
"learning_rate": 4.864397333038002e-06,
"loss": 4.0395,
"step": 98
},
{
"epoch": 0.32247557003257327,
"grad_norm": 4.826022624969482,
"learning_rate": 4.861613206058123e-06,
"loss": 3.8778,
"step": 99
},
{
"epoch": 0.3257328990228013,
"grad_norm": 5.090777397155762,
"learning_rate": 4.858801600843295e-06,
"loss": 4.0367,
"step": 100
},
{
"epoch": 0.3289902280130293,
"grad_norm": 5.630136966705322,
"learning_rate": 4.855962550107574e-06,
"loss": 4.2245,
"step": 101
},
{
"epoch": 0.3322475570032573,
"grad_norm": 4.38795804977417,
"learning_rate": 4.853096086884354e-06,
"loss": 4.2784,
"step": 102
},
{
"epoch": 0.3355048859934853,
"grad_norm": 7.042239665985107,
"learning_rate": 4.850202244525987e-06,
"loss": 3.6864,
"step": 103
},
{
"epoch": 0.33876221498371334,
"grad_norm": 5.578479766845703,
"learning_rate": 4.847281056703388e-06,
"loss": 3.9581,
"step": 104
},
{
"epoch": 0.34201954397394135,
"grad_norm": 6.32352352142334,
"learning_rate": 4.844332557405649e-06,
"loss": 4.3841,
"step": 105
},
{
"epoch": 0.34527687296416937,
"grad_norm": 6.242619514465332,
"learning_rate": 4.841356780939638e-06,
"loss": 4.0657,
"step": 106
},
{
"epoch": 0.3485342019543974,
"grad_norm": 5.670553684234619,
"learning_rate": 4.838353761929605e-06,
"loss": 4.1274,
"step": 107
},
{
"epoch": 0.3517915309446254,
"grad_norm": 5.2623748779296875,
"learning_rate": 4.835323535316777e-06,
"loss": 4.4132,
"step": 108
},
{
"epoch": 0.3550488599348534,
"grad_norm": 7.746681213378906,
"learning_rate": 4.832266136358951e-06,
"loss": 3.8874,
"step": 109
},
{
"epoch": 0.3583061889250814,
"grad_norm": 8.414247512817383,
"learning_rate": 4.829181600630084e-06,
"loss": 4.0962,
"step": 110
},
{
"epoch": 0.36156351791530944,
"grad_norm": 6.013370990753174,
"learning_rate": 4.826069964019878e-06,
"loss": 3.9995,
"step": 111
},
{
"epoch": 0.36482084690553745,
"grad_norm": 5.62821626663208,
"learning_rate": 4.822931262733367e-06,
"loss": 3.8944,
"step": 112
},
{
"epoch": 0.36807817589576547,
"grad_norm": 5.668822288513184,
"learning_rate": 4.819765533290489e-06,
"loss": 3.9762,
"step": 113
},
{
"epoch": 0.3713355048859935,
"grad_norm": 6.864443778991699,
"learning_rate": 4.816572812525668e-06,
"loss": 3.9365,
"step": 114
},
{
"epoch": 0.3745928338762215,
"grad_norm": 6.25184965133667,
"learning_rate": 4.813353137587377e-06,
"loss": 4.0144,
"step": 115
},
{
"epoch": 0.3778501628664495,
"grad_norm": 5.299489974975586,
"learning_rate": 4.810106545937716e-06,
"loss": 4.2232,
"step": 116
},
{
"epoch": 0.3811074918566775,
"grad_norm": 5.129780292510986,
"learning_rate": 4.806833075351968e-06,
"loss": 3.9329,
"step": 117
},
{
"epoch": 0.38436482084690554,
"grad_norm": 4.818124771118164,
"learning_rate": 4.803532763918162e-06,
"loss": 4.2085,
"step": 118
},
{
"epoch": 0.38762214983713356,
"grad_norm": 6.111238956451416,
"learning_rate": 4.80020565003663e-06,
"loss": 4.2062,
"step": 119
},
{
"epoch": 0.39087947882736157,
"grad_norm": 6.151260852813721,
"learning_rate": 4.796851772419563e-06,
"loss": 3.7084,
"step": 120
},
{
"epoch": 0.3941368078175896,
"grad_norm": 6.352945327758789,
"learning_rate": 4.793471170090555e-06,
"loss": 4.0596,
"step": 121
},
{
"epoch": 0.3973941368078176,
"grad_norm": 6.536398887634277,
"learning_rate": 4.7900638823841525e-06,
"loss": 4.1707,
"step": 122
},
{
"epoch": 0.4006514657980456,
"grad_norm": 7.450645923614502,
"learning_rate": 4.786629948945397e-06,
"loss": 3.7428,
"step": 123
},
{
"epoch": 0.40390879478827363,
"grad_norm": 7.927166938781738,
"learning_rate": 4.783169409729363e-06,
"loss": 3.6772,
"step": 124
},
{
"epoch": 0.40716612377850164,
"grad_norm": 4.440308094024658,
"learning_rate": 4.779682305000689e-06,
"loss": 3.8738,
"step": 125
},
{
"epoch": 0.41042345276872966,
"grad_norm": 6.223262310028076,
"learning_rate": 4.7761686753331195e-06,
"loss": 4.3548,
"step": 126
},
{
"epoch": 0.41368078175895767,
"grad_norm": 4.511125087738037,
"learning_rate": 4.772628561609021e-06,
"loss": 4.1852,
"step": 127
},
{
"epoch": 0.4169381107491857,
"grad_norm": 4.9265828132629395,
"learning_rate": 4.769062005018916e-06,
"loss": 4.0206,
"step": 128
},
{
"epoch": 0.4201954397394137,
"grad_norm": 5.572083950042725,
"learning_rate": 4.765469047060996e-06,
"loss": 3.8828,
"step": 129
},
{
"epoch": 0.4234527687296417,
"grad_norm": 5.799539089202881,
"learning_rate": 4.761849729540643e-06,
"loss": 3.7871,
"step": 130
},
{
"epoch": 0.42671009771986973,
"grad_norm": 5.041553020477295,
"learning_rate": 4.758204094569942e-06,
"loss": 4.2724,
"step": 131
},
{
"epoch": 0.42996742671009774,
"grad_norm": 4.930086135864258,
"learning_rate": 4.754532184567193e-06,
"loss": 4.2007,
"step": 132
},
{
"epoch": 0.43322475570032576,
"grad_norm": 4.0916056632995605,
"learning_rate": 4.750834042256414e-06,
"loss": 4.1914,
"step": 133
},
{
"epoch": 0.4364820846905538,
"grad_norm": 6.131953716278076,
"learning_rate": 4.747109710666844e-06,
"loss": 3.9056,
"step": 134
},
{
"epoch": 0.43973941368078173,
"grad_norm": 4.703607559204102,
"learning_rate": 4.743359233132449e-06,
"loss": 4.0255,
"step": 135
},
{
"epoch": 0.44299674267100975,
"grad_norm": 8.509166717529297,
"learning_rate": 4.7395826532914056e-06,
"loss": 4.4678,
"step": 136
},
{
"epoch": 0.44625407166123776,
"grad_norm": 4.48606538772583,
"learning_rate": 4.735780015085607e-06,
"loss": 3.6985,
"step": 137
},
{
"epoch": 0.4495114006514658,
"grad_norm": 4.333646297454834,
"learning_rate": 4.73195136276014e-06,
"loss": 4.2048,
"step": 138
},
{
"epoch": 0.4527687296416938,
"grad_norm": 7.2084197998046875,
"learning_rate": 4.728096740862778e-06,
"loss": 3.7695,
"step": 139
},
{
"epoch": 0.4560260586319218,
"grad_norm": 4.8456926345825195,
"learning_rate": 4.72421619424346e-06,
"loss": 4.1851,
"step": 140
},
{
"epoch": 0.4592833876221498,
"grad_norm": 4.744941234588623,
"learning_rate": 4.720309768053766e-06,
"loss": 4.2188,
"step": 141
},
{
"epoch": 0.46254071661237783,
"grad_norm": 6.611759662628174,
"learning_rate": 4.716377507746397e-06,
"loss": 3.9045,
"step": 142
},
{
"epoch": 0.46579804560260585,
"grad_norm": 7.642755031585693,
"learning_rate": 4.712419459074643e-06,
"loss": 3.4203,
"step": 143
},
{
"epoch": 0.46905537459283386,
"grad_norm": 4.797499656677246,
"learning_rate": 4.708435668091849e-06,
"loss": 3.9821,
"step": 144
},
{
"epoch": 0.4723127035830619,
"grad_norm": 5.430541515350342,
"learning_rate": 4.704426181150884e-06,
"loss": 3.7922,
"step": 145
},
{
"epoch": 0.4755700325732899,
"grad_norm": 5.740099906921387,
"learning_rate": 4.700391044903597e-06,
"loss": 4.0892,
"step": 146
},
{
"epoch": 0.4788273615635179,
"grad_norm": 4.150174617767334,
"learning_rate": 4.696330306300277e-06,
"loss": 4.0174,
"step": 147
},
{
"epoch": 0.4820846905537459,
"grad_norm": 8.157079696655273,
"learning_rate": 4.692244012589107e-06,
"loss": 3.9708,
"step": 148
},
{
"epoch": 0.48534201954397393,
"grad_norm": 5.4859466552734375,
"learning_rate": 4.688132211315608e-06,
"loss": 3.8235,
"step": 149
},
{
"epoch": 0.48859934853420195,
"grad_norm": 5.100396633148193,
"learning_rate": 4.683994950322098e-06,
"loss": 4.0108,
"step": 150
},
{
"epoch": 0.49185667752442996,
"grad_norm": 8.440217971801758,
"learning_rate": 4.679832277747122e-06,
"loss": 3.5701,
"step": 151
},
{
"epoch": 0.495114006514658,
"grad_norm": 6.211641788482666,
"learning_rate": 4.675644242024902e-06,
"loss": 3.7399,
"step": 152
},
{
"epoch": 0.498371335504886,
"grad_norm": 8.332008361816406,
"learning_rate": 4.671430891884768e-06,
"loss": 4.1424,
"step": 153
},
{
"epoch": 0.501628664495114,
"grad_norm": 6.8990607261657715,
"learning_rate": 4.6671922763505915e-06,
"loss": 3.9826,
"step": 154
},
{
"epoch": 0.504885993485342,
"grad_norm": 4.40015983581543,
"learning_rate": 4.662928444740219e-06,
"loss": 3.9535,
"step": 155
},
{
"epoch": 0.50814332247557,
"grad_norm": 6.208529949188232,
"learning_rate": 4.658639446664893e-06,
"loss": 3.6694,
"step": 156
},
{
"epoch": 0.511400651465798,
"grad_norm": 5.551063537597656,
"learning_rate": 4.654325332028676e-06,
"loss": 4.1019,
"step": 157
},
{
"epoch": 0.5146579804560261,
"grad_norm": 4.968599319458008,
"learning_rate": 4.649986151027875e-06,
"loss": 3.8717,
"step": 158
},
{
"epoch": 0.5179153094462541,
"grad_norm": 6.236921310424805,
"learning_rate": 4.645621954150449e-06,
"loss": 3.9015,
"step": 159
},
{
"epoch": 0.5211726384364821,
"grad_norm": 8.418625831604004,
"learning_rate": 4.641232792175428e-06,
"loss": 3.7186,
"step": 160
},
{
"epoch": 0.5244299674267101,
"grad_norm": 5.640598297119141,
"learning_rate": 4.636818716172322e-06,
"loss": 3.8935,
"step": 161
},
{
"epoch": 0.5276872964169381,
"grad_norm": 8.81381607055664,
"learning_rate": 4.632379777500519e-06,
"loss": 3.5697,
"step": 162
},
{
"epoch": 0.5309446254071661,
"grad_norm": 5.180728912353516,
"learning_rate": 4.6279160278087e-06,
"loss": 4.1232,
"step": 163
},
{
"epoch": 0.5342019543973942,
"grad_norm": 6.380191326141357,
"learning_rate": 4.623427519034224e-06,
"loss": 4.0181,
"step": 164
},
{
"epoch": 0.5374592833876222,
"grad_norm": 5.962952136993408,
"learning_rate": 4.618914303402541e-06,
"loss": 4.1417,
"step": 165
},
{
"epoch": 0.5407166123778502,
"grad_norm": 4.534554481506348,
"learning_rate": 4.614376433426565e-06,
"loss": 4.1021,
"step": 166
},
{
"epoch": 0.5439739413680782,
"grad_norm": 6.459926605224609,
"learning_rate": 4.609813961906076e-06,
"loss": 3.6299,
"step": 167
},
{
"epoch": 0.5472312703583062,
"grad_norm": 6.262177467346191,
"learning_rate": 4.605226941927102e-06,
"loss": 3.7101,
"step": 168
},
{
"epoch": 0.5504885993485342,
"grad_norm": 5.087495803833008,
"learning_rate": 4.6006154268613015e-06,
"loss": 3.8704,
"step": 169
},
{
"epoch": 0.5537459283387622,
"grad_norm": 5.9942193031311035,
"learning_rate": 4.595979470365341e-06,
"loss": 4.2554,
"step": 170
},
{
"epoch": 0.5570032573289903,
"grad_norm": 4.429585933685303,
"learning_rate": 4.591319126380275e-06,
"loss": 4.131,
"step": 171
},
{
"epoch": 0.5602605863192183,
"grad_norm": 4.608005523681641,
"learning_rate": 4.586634449130911e-06,
"loss": 3.7907,
"step": 172
},
{
"epoch": 0.5635179153094463,
"grad_norm": 6.586268424987793,
"learning_rate": 4.581925493125187e-06,
"loss": 4.0965,
"step": 173
},
{
"epoch": 0.5667752442996743,
"grad_norm": 6.221005916595459,
"learning_rate": 4.577192313153531e-06,
"loss": 3.733,
"step": 174
},
{
"epoch": 0.5700325732899023,
"grad_norm": 4.957738399505615,
"learning_rate": 4.572434964288226e-06,
"loss": 4.2729,
"step": 175
},
{
"epoch": 0.5732899022801303,
"grad_norm": 7.497585773468018,
"learning_rate": 4.5676535018827685e-06,
"loss": 3.7724,
"step": 176
},
{
"epoch": 0.5765472312703583,
"grad_norm": 5.472403526306152,
"learning_rate": 4.562847981571226e-06,
"loss": 4.2249,
"step": 177
},
{
"epoch": 0.5798045602605864,
"grad_norm": 4.5801568031311035,
"learning_rate": 4.55801845926759e-06,
"loss": 4.0099,
"step": 178
},
{
"epoch": 0.5830618892508144,
"grad_norm": 4.295859336853027,
"learning_rate": 4.553164991165119e-06,
"loss": 4.0598,
"step": 179
},
{
"epoch": 0.5863192182410424,
"grad_norm": 5.069422721862793,
"learning_rate": 4.548287633735694e-06,
"loss": 4.1214,
"step": 180
},
{
"epoch": 0.5895765472312704,
"grad_norm": 4.593508243560791,
"learning_rate": 4.543386443729157e-06,
"loss": 3.8567,
"step": 181
},
{
"epoch": 0.5928338762214984,
"grad_norm": 5.974512100219727,
"learning_rate": 4.538461478172647e-06,
"loss": 4.1337,
"step": 182
},
{
"epoch": 0.5960912052117264,
"grad_norm": 4.851910591125488,
"learning_rate": 4.5335127943699445e-06,
"loss": 4.0842,
"step": 183
},
{
"epoch": 0.5993485342019544,
"grad_norm": 7.21715784072876,
"learning_rate": 4.528540449900799e-06,
"loss": 3.7263,
"step": 184
},
{
"epoch": 0.6026058631921825,
"grad_norm": 5.761256217956543,
"learning_rate": 4.523544502620258e-06,
"loss": 3.7832,
"step": 185
},
{
"epoch": 0.6058631921824105,
"grad_norm": 6.854928493499756,
"learning_rate": 4.518525010658001e-06,
"loss": 3.858,
"step": 186
},
{
"epoch": 0.6091205211726385,
"grad_norm": 6.20596981048584,
"learning_rate": 4.513482032417656e-06,
"loss": 3.7106,
"step": 187
},
{
"epoch": 0.6123778501628665,
"grad_norm": 5.5723161697387695,
"learning_rate": 4.508415626576122e-06,
"loss": 3.9256,
"step": 188
},
{
"epoch": 0.6156351791530945,
"grad_norm": 5.427545547485352,
"learning_rate": 4.503325852082886e-06,
"loss": 4.0918,
"step": 189
},
{
"epoch": 0.6188925081433225,
"grad_norm": 4.53942346572876,
"learning_rate": 4.4982127681593414e-06,
"loss": 3.8621,
"step": 190
},
{
"epoch": 0.6221498371335505,
"grad_norm": 6.003444194793701,
"learning_rate": 4.493076434298091e-06,
"loss": 3.7557,
"step": 191
},
{
"epoch": 0.6254071661237784,
"grad_norm": 6.072404384613037,
"learning_rate": 4.487916910262263e-06,
"loss": 3.7174,
"step": 192
},
{
"epoch": 0.6286644951140065,
"grad_norm": 5.752810955047607,
"learning_rate": 4.48273425608481e-06,
"loss": 4.1727,
"step": 193
},
{
"epoch": 0.6319218241042345,
"grad_norm": 5.078545570373535,
"learning_rate": 4.4775285320678106e-06,
"loss": 4.2206,
"step": 194
},
{
"epoch": 0.6351791530944625,
"grad_norm": 5.559292316436768,
"learning_rate": 4.4722997987817714e-06,
"loss": 4.1013,
"step": 195
},
{
"epoch": 0.6384364820846905,
"grad_norm": 4.91386604309082,
"learning_rate": 4.467048117064921e-06,
"loss": 3.8137,
"step": 196
},
{
"epoch": 0.6416938110749185,
"grad_norm": 5.978386402130127,
"learning_rate": 4.461773548022502e-06,
"loss": 3.9084,
"step": 197
},
{
"epoch": 0.6449511400651465,
"grad_norm": 5.226948261260986,
"learning_rate": 4.4564761530260545e-06,
"loss": 4.1762,
"step": 198
},
{
"epoch": 0.6482084690553745,
"grad_norm": 5.278410911560059,
"learning_rate": 4.451155993712711e-06,
"loss": 4.245,
"step": 199
},
{
"epoch": 0.6514657980456026,
"grad_norm": 5.923932075500488,
"learning_rate": 4.445813131984476e-06,
"loss": 3.6665,
"step": 200
},
{
"epoch": 0.6547231270358306,
"grad_norm": 5.126694202423096,
"learning_rate": 4.440447630007503e-06,
"loss": 3.776,
"step": 201
},
{
"epoch": 0.6579804560260586,
"grad_norm": 5.655832767486572,
"learning_rate": 4.435059550211371e-06,
"loss": 4.1377,
"step": 202
},
{
"epoch": 0.6612377850162866,
"grad_norm": 5.701308250427246,
"learning_rate": 4.429648955288366e-06,
"loss": 3.9133,
"step": 203
},
{
"epoch": 0.6644951140065146,
"grad_norm": 5.6329545974731445,
"learning_rate": 4.42421590819274e-06,
"loss": 4.049,
"step": 204
},
{
"epoch": 0.6677524429967426,
"grad_norm": 5.668838024139404,
"learning_rate": 4.418760472139988e-06,
"loss": 3.7711,
"step": 205
},
{
"epoch": 0.6710097719869706,
"grad_norm": 4.098300933837891,
"learning_rate": 4.413282710606107e-06,
"loss": 4.0585,
"step": 206
},
{
"epoch": 0.6742671009771987,
"grad_norm": 5.279233455657959,
"learning_rate": 4.407782687326859e-06,
"loss": 3.6991,
"step": 207
},
{
"epoch": 0.6775244299674267,
"grad_norm": 6.012117385864258,
"learning_rate": 4.4022604662970305e-06,
"loss": 4.0131,
"step": 208
},
{
"epoch": 0.6807817589576547,
"grad_norm": 6.257996559143066,
"learning_rate": 4.3967161117696864e-06,
"loss": 3.9206,
"step": 209
},
{
"epoch": 0.6840390879478827,
"grad_norm": 4.88628625869751,
"learning_rate": 4.391149688255423e-06,
"loss": 3.8542,
"step": 210
},
{
"epoch": 0.6872964169381107,
"grad_norm": 4.90071964263916,
"learning_rate": 4.385561260521618e-06,
"loss": 3.87,
"step": 211
},
{
"epoch": 0.6905537459283387,
"grad_norm": 5.792081832885742,
"learning_rate": 4.379950893591675e-06,
"loss": 4.042,
"step": 212
},
{
"epoch": 0.6938110749185668,
"grad_norm": 5.57878303527832,
"learning_rate": 4.3743186527442685e-06,
"loss": 4.2162,
"step": 213
},
{
"epoch": 0.6970684039087948,
"grad_norm": 7.439386367797852,
"learning_rate": 4.368664603512586e-06,
"loss": 3.6792,
"step": 214
},
{
"epoch": 0.7003257328990228,
"grad_norm": 4.927967071533203,
"learning_rate": 4.36298881168356e-06,
"loss": 3.9544,
"step": 215
},
{
"epoch": 0.7035830618892508,
"grad_norm": 4.070362567901611,
"learning_rate": 4.35729134329711e-06,
"loss": 4.1422,
"step": 216
},
{
"epoch": 0.7068403908794788,
"grad_norm": 3.362583875656128,
"learning_rate": 4.351572264645366e-06,
"loss": 3.9582,
"step": 217
},
{
"epoch": 0.7100977198697068,
"grad_norm": 6.386814117431641,
"learning_rate": 4.345831642271906e-06,
"loss": 4.3383,
"step": 218
},
{
"epoch": 0.7133550488599348,
"grad_norm": 4.9985880851745605,
"learning_rate": 4.3400695429709725e-06,
"loss": 4.0183,
"step": 219
},
{
"epoch": 0.7166123778501629,
"grad_norm": 4.032768249511719,
"learning_rate": 4.3342860337867045e-06,
"loss": 3.978,
"step": 220
},
{
"epoch": 0.7198697068403909,
"grad_norm": 8.807585716247559,
"learning_rate": 4.328481182012349e-06,
"loss": 3.6196,
"step": 221
},
{
"epoch": 0.7231270358306189,
"grad_norm": 6.042029857635498,
"learning_rate": 4.3226550551894815e-06,
"loss": 3.9239,
"step": 222
},
{
"epoch": 0.7263843648208469,
"grad_norm": 5.543368816375732,
"learning_rate": 4.316807721107226e-06,
"loss": 3.7864,
"step": 223
},
{
"epoch": 0.7296416938110749,
"grad_norm": 5.127748966217041,
"learning_rate": 4.310939247801455e-06,
"loss": 4.0228,
"step": 224
},
{
"epoch": 0.7328990228013029,
"grad_norm": 4.826234817504883,
"learning_rate": 4.305049703554005e-06,
"loss": 3.9928,
"step": 225
},
{
"epoch": 0.7361563517915309,
"grad_norm": 7.3278632164001465,
"learning_rate": 4.299139156891883e-06,
"loss": 3.9358,
"step": 226
},
{
"epoch": 0.739413680781759,
"grad_norm": 4.310771942138672,
"learning_rate": 4.293207676586464e-06,
"loss": 3.9118,
"step": 227
},
{
"epoch": 0.742671009771987,
"grad_norm": 6.319010257720947,
"learning_rate": 4.287255331652694e-06,
"loss": 3.8571,
"step": 228
},
{
"epoch": 0.745928338762215,
"grad_norm": 6.4100422859191895,
"learning_rate": 4.281282191348289e-06,
"loss": 4.2424,
"step": 229
},
{
"epoch": 0.749185667752443,
"grad_norm": 4.665522575378418,
"learning_rate": 4.275288325172924e-06,
"loss": 4.0637,
"step": 230
},
{
"epoch": 0.752442996742671,
"grad_norm": 6.5943684577941895,
"learning_rate": 4.269273802867427e-06,
"loss": 3.4363,
"step": 231
},
{
"epoch": 0.755700325732899,
"grad_norm": 6.329523086547852,
"learning_rate": 4.26323869441297e-06,
"loss": 4.1044,
"step": 232
},
{
"epoch": 0.758957654723127,
"grad_norm": 6.649689674377441,
"learning_rate": 4.257183070030252e-06,
"loss": 3.7455,
"step": 233
},
{
"epoch": 0.762214983713355,
"grad_norm": 5.669809341430664,
"learning_rate": 4.2511070001786806e-06,
"loss": 3.9673,
"step": 234
},
{
"epoch": 0.7654723127035831,
"grad_norm": 5.982848644256592,
"learning_rate": 4.245010555555554e-06,
"loss": 3.8431,
"step": 235
},
{
"epoch": 0.7687296416938111,
"grad_norm": 4.247102737426758,
"learning_rate": 4.23889380709524e-06,
"loss": 4.1477,
"step": 236
},
{
"epoch": 0.7719869706840391,
"grad_norm": 6.063522815704346,
"learning_rate": 4.232756825968348e-06,
"loss": 4.0559,
"step": 237
},
{
"epoch": 0.7752442996742671,
"grad_norm": 7.113022804260254,
"learning_rate": 4.226599683580902e-06,
"loss": 3.4791,
"step": 238
},
{
"epoch": 0.7785016286644951,
"grad_norm": 7.49078369140625,
"learning_rate": 4.22042245157351e-06,
"loss": 4.0812,
"step": 239
},
{
"epoch": 0.7817589576547231,
"grad_norm": 6.068420886993408,
"learning_rate": 4.214225201820529e-06,
"loss": 3.6831,
"step": 240
},
{
"epoch": 0.7850162866449512,
"grad_norm": 4.464322566986084,
"learning_rate": 4.20800800642923e-06,
"loss": 3.9978,
"step": 241
},
{
"epoch": 0.7882736156351792,
"grad_norm": 4.343051910400391,
"learning_rate": 4.201770937738962e-06,
"loss": 4.0457,
"step": 242
},
{
"epoch": 0.7915309446254072,
"grad_norm": 5.923551082611084,
"learning_rate": 4.195514068320302e-06,
"loss": 3.6406,
"step": 243
},
{
"epoch": 0.7947882736156352,
"grad_norm": 5.177840232849121,
"learning_rate": 4.1892374709742186e-06,
"loss": 3.6954,
"step": 244
},
{
"epoch": 0.7980456026058632,
"grad_norm": 5.686809062957764,
"learning_rate": 4.182941218731222e-06,
"loss": 3.8836,
"step": 245
},
{
"epoch": 0.8013029315960912,
"grad_norm": 4.131414413452148,
"learning_rate": 4.176625384850516e-06,
"loss": 3.6393,
"step": 246
},
{
"epoch": 0.8045602605863192,
"grad_norm": 4.142212390899658,
"learning_rate": 4.170290042819137e-06,
"loss": 3.8273,
"step": 247
},
{
"epoch": 0.8078175895765473,
"grad_norm": 5.210618495941162,
"learning_rate": 4.163935266351115e-06,
"loss": 3.7623,
"step": 248
},
{
"epoch": 0.8110749185667753,
"grad_norm": 4.716037273406982,
"learning_rate": 4.1575611293866025e-06,
"loss": 4.2419,
"step": 249
},
{
"epoch": 0.8143322475570033,
"grad_norm": 7.191868782043457,
"learning_rate": 4.151167706091017e-06,
"loss": 4.1355,
"step": 250
},
{
"epoch": 0.8175895765472313,
"grad_norm": 5.328645706176758,
"learning_rate": 4.1447550708541815e-06,
"loss": 3.986,
"step": 251
},
{
"epoch": 0.8208469055374593,
"grad_norm": 4.756157875061035,
"learning_rate": 4.138323298289456e-06,
"loss": 4.0062,
"step": 252
},
{
"epoch": 0.8241042345276873,
"grad_norm": 5.154695987701416,
"learning_rate": 4.131872463232872e-06,
"loss": 3.9789,
"step": 253
},
{
"epoch": 0.8273615635179153,
"grad_norm": 5.032461643218994,
"learning_rate": 4.125402640742259e-06,
"loss": 3.7178,
"step": 254
},
{
"epoch": 0.8306188925081434,
"grad_norm": 4.375637054443359,
"learning_rate": 4.11891390609637e-06,
"loss": 3.9338,
"step": 255
},
{
"epoch": 0.8338762214983714,
"grad_norm": 7.046738624572754,
"learning_rate": 4.112406334794014e-06,
"loss": 3.9795,
"step": 256
},
{
"epoch": 0.8371335504885994,
"grad_norm": 5.982004642486572,
"learning_rate": 4.105880002553164e-06,
"loss": 4.0637,
"step": 257
},
{
"epoch": 0.8403908794788274,
"grad_norm": 5.773483753204346,
"learning_rate": 4.099334985310089e-06,
"loss": 3.6393,
"step": 258
},
{
"epoch": 0.8436482084690554,
"grad_norm": 4.926510810852051,
"learning_rate": 4.092771359218462e-06,
"loss": 4.0211,
"step": 259
},
{
"epoch": 0.8469055374592834,
"grad_norm": 6.6061530113220215,
"learning_rate": 4.086189200648476e-06,
"loss": 3.9733,
"step": 260
},
{
"epoch": 0.8501628664495114,
"grad_norm": 5.56645393371582,
"learning_rate": 4.079588586185961e-06,
"loss": 4.061,
"step": 261
},
{
"epoch": 0.8534201954397395,
"grad_norm": 5.1084675788879395,
"learning_rate": 4.072969592631481e-06,
"loss": 3.8312,
"step": 262
},
{
"epoch": 0.8566775244299675,
"grad_norm": 6.386423110961914,
"learning_rate": 4.066332296999455e-06,
"loss": 3.6092,
"step": 263
},
{
"epoch": 0.8599348534201955,
"grad_norm": 5.3035688400268555,
"learning_rate": 4.0596767765172465e-06,
"loss": 3.8357,
"step": 264
},
{
"epoch": 0.8631921824104235,
"grad_norm": 4.10700798034668,
"learning_rate": 4.053003108624276e-06,
"loss": 3.8495,
"step": 265
},
{
"epoch": 0.8664495114006515,
"grad_norm": 7.300857067108154,
"learning_rate": 4.046311370971114e-06,
"loss": 3.754,
"step": 266
},
{
"epoch": 0.8697068403908795,
"grad_norm": 5.575901508331299,
"learning_rate": 4.039601641418582e-06,
"loss": 3.9931,
"step": 267
},
{
"epoch": 0.8729641693811075,
"grad_norm": 4.204562664031982,
"learning_rate": 4.032873998036841e-06,
"loss": 3.9574,
"step": 268
},
{
"epoch": 0.8762214983713354,
"grad_norm": 6.569215297698975,
"learning_rate": 4.026128519104484e-06,
"loss": 3.7635,
"step": 269
},
{
"epoch": 0.8794788273615635,
"grad_norm": 6.4127020835876465,
"learning_rate": 4.019365283107634e-06,
"loss": 3.8189,
"step": 270
},
{
"epoch": 0.8827361563517915,
"grad_norm": 4.822223663330078,
"learning_rate": 4.012584368739017e-06,
"loss": 3.7812,
"step": 271
},
{
"epoch": 0.8859934853420195,
"grad_norm": 5.920554161071777,
"learning_rate": 4.005785854897057e-06,
"loss": 3.9864,
"step": 272
},
{
"epoch": 0.8892508143322475,
"grad_norm": 5.076132774353027,
"learning_rate": 3.998969820684954e-06,
"loss": 3.8068,
"step": 273
},
{
"epoch": 0.8925081433224755,
"grad_norm": 5.978939533233643,
"learning_rate": 3.992136345409765e-06,
"loss": 3.6497,
"step": 274
},
{
"epoch": 0.8957654723127035,
"grad_norm": 5.440021514892578,
"learning_rate": 3.985285508581475e-06,
"loss": 3.9533,
"step": 275
},
{
"epoch": 0.8990228013029316,
"grad_norm": 4.745472431182861,
"learning_rate": 3.9784173899120836e-06,
"loss": 4.1676,
"step": 276
},
{
"epoch": 0.9022801302931596,
"grad_norm": 5.8460259437561035,
"learning_rate": 3.971532069314666e-06,
"loss": 3.698,
"step": 277
},
{
"epoch": 0.9055374592833876,
"grad_norm": 7.4045820236206055,
"learning_rate": 3.964629626902452e-06,
"loss": 3.573,
"step": 278
},
{
"epoch": 0.9087947882736156,
"grad_norm": 6.916861534118652,
"learning_rate": 3.957710142987886e-06,
"loss": 3.5988,
"step": 279
},
{
"epoch": 0.9120521172638436,
"grad_norm": 6.336071014404297,
"learning_rate": 3.9507736980817e-06,
"loss": 3.4439,
"step": 280
},
{
"epoch": 0.9153094462540716,
"grad_norm": 6.711271286010742,
"learning_rate": 3.943820372891972e-06,
"loss": 3.9141,
"step": 281
},
{
"epoch": 0.9185667752442996,
"grad_norm": 4.0247802734375,
"learning_rate": 3.936850248323189e-06,
"loss": 4.0297,
"step": 282
},
{
"epoch": 0.9218241042345277,
"grad_norm": 7.413387775421143,
"learning_rate": 3.929863405475303e-06,
"loss": 3.6856,
"step": 283
},
{
"epoch": 0.9250814332247557,
"grad_norm": 6.697014808654785,
"learning_rate": 3.92285992564279e-06,
"loss": 3.6538,
"step": 284
},
{
"epoch": 0.9283387622149837,
"grad_norm": 6.154468059539795,
"learning_rate": 3.915839890313706e-06,
"loss": 3.2771,
"step": 285
},
{
"epoch": 0.9315960912052117,
"grad_norm": 7.574310779571533,
"learning_rate": 3.908803381168732e-06,
"loss": 4.1216,
"step": 286
},
{
"epoch": 0.9348534201954397,
"grad_norm": 3.7167022228240967,
"learning_rate": 3.901750480080232e-06,
"loss": 3.8945,
"step": 287
},
{
"epoch": 0.9381107491856677,
"grad_norm": 5.904863357543945,
"learning_rate": 3.894681269111292e-06,
"loss": 3.9315,
"step": 288
},
{
"epoch": 0.9413680781758957,
"grad_norm": 6.429924011230469,
"learning_rate": 3.887595830514775e-06,
"loss": 4.3139,
"step": 289
},
{
"epoch": 0.9446254071661238,
"grad_norm": 6.790943622589111,
"learning_rate": 3.880494246732352e-06,
"loss": 3.5595,
"step": 290
},
{
"epoch": 0.9478827361563518,
"grad_norm": 4.0954742431640625,
"learning_rate": 3.873376600393555e-06,
"loss": 4.0051,
"step": 291
},
{
"epoch": 0.9511400651465798,
"grad_norm": 6.410258769989014,
"learning_rate": 3.866242974314805e-06,
"loss": 3.5565,
"step": 292
},
{
"epoch": 0.9543973941368078,
"grad_norm": 6.833970546722412,
"learning_rate": 3.859093451498456e-06,
"loss": 4.0412,
"step": 293
},
{
"epoch": 0.9576547231270358,
"grad_norm": 8.363605499267578,
"learning_rate": 3.851928115131826e-06,
"loss": 4.552,
"step": 294
},
{
"epoch": 0.9609120521172638,
"grad_norm": 6.14967679977417,
"learning_rate": 3.844747048586228e-06,
"loss": 3.828,
"step": 295
},
{
"epoch": 0.9641693811074918,
"grad_norm": 7.708178520202637,
"learning_rate": 3.8375503354160036e-06,
"loss": 3.4753,
"step": 296
},
{
"epoch": 0.9674267100977199,
"grad_norm": 6.226506233215332,
"learning_rate": 3.830338059357546e-06,
"loss": 4.2654,
"step": 297
},
{
"epoch": 0.9706840390879479,
"grad_norm": 5.9939775466918945,
"learning_rate": 3.823110304328331e-06,
"loss": 3.7368,
"step": 298
},
{
"epoch": 0.9739413680781759,
"grad_norm": 5.407865047454834,
"learning_rate": 3.815867154425936e-06,
"loss": 3.8956,
"step": 299
},
{
"epoch": 0.9771986970684039,
"grad_norm": 5.42250919342041,
"learning_rate": 3.808608693927065e-06,
"loss": 3.9053,
"step": 300
},
{
"epoch": 0.9804560260586319,
"grad_norm": 6.157458782196045,
"learning_rate": 3.801335007286564e-06,
"loss": 3.7298,
"step": 301
},
{
"epoch": 0.9837133550488599,
"grad_norm": 4.539289951324463,
"learning_rate": 3.7940461791364425e-06,
"loss": 3.8612,
"step": 302
},
{
"epoch": 0.9869706840390879,
"grad_norm": 4.508944511413574,
"learning_rate": 3.7867422942848877e-06,
"loss": 3.9865,
"step": 303
},
{
"epoch": 0.990228013029316,
"grad_norm": 6.667001247406006,
"learning_rate": 3.779423437715274e-06,
"loss": 4.2309,
"step": 304
},
{
"epoch": 0.993485342019544,
"grad_norm": 4.5301289558410645,
"learning_rate": 3.772089694585181e-06,
"loss": 3.9932,
"step": 305
},
{
"epoch": 0.996742671009772,
"grad_norm": 6.858119010925293,
"learning_rate": 3.764741150225396e-06,
"loss": 3.7958,
"step": 306
},
{
"epoch": 1.0,
"grad_norm": 8.143805503845215,
"learning_rate": 3.757377890138927e-06,
"loss": 4.2006,
"step": 307
},
{
"epoch": 1.003257328990228,
"grad_norm": 5.382366180419922,
"learning_rate": 3.7500000000000005e-06,
"loss": 3.4269,
"step": 308
},
{
"epoch": 1.006514657980456,
"grad_norm": 4.062572002410889,
"learning_rate": 3.742607565653073e-06,
"loss": 3.8892,
"step": 309
},
{
"epoch": 1.009771986970684,
"grad_norm": 5.761571884155273,
"learning_rate": 3.7352006731118266e-06,
"loss": 3.8548,
"step": 310
},
{
"epoch": 1.013029315960912,
"grad_norm": 5.544031620025635,
"learning_rate": 3.7277794085581697e-06,
"loss": 3.5327,
"step": 311
},
{
"epoch": 1.01628664495114,
"grad_norm": 5.719707489013672,
"learning_rate": 3.7203438583412343e-06,
"loss": 3.6865,
"step": 312
},
{
"epoch": 1.019543973941368,
"grad_norm": 3.956254720687866,
"learning_rate": 3.712894108976372e-06,
"loss": 3.7916,
"step": 313
},
{
"epoch": 1.022801302931596,
"grad_norm": 5.764235019683838,
"learning_rate": 3.7054302471441462e-06,
"loss": 3.7351,
"step": 314
},
{
"epoch": 1.0260586319218241,
"grad_norm": 6.1902570724487305,
"learning_rate": 3.697952359689324e-06,
"loss": 3.6078,
"step": 315
},
{
"epoch": 1.0293159609120521,
"grad_norm": 6.540966987609863,
"learning_rate": 3.690460533619866e-06,
"loss": 3.4576,
"step": 316
},
{
"epoch": 1.0325732899022801,
"grad_norm": 4.03215217590332,
"learning_rate": 3.6829548561059133e-06,
"loss": 3.7704,
"step": 317
},
{
"epoch": 1.0358306188925082,
"grad_norm": 4.043310642242432,
"learning_rate": 3.6754354144787734e-06,
"loss": 3.6919,
"step": 318
},
{
"epoch": 1.0390879478827362,
"grad_norm": 6.889205455780029,
"learning_rate": 3.6679022962299054e-06,
"loss": 3.456,
"step": 319
},
{
"epoch": 1.0423452768729642,
"grad_norm": 5.511551856994629,
"learning_rate": 3.660355589009901e-06,
"loss": 3.6732,
"step": 320
},
{
"epoch": 1.0456026058631922,
"grad_norm": 5.021662712097168,
"learning_rate": 3.652795380627462e-06,
"loss": 3.8559,
"step": 321
},
{
"epoch": 1.0488599348534202,
"grad_norm": 7.119377613067627,
"learning_rate": 3.6452217590483847e-06,
"loss": 3.4773,
"step": 322
},
{
"epoch": 1.0521172638436482,
"grad_norm": 6.943653583526611,
"learning_rate": 3.63763481239453e-06,
"loss": 3.3563,
"step": 323
},
{
"epoch": 1.0553745928338762,
"grad_norm": 6.504844665527344,
"learning_rate": 3.6300346289428025e-06,
"loss": 3.1729,
"step": 324
},
{
"epoch": 1.0586319218241043,
"grad_norm": 7.149539947509766,
"learning_rate": 3.622421297124122e-06,
"loss": 3.1502,
"step": 325
},
{
"epoch": 1.0618892508143323,
"grad_norm": 6.777254104614258,
"learning_rate": 3.6147949055223925e-06,
"loss": 3.6956,
"step": 326
},
{
"epoch": 1.0651465798045603,
"grad_norm": 7.371313571929932,
"learning_rate": 3.607155542873475e-06,
"loss": 2.7985,
"step": 327
},
{
"epoch": 1.0684039087947883,
"grad_norm": 5.818310737609863,
"learning_rate": 3.5995032980641538e-06,
"loss": 3.3967,
"step": 328
},
{
"epoch": 1.0716612377850163,
"grad_norm": 5.086653709411621,
"learning_rate": 3.5918382601311003e-06,
"loss": 3.7434,
"step": 329
},
{
"epoch": 1.0749185667752443,
"grad_norm": 5.7530083656311035,
"learning_rate": 3.5841605182598393e-06,
"loss": 3.4631,
"step": 330
},
{
"epoch": 1.0781758957654723,
"grad_norm": 4.993615627288818,
"learning_rate": 3.5764701617837118e-06,
"loss": 3.4732,
"step": 331
},
{
"epoch": 1.0814332247557004,
"grad_norm": 4.600963592529297,
"learning_rate": 3.568767280182831e-06,
"loss": 3.7875,
"step": 332
},
{
"epoch": 1.0846905537459284,
"grad_norm": 7.0127081871032715,
"learning_rate": 3.561051963083048e-06,
"loss": 3.7191,
"step": 333
},
{
"epoch": 1.0879478827361564,
"grad_norm": 5.862270832061768,
"learning_rate": 3.5533243002549044e-06,
"loss": 3.4695,
"step": 334
},
{
"epoch": 1.0912052117263844,
"grad_norm": 4.856973648071289,
"learning_rate": 3.5455843816125878e-06,
"loss": 3.8399,
"step": 335
},
{
"epoch": 1.0944625407166124,
"grad_norm": 6.645925998687744,
"learning_rate": 3.5378322972128886e-06,
"loss": 3.502,
"step": 336
},
{
"epoch": 1.0977198697068404,
"grad_norm": 4.62603759765625,
"learning_rate": 3.530068137254148e-06,
"loss": 3.8772,
"step": 337
},
{
"epoch": 1.1009771986970684,
"grad_norm": 4.81012487411499,
"learning_rate": 3.5222919920752126e-06,
"loss": 3.5262,
"step": 338
},
{
"epoch": 1.1042345276872965,
"grad_norm": 8.265521049499512,
"learning_rate": 3.5145039521543806e-06,
"loss": 3.8737,
"step": 339
},
{
"epoch": 1.1074918566775245,
"grad_norm": 3.3320538997650146,
"learning_rate": 3.50670410810835e-06,
"loss": 3.7063,
"step": 340
},
{
"epoch": 1.1107491856677525,
"grad_norm": 6.44738245010376,
"learning_rate": 3.498892550691164e-06,
"loss": 3.8521,
"step": 341
},
{
"epoch": 1.1140065146579805,
"grad_norm": 5.850455284118652,
"learning_rate": 3.491069370793155e-06,
"loss": 3.2702,
"step": 342
},
{
"epoch": 1.1172638436482085,
"grad_norm": 4.26442289352417,
"learning_rate": 3.4832346594398888e-06,
"loss": 3.6619,
"step": 343
},
{
"epoch": 1.1205211726384365,
"grad_norm": 5.107183456420898,
"learning_rate": 3.475388507791101e-06,
"loss": 3.335,
"step": 344
},
{
"epoch": 1.1237785016286646,
"grad_norm": 4.226919174194336,
"learning_rate": 3.4675310071396425e-06,
"loss": 3.7333,
"step": 345
},
{
"epoch": 1.1270358306188926,
"grad_norm": 6.164919853210449,
"learning_rate": 3.4596622489104113e-06,
"loss": 3.8968,
"step": 346
},
{
"epoch": 1.1302931596091206,
"grad_norm": 5.433590888977051,
"learning_rate": 3.451782324659293e-06,
"loss": 3.4041,
"step": 347
},
{
"epoch": 1.1335504885993486,
"grad_norm": 4.933192253112793,
"learning_rate": 3.443891326072093e-06,
"loss": 3.5943,
"step": 348
},
{
"epoch": 1.1368078175895766,
"grad_norm": 5.232339859008789,
"learning_rate": 3.4359893449634713e-06,
"loss": 3.4548,
"step": 349
},
{
"epoch": 1.1400651465798046,
"grad_norm": 6.424132823944092,
"learning_rate": 3.428076473275873e-06,
"loss": 3.3356,
"step": 350
},
{
"epoch": 1.1433224755700326,
"grad_norm": 5.729311943054199,
"learning_rate": 3.42015280307846e-06,
"loss": 3.2489,
"step": 351
},
{
"epoch": 1.1465798045602607,
"grad_norm": 4.083253860473633,
"learning_rate": 3.4122184265660398e-06,
"loss": 3.3285,
"step": 352
},
{
"epoch": 1.1498371335504887,
"grad_norm": 5.305028915405273,
"learning_rate": 3.404273436057988e-06,
"loss": 3.7341,
"step": 353
},
{
"epoch": 1.1530944625407167,
"grad_norm": 6.984644889831543,
"learning_rate": 3.39631792399718e-06,
"loss": 3.054,
"step": 354
},
{
"epoch": 1.1563517915309447,
"grad_norm": 5.173111438751221,
"learning_rate": 3.3883519829489155e-06,
"loss": 3.7036,
"step": 355
},
{
"epoch": 1.1596091205211727,
"grad_norm": 5.271275043487549,
"learning_rate": 3.3803757055998354e-06,
"loss": 3.6782,
"step": 356
},
{
"epoch": 1.1628664495114007,
"grad_norm": 4.47402811050415,
"learning_rate": 3.37238918475685e-06,
"loss": 3.4326,
"step": 357
},
{
"epoch": 1.1661237785016287,
"grad_norm": 5.682492733001709,
"learning_rate": 3.3643925133460563e-06,
"loss": 3.4877,
"step": 358
},
{
"epoch": 1.1693811074918568,
"grad_norm": 4.897508144378662,
"learning_rate": 3.356385784411656e-06,
"loss": 3.4859,
"step": 359
},
{
"epoch": 1.1726384364820848,
"grad_norm": 7.068917274475098,
"learning_rate": 3.348369091114873e-06,
"loss": 3.7516,
"step": 360
},
{
"epoch": 1.1758957654723128,
"grad_norm": 5.376421928405762,
"learning_rate": 3.3403425267328715e-06,
"loss": 3.6323,
"step": 361
},
{
"epoch": 1.1791530944625408,
"grad_norm": 4.889153480529785,
"learning_rate": 3.3323061846576692e-06,
"loss": 3.5759,
"step": 362
},
{
"epoch": 1.1824104234527688,
"grad_norm": 4.02139139175415,
"learning_rate": 3.324260158395051e-06,
"loss": 3.8266,
"step": 363
},
{
"epoch": 1.1856677524429968,
"grad_norm": 5.562239170074463,
"learning_rate": 3.3162045415634793e-06,
"loss": 3.7196,
"step": 364
},
{
"epoch": 1.1889250814332248,
"grad_norm": 6.0636725425720215,
"learning_rate": 3.308139427893008e-06,
"loss": 3.4255,
"step": 365
},
{
"epoch": 1.1921824104234529,
"grad_norm": 4.625339031219482,
"learning_rate": 3.3000649112241904e-06,
"loss": 3.7757,
"step": 366
},
{
"epoch": 1.1954397394136809,
"grad_norm": 5.8719305992126465,
"learning_rate": 3.291981085506987e-06,
"loss": 3.7622,
"step": 367
},
{
"epoch": 1.1986970684039089,
"grad_norm": 7.395297527313232,
"learning_rate": 3.2838880447996697e-06,
"loss": 3.1235,
"step": 368
},
{
"epoch": 1.201954397394137,
"grad_norm": 4.938228607177734,
"learning_rate": 3.2757858832677346e-06,
"loss": 3.4661,
"step": 369
},
{
"epoch": 1.205211726384365,
"grad_norm": 4.547780990600586,
"learning_rate": 3.2676746951827985e-06,
"loss": 3.4966,
"step": 370
},
{
"epoch": 1.208469055374593,
"grad_norm": 5.542564868927002,
"learning_rate": 3.2595545749215073e-06,
"loss": 3.6376,
"step": 371
},
{
"epoch": 1.211726384364821,
"grad_norm": 5.063738822937012,
"learning_rate": 3.251425616964436e-06,
"loss": 3.7234,
"step": 372
},
{
"epoch": 1.214983713355049,
"grad_norm": 7.921319961547852,
"learning_rate": 3.243287915894987e-06,
"loss": 3.2747,
"step": 373
},
{
"epoch": 1.218241042345277,
"grad_norm": 4.5900044441223145,
"learning_rate": 3.2351415663982956e-06,
"loss": 3.8113,
"step": 374
},
{
"epoch": 1.221498371335505,
"grad_norm": 5.936985969543457,
"learning_rate": 3.2269866632601227e-06,
"loss": 3.4585,
"step": 375
},
{
"epoch": 1.224755700325733,
"grad_norm": 5.615612983703613,
"learning_rate": 3.2188233013657545e-06,
"loss": 3.4545,
"step": 376
},
{
"epoch": 1.228013029315961,
"grad_norm": 4.169675827026367,
"learning_rate": 3.2106515756988964e-06,
"loss": 3.7711,
"step": 377
},
{
"epoch": 1.231270358306189,
"grad_norm": 6.375253677368164,
"learning_rate": 3.202471581340572e-06,
"loss": 3.6459,
"step": 378
},
{
"epoch": 1.234527687296417,
"grad_norm": 5.786118030548096,
"learning_rate": 3.1942834134680123e-06,
"loss": 3.4114,
"step": 379
},
{
"epoch": 1.237785016286645,
"grad_norm": 5.953446865081787,
"learning_rate": 3.186087167353551e-06,
"loss": 3.278,
"step": 380
},
{
"epoch": 1.241042345276873,
"grad_norm": 7.0892462730407715,
"learning_rate": 3.177882938363514e-06,
"loss": 3.4727,
"step": 381
},
{
"epoch": 1.244299674267101,
"grad_norm": 7.629514217376709,
"learning_rate": 3.1696708219571128e-06,
"loss": 2.858,
"step": 382
},
{
"epoch": 1.247557003257329,
"grad_norm": 8.90638256072998,
"learning_rate": 3.1614509136853295e-06,
"loss": 3.2422,
"step": 383
},
{
"epoch": 1.2508143322475571,
"grad_norm": 5.899227142333984,
"learning_rate": 3.1532233091898094e-06,
"loss": 3.5167,
"step": 384
},
{
"epoch": 1.2540716612377851,
"grad_norm": 6.004745960235596,
"learning_rate": 3.144988104201745e-06,
"loss": 3.4772,
"step": 385
},
{
"epoch": 1.2573289902280131,
"grad_norm": 5.448160648345947,
"learning_rate": 3.1367453945407646e-06,
"loss": 3.615,
"step": 386
},
{
"epoch": 1.2605863192182412,
"grad_norm": 7.477326393127441,
"learning_rate": 3.1284952761138137e-06,
"loss": 3.2672,
"step": 387
},
{
"epoch": 1.2638436482084692,
"grad_norm": 6.218169212341309,
"learning_rate": 3.1202378449140437e-06,
"loss": 3.4072,
"step": 388
},
{
"epoch": 1.2671009771986972,
"grad_norm": 6.035054683685303,
"learning_rate": 3.111973197019693e-06,
"loss": 3.4844,
"step": 389
},
{
"epoch": 1.2703583061889252,
"grad_norm": 5.626272201538086,
"learning_rate": 3.1037014285929672e-06,
"loss": 3.7772,
"step": 390
},
{
"epoch": 1.2736156351791532,
"grad_norm": 3.958400011062622,
"learning_rate": 3.095422635878923e-06,
"loss": 3.8474,
"step": 391
},
{
"epoch": 1.2768729641693812,
"grad_norm": 6.569092750549316,
"learning_rate": 3.087136915204347e-06,
"loss": 3.4613,
"step": 392
},
{
"epoch": 1.2801302931596092,
"grad_norm": 4.738685607910156,
"learning_rate": 3.0788443629766348e-06,
"loss": 3.6467,
"step": 393
},
{
"epoch": 1.2833876221498373,
"grad_norm": 4.849681377410889,
"learning_rate": 3.0705450756826707e-06,
"loss": 3.7038,
"step": 394
},
{
"epoch": 1.2866449511400653,
"grad_norm": 7.685214519500732,
"learning_rate": 3.0622391498877012e-06,
"loss": 3.344,
"step": 395
},
{
"epoch": 1.2899022801302933,
"grad_norm": 5.982230186462402,
"learning_rate": 3.053926682234219e-06,
"loss": 3.3405,
"step": 396
},
{
"epoch": 1.2931596091205213,
"grad_norm": 4.924343585968018,
"learning_rate": 3.0456077694408292e-06,
"loss": 3.6514,
"step": 397
},
{
"epoch": 1.2964169381107493,
"grad_norm": 5.400077819824219,
"learning_rate": 3.0372825083011314e-06,
"loss": 3.3848,
"step": 398
},
{
"epoch": 1.2996742671009773,
"grad_norm": 6.141596794128418,
"learning_rate": 3.0289509956825878e-06,
"loss": 3.3221,
"step": 399
},
{
"epoch": 1.3029315960912053,
"grad_norm": 4.01882266998291,
"learning_rate": 3.020613328525402e-06,
"loss": 3.5865,
"step": 400
},
{
"epoch": 1.3061889250814334,
"grad_norm": 4.70369815826416,
"learning_rate": 3.0122696038413857e-06,
"loss": 3.721,
"step": 401
},
{
"epoch": 1.3094462540716614,
"grad_norm": 4.236996173858643,
"learning_rate": 3.0039199187128322e-06,
"loss": 3.6175,
"step": 402
},
{
"epoch": 1.3127035830618892,
"grad_norm": 5.709525108337402,
"learning_rate": 2.995564370291387e-06,
"loss": 3.3072,
"step": 403
},
{
"epoch": 1.3159609120521172,
"grad_norm": 7.52371072769165,
"learning_rate": 2.987203055796919e-06,
"loss": 3.5663,
"step": 404
},
{
"epoch": 1.3192182410423452,
"grad_norm": 5.285858631134033,
"learning_rate": 2.978836072516385e-06,
"loss": 3.6411,
"step": 405
},
{
"epoch": 1.3224755700325732,
"grad_norm": 4.379350662231445,
"learning_rate": 2.9704635178027012e-06,
"loss": 3.483,
"step": 406
},
{
"epoch": 1.3257328990228012,
"grad_norm": 4.9760918617248535,
"learning_rate": 2.9620854890736095e-06,
"loss": 3.8238,
"step": 407
},
{
"epoch": 1.3289902280130292,
"grad_norm": 4.094661235809326,
"learning_rate": 2.9537020838105434e-06,
"loss": 3.5433,
"step": 408
},
{
"epoch": 1.3322475570032573,
"grad_norm": 4.46438455581665,
"learning_rate": 2.9453133995574955e-06,
"loss": 3.7084,
"step": 409
},
{
"epoch": 1.3355048859934853,
"grad_norm": 5.638455867767334,
"learning_rate": 2.93691953391988e-06,
"loss": 3.6346,
"step": 410
},
{
"epoch": 1.3387622149837133,
"grad_norm": 6.500235557556152,
"learning_rate": 2.9285205845634007e-06,
"loss": 3.2883,
"step": 411
},
{
"epoch": 1.3420195439739413,
"grad_norm": 6.090638160705566,
"learning_rate": 2.920116649212909e-06,
"loss": 3.374,
"step": 412
},
{
"epoch": 1.3452768729641693,
"grad_norm": 4.844772815704346,
"learning_rate": 2.9117078256512725e-06,
"loss": 3.7777,
"step": 413
},
{
"epoch": 1.3485342019543973,
"grad_norm": 5.7558274269104,
"learning_rate": 2.9032942117182345e-06,
"loss": 3.383,
"step": 414
},
{
"epoch": 1.3517915309446253,
"grad_norm": 6.560629367828369,
"learning_rate": 2.8948759053092756e-06,
"loss": 3.6765,
"step": 415
},
{
"epoch": 1.3550488599348534,
"grad_norm": 4.3585100173950195,
"learning_rate": 2.8864530043744754e-06,
"loss": 3.4845,
"step": 416
},
{
"epoch": 1.3583061889250814,
"grad_norm": 5.6310248374938965,
"learning_rate": 2.8780256069173724e-06,
"loss": 3.4286,
"step": 417
},
{
"epoch": 1.3615635179153094,
"grad_norm": 4.276134014129639,
"learning_rate": 2.8695938109938244e-06,
"loss": 3.6676,
"step": 418
},
{
"epoch": 1.3648208469055374,
"grad_norm": 7.116851806640625,
"learning_rate": 2.8611577147108656e-06,
"loss": 3.2261,
"step": 419
},
{
"epoch": 1.3680781758957654,
"grad_norm": 7.104155540466309,
"learning_rate": 2.8527174162255677e-06,
"loss": 3.2597,
"step": 420
},
{
"epoch": 1.3713355048859934,
"grad_norm": 6.247655868530273,
"learning_rate": 2.8442730137438964e-06,
"loss": 3.175,
"step": 421
},
{
"epoch": 1.3745928338762214,
"grad_norm": 6.24178409576416,
"learning_rate": 2.8358246055195677e-06,
"loss": 3.4389,
"step": 422
},
{
"epoch": 1.3778501628664495,
"grad_norm": 7.848822593688965,
"learning_rate": 2.8273722898529075e-06,
"loss": 4.1385,
"step": 423
},
{
"epoch": 1.3811074918566775,
"grad_norm": 7.670816898345947,
"learning_rate": 2.8189161650897045e-06,
"loss": 3.0239,
"step": 424
},
{
"epoch": 1.3843648208469055,
"grad_norm": 7.171327590942383,
"learning_rate": 2.8104563296200704e-06,
"loss": 3.286,
"step": 425
},
{
"epoch": 1.3876221498371335,
"grad_norm": 6.696552276611328,
"learning_rate": 2.8019928818772897e-06,
"loss": 3.1105,
"step": 426
},
{
"epoch": 1.3908794788273615,
"grad_norm": 4.991602897644043,
"learning_rate": 2.793525920336678e-06,
"loss": 3.4433,
"step": 427
},
{
"epoch": 1.3941368078175895,
"grad_norm": 5.104501724243164,
"learning_rate": 2.785055543514434e-06,
"loss": 3.3963,
"step": 428
},
{
"epoch": 1.3973941368078175,
"grad_norm": 7.394959449768066,
"learning_rate": 2.776581849966497e-06,
"loss": 3.1235,
"step": 429
},
{
"epoch": 1.4006514657980456,
"grad_norm": 5.726476669311523,
"learning_rate": 2.7681049382873963e-06,
"loss": 3.5493,
"step": 430
},
{
"epoch": 1.4039087947882736,
"grad_norm": 5.474902153015137,
"learning_rate": 2.7596249071091042e-06,
"loss": 3.4242,
"step": 431
},
{
"epoch": 1.4071661237785016,
"grad_norm": 4.774505615234375,
"learning_rate": 2.7511418550998907e-06,
"loss": 3.387,
"step": 432
},
{
"epoch": 1.4104234527687296,
"grad_norm": 7.272900581359863,
"learning_rate": 2.7426558809631748e-06,
"loss": 3.715,
"step": 433
},
{
"epoch": 1.4136807817589576,
"grad_norm": 5.06602144241333,
"learning_rate": 2.734167083436375e-06,
"loss": 3.3789,
"step": 434
},
{
"epoch": 1.4169381107491856,
"grad_norm": 6.0230278968811035,
"learning_rate": 2.72567556128976e-06,
"loss": 3.8185,
"step": 435
},
{
"epoch": 1.4201954397394136,
"grad_norm": 7.184913635253906,
"learning_rate": 2.7171814133253015e-06,
"loss": 3.34,
"step": 436
},
{
"epoch": 1.4234527687296417,
"grad_norm": 5.576634883880615,
"learning_rate": 2.708684738375524e-06,
"loss": 3.4975,
"step": 437
},
{
"epoch": 1.4267100977198697,
"grad_norm": 5.067847728729248,
"learning_rate": 2.7001856353023527e-06,
"loss": 3.8647,
"step": 438
},
{
"epoch": 1.4299674267100977,
"grad_norm": 5.121155738830566,
"learning_rate": 2.691684202995966e-06,
"loss": 3.2674,
"step": 439
},
{
"epoch": 1.4332247557003257,
"grad_norm": 4.543545246124268,
"learning_rate": 2.683180540373645e-06,
"loss": 3.7039,
"step": 440
},
{
"epoch": 1.4364820846905537,
"grad_norm": 6.284096717834473,
"learning_rate": 2.6746747463786187e-06,
"loss": 3.1269,
"step": 441
},
{
"epoch": 1.4397394136807817,
"grad_norm": 3.755789041519165,
"learning_rate": 2.6661669199789176e-06,
"loss": 3.6417,
"step": 442
},
{
"epoch": 1.4429967426710097,
"grad_norm": 4.775397300720215,
"learning_rate": 2.657657160166219e-06,
"loss": 3.3866,
"step": 443
},
{
"epoch": 1.4462540716612378,
"grad_norm": 8.88444995880127,
"learning_rate": 2.6491455659546957e-06,
"loss": 3.0159,
"step": 444
},
{
"epoch": 1.4495114006514658,
"grad_norm": 4.328442573547363,
"learning_rate": 2.6406322363798657e-06,
"loss": 3.5742,
"step": 445
},
{
"epoch": 1.4527687296416938,
"grad_norm": 8.353851318359375,
"learning_rate": 2.6321172704974374e-06,
"loss": 3.475,
"step": 446
},
{
"epoch": 1.4560260586319218,
"grad_norm": 4.968841552734375,
"learning_rate": 2.6236007673821585e-06,
"loss": 3.4786,
"step": 447
},
{
"epoch": 1.4592833876221498,
"grad_norm": 6.597938537597656,
"learning_rate": 2.6150828261266644e-06,
"loss": 3.4609,
"step": 448
},
{
"epoch": 1.4625407166123778,
"grad_norm": 6.208395957946777,
"learning_rate": 2.6065635458403214e-06,
"loss": 3.2623,
"step": 449
},
{
"epoch": 1.4657980456026058,
"grad_norm": 4.916355133056641,
"learning_rate": 2.598043025648078e-06,
"loss": 3.5028,
"step": 450
},
{
"epoch": 1.4690553745928339,
"grad_norm": 4.385219097137451,
"learning_rate": 2.589521364689308e-06,
"loss": 3.5348,
"step": 451
},
{
"epoch": 1.4723127035830619,
"grad_norm": 4.333066940307617,
"learning_rate": 2.5809986621166593e-06,
"loss": 3.6123,
"step": 452
},
{
"epoch": 1.47557003257329,
"grad_norm": 7.059334754943848,
"learning_rate": 2.572475017094899e-06,
"loss": 3.6275,
"step": 453
},
{
"epoch": 1.478827361563518,
"grad_norm": 4.183060646057129,
"learning_rate": 2.5639505287997584e-06,
"loss": 3.6582,
"step": 454
},
{
"epoch": 1.482084690553746,
"grad_norm": 4.17788028717041,
"learning_rate": 2.555425296416785e-06,
"loss": 3.6591,
"step": 455
},
{
"epoch": 1.485342019543974,
"grad_norm": 5.921770095825195,
"learning_rate": 2.5468994191401795e-06,
"loss": 3.6156,
"step": 456
},
{
"epoch": 1.488599348534202,
"grad_norm": 7.887184143066406,
"learning_rate": 2.5383729961716487e-06,
"loss": 3.0889,
"step": 457
},
{
"epoch": 1.49185667752443,
"grad_norm": 4.037430763244629,
"learning_rate": 2.5298461267192476e-06,
"loss": 3.7687,
"step": 458
},
{
"epoch": 1.495114006514658,
"grad_norm": 5.913503646850586,
"learning_rate": 2.521318909996226e-06,
"loss": 3.3295,
"step": 459
},
{
"epoch": 1.498371335504886,
"grad_norm": 6.712876319885254,
"learning_rate": 2.512791445219876e-06,
"loss": 3.6517,
"step": 460
},
{
"epoch": 1.501628664495114,
"grad_norm": 4.9107770919799805,
"learning_rate": 2.5042638316103733e-06,
"loss": 3.6666,
"step": 461
},
{
"epoch": 1.504885993485342,
"grad_norm": 6.594664573669434,
"learning_rate": 2.495736168389627e-06,
"loss": 3.4433,
"step": 462
},
{
"epoch": 1.50814332247557,
"grad_norm": 5.935478687286377,
"learning_rate": 2.487208554780125e-06,
"loss": 3.7178,
"step": 463
},
{
"epoch": 1.511400651465798,
"grad_norm": 3.9866998195648193,
"learning_rate": 2.4786810900037747e-06,
"loss": 3.58,
"step": 464
},
{
"epoch": 1.514657980456026,
"grad_norm": 7.126434803009033,
"learning_rate": 2.4701538732807532e-06,
"loss": 3.2685,
"step": 465
},
{
"epoch": 1.517915309446254,
"grad_norm": 5.334031581878662,
"learning_rate": 2.4616270038283517e-06,
"loss": 3.3296,
"step": 466
},
{
"epoch": 1.521172638436482,
"grad_norm": 5.371250629425049,
"learning_rate": 2.453100580859821e-06,
"loss": 3.4741,
"step": 467
},
{
"epoch": 1.52442996742671,
"grad_norm": 5.606956481933594,
"learning_rate": 2.4445747035832157e-06,
"loss": 3.404,
"step": 468
},
{
"epoch": 1.5276872964169381,
"grad_norm": 5.078698635101318,
"learning_rate": 2.436049471200242e-06,
"loss": 3.587,
"step": 469
},
{
"epoch": 1.5309446254071661,
"grad_norm": 4.911012649536133,
"learning_rate": 2.427524982905102e-06,
"loss": 3.877,
"step": 470
},
{
"epoch": 1.5342019543973942,
"grad_norm": 4.630041122436523,
"learning_rate": 2.4190013378833416e-06,
"loss": 3.8273,
"step": 471
},
{
"epoch": 1.5374592833876222,
"grad_norm": 7.098541259765625,
"learning_rate": 2.4104786353106927e-06,
"loss": 3.8225,
"step": 472
},
{
"epoch": 1.5407166123778502,
"grad_norm": 5.203646183013916,
"learning_rate": 2.4019569743519223e-06,
"loss": 3.4602,
"step": 473
},
{
"epoch": 1.5439739413680782,
"grad_norm": 8.603754997253418,
"learning_rate": 2.393436454159679e-06,
"loss": 2.8917,
"step": 474
},
{
"epoch": 1.5472312703583062,
"grad_norm": 5.113753318786621,
"learning_rate": 2.384917173873336e-06,
"loss": 3.4855,
"step": 475
},
{
"epoch": 1.5504885993485342,
"grad_norm": 5.767629146575928,
"learning_rate": 2.376399232617842e-06,
"loss": 3.3516,
"step": 476
},
{
"epoch": 1.5537459283387622,
"grad_norm": 6.133004188537598,
"learning_rate": 2.3678827295025634e-06,
"loss": 3.726,
"step": 477
},
{
"epoch": 1.5570032573289903,
"grad_norm": 6.120333194732666,
"learning_rate": 2.359367763620135e-06,
"loss": 3.2949,
"step": 478
},
{
"epoch": 1.5602605863192183,
"grad_norm": 7.7329792976379395,
"learning_rate": 2.3508544340453047e-06,
"loss": 3.6583,
"step": 479
},
{
"epoch": 1.5635179153094463,
"grad_norm": 6.699610710144043,
"learning_rate": 2.342342839833782e-06,
"loss": 3.4231,
"step": 480
},
{
"epoch": 1.5667752442996743,
"grad_norm": 10.26463508605957,
"learning_rate": 2.333833080021083e-06,
"loss": 2.65,
"step": 481
},
{
"epoch": 1.5700325732899023,
"grad_norm": 6.48724365234375,
"learning_rate": 2.3253252536213817e-06,
"loss": 3.348,
"step": 482
},
{
"epoch": 1.5732899022801303,
"grad_norm": 4.94360876083374,
"learning_rate": 2.316819459626356e-06,
"loss": 3.6939,
"step": 483
},
{
"epoch": 1.5765472312703583,
"grad_norm": 6.517416954040527,
"learning_rate": 2.3083157970040344e-06,
"loss": 3.9084,
"step": 484
},
{
"epoch": 1.5798045602605864,
"grad_norm": 6.802534103393555,
"learning_rate": 2.2998143646976477e-06,
"loss": 3.08,
"step": 485
},
{
"epoch": 1.5830618892508144,
"grad_norm": 4.083236217498779,
"learning_rate": 2.291315261624477e-06,
"loss": 3.8346,
"step": 486
},
{
"epoch": 1.5863192182410424,
"grad_norm": 6.189084053039551,
"learning_rate": 2.2828185866746993e-06,
"loss": 3.7014,
"step": 487
},
{
"epoch": 1.5895765472312704,
"grad_norm": 6.799473285675049,
"learning_rate": 2.2743244387102404e-06,
"loss": 3.4053,
"step": 488
},
{
"epoch": 1.5928338762214984,
"grad_norm": 6.345086097717285,
"learning_rate": 2.265832916563626e-06,
"loss": 3.4448,
"step": 489
},
{
"epoch": 1.5960912052117264,
"grad_norm": 5.72438383102417,
"learning_rate": 2.2573441190368256e-06,
"loss": 3.5056,
"step": 490
},
{
"epoch": 1.5993485342019544,
"grad_norm": 4.741360187530518,
"learning_rate": 2.2488581449001097e-06,
"loss": 3.7963,
"step": 491
},
{
"epoch": 1.6026058631921825,
"grad_norm": 5.115567207336426,
"learning_rate": 2.240375092890896e-06,
"loss": 3.4823,
"step": 492
},
{
"epoch": 1.6058631921824105,
"grad_norm": 5.392317771911621,
"learning_rate": 2.2318950617126045e-06,
"loss": 3.9584,
"step": 493
},
{
"epoch": 1.6091205211726385,
"grad_norm": 4.737048149108887,
"learning_rate": 2.2234181500335033e-06,
"loss": 3.5862,
"step": 494
},
{
"epoch": 1.6123778501628665,
"grad_norm": 6.983661651611328,
"learning_rate": 2.2149444564855664e-06,
"loss": 3.1922,
"step": 495
},
{
"epoch": 1.6156351791530945,
"grad_norm": 4.231821060180664,
"learning_rate": 2.2064740796633234e-06,
"loss": 3.5761,
"step": 496
},
{
"epoch": 1.6188925081433225,
"grad_norm": 6.307882308959961,
"learning_rate": 2.198007118122711e-06,
"loss": 3.1334,
"step": 497
},
{
"epoch": 1.6221498371335505,
"grad_norm": 7.474777698516846,
"learning_rate": 2.1895436703799305e-06,
"loss": 3.2486,
"step": 498
},
{
"epoch": 1.6254071661237783,
"grad_norm": 5.582712173461914,
"learning_rate": 2.1810838349102963e-06,
"loss": 3.4783,
"step": 499
},
{
"epoch": 1.6286644951140063,
"grad_norm": 6.286925315856934,
"learning_rate": 2.1726277101470933e-06,
"loss": 3.9185,
"step": 500
},
{
"epoch": 1.6319218241042344,
"grad_norm": 4.55825138092041,
"learning_rate": 2.164175394480433e-06,
"loss": 3.6785,
"step": 501
},
{
"epoch": 1.6351791530944624,
"grad_norm": 7.3772430419921875,
"learning_rate": 2.1557269862561045e-06,
"loss": 3.1871,
"step": 502
},
{
"epoch": 1.6384364820846904,
"grad_norm": 7.50354528427124,
"learning_rate": 2.147282583774433e-06,
"loss": 3.1048,
"step": 503
},
{
"epoch": 1.6416938110749184,
"grad_norm": 4.185749530792236,
"learning_rate": 2.138842285289135e-06,
"loss": 3.6414,
"step": 504
},
{
"epoch": 1.6449511400651464,
"grad_norm": 6.309109687805176,
"learning_rate": 2.1304061890061764e-06,
"loss": 3.3061,
"step": 505
},
{
"epoch": 1.6482084690553744,
"grad_norm": 6.976665019989014,
"learning_rate": 2.1219743930826284e-06,
"loss": 3.415,
"step": 506
},
{
"epoch": 1.6514657980456025,
"grad_norm": 7.75382661819458,
"learning_rate": 2.1135469956255254e-06,
"loss": 3.1916,
"step": 507
},
{
"epoch": 1.6547231270358305,
"grad_norm": 4.2923736572265625,
"learning_rate": 2.1051240946907252e-06,
"loss": 3.7138,
"step": 508
},
{
"epoch": 1.6579804560260585,
"grad_norm": 5.76967716217041,
"learning_rate": 2.0967057882817664e-06,
"loss": 3.7076,
"step": 509
},
{
"epoch": 1.6612377850162865,
"grad_norm": 5.392721176147461,
"learning_rate": 2.0882921743487283e-06,
"loss": 3.4379,
"step": 510
},
{
"epoch": 1.6644951140065145,
"grad_norm": 5.194660663604736,
"learning_rate": 2.079883350787092e-06,
"loss": 3.8899,
"step": 511
},
{
"epoch": 1.6677524429967425,
"grad_norm": 6.226246356964111,
"learning_rate": 2.0714794154366e-06,
"loss": 3.6434,
"step": 512
},
{
"epoch": 1.6710097719869705,
"grad_norm": 6.618658542633057,
"learning_rate": 2.0630804660801203e-06,
"loss": 3.1193,
"step": 513
},
{
"epoch": 1.6742671009771986,
"grad_norm": 6.338504314422607,
"learning_rate": 2.0546866004425053e-06,
"loss": 3.2267,
"step": 514
},
{
"epoch": 1.6775244299674266,
"grad_norm": 6.449606418609619,
"learning_rate": 2.046297916189457e-06,
"loss": 3.8425,
"step": 515
},
{
"epoch": 1.6807817589576546,
"grad_norm": 5.436049938201904,
"learning_rate": 2.0379145109263914e-06,
"loss": 3.7856,
"step": 516
},
{
"epoch": 1.6840390879478826,
"grad_norm": 4.790528297424316,
"learning_rate": 2.0295364821972996e-06,
"loss": 3.3842,
"step": 517
},
{
"epoch": 1.6872964169381106,
"grad_norm": 4.875779151916504,
"learning_rate": 2.0211639274836155e-06,
"loss": 3.9334,
"step": 518
},
{
"epoch": 1.6905537459283386,
"grad_norm": 8.919984817504883,
"learning_rate": 2.0127969442030816e-06,
"loss": 3.0129,
"step": 519
},
{
"epoch": 1.6938110749185666,
"grad_norm": 4.52994966506958,
"learning_rate": 2.0044356297086136e-06,
"loss": 3.7951,
"step": 520
},
{
"epoch": 1.6970684039087947,
"grad_norm": 5.16178035736084,
"learning_rate": 1.996080081287169e-06,
"loss": 3.7712,
"step": 521
},
{
"epoch": 1.7003257328990227,
"grad_norm": 5.213560104370117,
"learning_rate": 1.987730396158615e-06,
"loss": 3.6248,
"step": 522
},
{
"epoch": 1.7035830618892507,
"grad_norm": 5.562010765075684,
"learning_rate": 1.979386671474598e-06,
"loss": 3.8039,
"step": 523
},
{
"epoch": 1.7068403908794787,
"grad_norm": 5.998531341552734,
"learning_rate": 1.9710490043174118e-06,
"loss": 3.6943,
"step": 524
},
{
"epoch": 1.7100977198697067,
"grad_norm": 6.784372329711914,
"learning_rate": 1.962717491698869e-06,
"loss": 3.2823,
"step": 525
},
{
"epoch": 1.7133550488599347,
"grad_norm": 5.637960433959961,
"learning_rate": 1.9543922305591708e-06,
"loss": 3.4504,
"step": 526
},
{
"epoch": 1.7166123778501627,
"grad_norm": 8.034090995788574,
"learning_rate": 1.9460733177657813e-06,
"loss": 3.0897,
"step": 527
},
{
"epoch": 1.7198697068403908,
"grad_norm": 5.726444244384766,
"learning_rate": 1.937760850112299e-06,
"loss": 3.5505,
"step": 528
},
{
"epoch": 1.7231270358306188,
"grad_norm": 6.473389148712158,
"learning_rate": 1.9294549243173306e-06,
"loss": 3.2587,
"step": 529
},
{
"epoch": 1.7263843648208468,
"grad_norm": 4.210171222686768,
"learning_rate": 1.9211556370233652e-06,
"loss": 3.661,
"step": 530
},
{
"epoch": 1.7296416938110748,
"grad_norm": 4.9132280349731445,
"learning_rate": 1.9128630847956535e-06,
"loss": 3.6479,
"step": 531
},
{
"epoch": 1.7328990228013028,
"grad_norm": 6.2500176429748535,
"learning_rate": 1.9045773641210772e-06,
"loss": 3.1326,
"step": 532
},
{
"epoch": 1.7361563517915308,
"grad_norm": 4.56230354309082,
"learning_rate": 1.8962985714070327e-06,
"loss": 3.7075,
"step": 533
},
{
"epoch": 1.7394136807817588,
"grad_norm": 5.635771751403809,
"learning_rate": 1.8880268029803072e-06,
"loss": 3.8035,
"step": 534
},
{
"epoch": 1.7426710097719869,
"grad_norm": 6.24404239654541,
"learning_rate": 1.8797621550859563e-06,
"loss": 3.1673,
"step": 535
},
{
"epoch": 1.7459283387622149,
"grad_norm": 5.211395740509033,
"learning_rate": 1.871504723886187e-06,
"loss": 3.6729,
"step": 536
},
{
"epoch": 1.7491856677524429,
"grad_norm": 6.440443515777588,
"learning_rate": 1.8632546054592365e-06,
"loss": 3.3578,
"step": 537
},
{
"epoch": 1.752442996742671,
"grad_norm": 5.6076436042785645,
"learning_rate": 1.855011895798255e-06,
"loss": 3.2926,
"step": 538
},
{
"epoch": 1.755700325732899,
"grad_norm": 4.676731586456299,
"learning_rate": 1.8467766908101908e-06,
"loss": 3.6138,
"step": 539
},
{
"epoch": 1.758957654723127,
"grad_norm": 4.973746299743652,
"learning_rate": 1.8385490863146707e-06,
"loss": 3.7079,
"step": 540
},
{
"epoch": 1.762214983713355,
"grad_norm": 6.0938897132873535,
"learning_rate": 1.8303291780428879e-06,
"loss": 3.5518,
"step": 541
},
{
"epoch": 1.765472312703583,
"grad_norm": 4.93472957611084,
"learning_rate": 1.8221170616364864e-06,
"loss": 3.4361,
"step": 542
},
{
"epoch": 1.768729641693811,
"grad_norm": 5.908854007720947,
"learning_rate": 1.8139128326464495e-06,
"loss": 3.7288,
"step": 543
},
{
"epoch": 1.771986970684039,
"grad_norm": 6.220162868499756,
"learning_rate": 1.805716586531988e-06,
"loss": 2.9277,
"step": 544
},
{
"epoch": 1.775244299674267,
"grad_norm": 4.5047287940979,
"learning_rate": 1.7975284186594286e-06,
"loss": 3.7543,
"step": 545
},
{
"epoch": 1.778501628664495,
"grad_norm": 6.460538864135742,
"learning_rate": 1.789348424301104e-06,
"loss": 3.4487,
"step": 546
},
{
"epoch": 1.781758957654723,
"grad_norm": 5.632236480712891,
"learning_rate": 1.781176698634246e-06,
"loss": 3.667,
"step": 547
},
{
"epoch": 1.785016286644951,
"grad_norm": 6.939136028289795,
"learning_rate": 1.7730133367398775e-06,
"loss": 3.1925,
"step": 548
},
{
"epoch": 1.788273615635179,
"grad_norm": 6.888382434844971,
"learning_rate": 1.7648584336017044e-06,
"loss": 3.2651,
"step": 549
},
{
"epoch": 1.791530944625407,
"grad_norm": 5.201353549957275,
"learning_rate": 1.7567120841050133e-06,
"loss": 3.905,
"step": 550
},
{
"epoch": 1.794788273615635,
"grad_norm": 6.2696309089660645,
"learning_rate": 1.748574383035565e-06,
"loss": 3.2628,
"step": 551
},
{
"epoch": 1.798045602605863,
"grad_norm": 5.081679821014404,
"learning_rate": 1.740445425078493e-06,
"loss": 3.4254,
"step": 552
},
{
"epoch": 1.8013029315960911,
"grad_norm": 5.418735980987549,
"learning_rate": 1.7323253048172015e-06,
"loss": 3.6783,
"step": 553
},
{
"epoch": 1.8045602605863191,
"grad_norm": 6.63270378112793,
"learning_rate": 1.7242141167322656e-06,
"loss": 3.6155,
"step": 554
},
{
"epoch": 1.8078175895765471,
"grad_norm": 5.2097625732421875,
"learning_rate": 1.7161119552003303e-06,
"loss": 3.4332,
"step": 555
},
{
"epoch": 1.8110749185667752,
"grad_norm": 7.32343053817749,
"learning_rate": 1.7080189144930136e-06,
"loss": 3.512,
"step": 556
},
{
"epoch": 1.8143322475570032,
"grad_norm": 6.2383012771606445,
"learning_rate": 1.6999350887758098e-06,
"loss": 3.2288,
"step": 557
},
{
"epoch": 1.8175895765472312,
"grad_norm": 6.8209733963012695,
"learning_rate": 1.6918605721069925e-06,
"loss": 3.2334,
"step": 558
},
{
"epoch": 1.8208469055374592,
"grad_norm": 4.392072677612305,
"learning_rate": 1.6837954584365217e-06,
"loss": 3.6259,
"step": 559
},
{
"epoch": 1.8241042345276872,
"grad_norm": 6.251183986663818,
"learning_rate": 1.6757398416049502e-06,
"loss": 3.3141,
"step": 560
},
{
"epoch": 1.8273615635179152,
"grad_norm": 7.095953941345215,
"learning_rate": 1.6676938153423312e-06,
"loss": 3.4431,
"step": 561
},
{
"epoch": 1.8306188925081432,
"grad_norm": 7.436147212982178,
"learning_rate": 1.659657473267129e-06,
"loss": 3.1196,
"step": 562
},
{
"epoch": 1.8338762214983713,
"grad_norm": 4.6533989906311035,
"learning_rate": 1.6516309088851273e-06,
"loss": 3.4344,
"step": 563
},
{
"epoch": 1.8371335504885993,
"grad_norm": 8.499824523925781,
"learning_rate": 1.6436142155883442e-06,
"loss": 3.129,
"step": 564
},
{
"epoch": 1.8403908794788273,
"grad_norm": 6.272755146026611,
"learning_rate": 1.6356074866539434e-06,
"loss": 3.5271,
"step": 565
},
{
"epoch": 1.8436482084690553,
"grad_norm": 5.055791854858398,
"learning_rate": 1.6276108152431497e-06,
"loss": 3.4117,
"step": 566
},
{
"epoch": 1.8469055374592833,
"grad_norm": 5.689114570617676,
"learning_rate": 1.619624294400165e-06,
"loss": 3.5649,
"step": 567
},
{
"epoch": 1.8501628664495113,
"grad_norm": 6.562279224395752,
"learning_rate": 1.6116480170510852e-06,
"loss": 3.3717,
"step": 568
},
{
"epoch": 1.8534201954397393,
"grad_norm": 4.955641269683838,
"learning_rate": 1.6036820760028202e-06,
"loss": 3.4331,
"step": 569
},
{
"epoch": 1.8566775244299674,
"grad_norm": 6.789297103881836,
"learning_rate": 1.5957265639420128e-06,
"loss": 3.8255,
"step": 570
},
{
"epoch": 1.8599348534201954,
"grad_norm": 4.328821659088135,
"learning_rate": 1.5877815734339608e-06,
"loss": 3.5312,
"step": 571
},
{
"epoch": 1.8631921824104234,
"grad_norm": 8.573647499084473,
"learning_rate": 1.5798471969215394e-06,
"loss": 3.3997,
"step": 572
},
{
"epoch": 1.8664495114006514,
"grad_norm": 5.302987098693848,
"learning_rate": 1.5719235267241273e-06,
"loss": 3.4652,
"step": 573
},
{
"epoch": 1.8697068403908794,
"grad_norm": 6.52237606048584,
"learning_rate": 1.5640106550365298e-06,
"loss": 3.5065,
"step": 574
},
{
"epoch": 1.8729641693811074,
"grad_norm": 6.677807807922363,
"learning_rate": 1.556108673927908e-06,
"loss": 2.987,
"step": 575
},
{
"epoch": 1.8762214983713354,
"grad_norm": 5.502460956573486,
"learning_rate": 1.5482176753407075e-06,
"loss": 3.6778,
"step": 576
},
{
"epoch": 1.8794788273615635,
"grad_norm": 5.219401836395264,
"learning_rate": 1.54033775108959e-06,
"loss": 3.5377,
"step": 577
},
{
"epoch": 1.8827361563517915,
"grad_norm": 6.289682865142822,
"learning_rate": 1.5324689928603586e-06,
"loss": 3.4786,
"step": 578
},
{
"epoch": 1.8859934853420195,
"grad_norm": 5.785194396972656,
"learning_rate": 1.5246114922089e-06,
"loss": 3.5605,
"step": 579
},
{
"epoch": 1.8892508143322475,
"grad_norm": 5.868037700653076,
"learning_rate": 1.5167653405601125e-06,
"loss": 3.5138,
"step": 580
},
{
"epoch": 1.8925081433224755,
"grad_norm": 5.104717254638672,
"learning_rate": 1.5089306292068456e-06,
"loss": 3.6772,
"step": 581
},
{
"epoch": 1.8957654723127035,
"grad_norm": 6.35374641418457,
"learning_rate": 1.5011074493088372e-06,
"loss": 3.8539,
"step": 582
},
{
"epoch": 1.8990228013029316,
"grad_norm": 7.706153392791748,
"learning_rate": 1.4932958918916512e-06,
"loss": 3.5753,
"step": 583
},
{
"epoch": 1.9022801302931596,
"grad_norm": 6.74048376083374,
"learning_rate": 1.4854960478456207e-06,
"loss": 3.5602,
"step": 584
},
{
"epoch": 1.9055374592833876,
"grad_norm": 6.806337833404541,
"learning_rate": 1.4777080079247884e-06,
"loss": 3.5366,
"step": 585
},
{
"epoch": 1.9087947882736156,
"grad_norm": 6.2833428382873535,
"learning_rate": 1.469931862745853e-06,
"loss": 3.1881,
"step": 586
},
{
"epoch": 1.9120521172638436,
"grad_norm": 8.585888862609863,
"learning_rate": 1.4621677027871129e-06,
"loss": 3.4442,
"step": 587
},
{
"epoch": 1.9153094462540716,
"grad_norm": 4.6327433586120605,
"learning_rate": 1.4544156183874129e-06,
"loss": 3.7583,
"step": 588
},
{
"epoch": 1.9185667752442996,
"grad_norm": 5.917350769042969,
"learning_rate": 1.446675699745097e-06,
"loss": 3.7279,
"step": 589
},
{
"epoch": 1.9218241042345277,
"grad_norm": 5.715301990509033,
"learning_rate": 1.4389480369169528e-06,
"loss": 3.2236,
"step": 590
},
{
"epoch": 1.9250814332247557,
"grad_norm": 6.128412246704102,
"learning_rate": 1.4312327198171705e-06,
"loss": 3.5584,
"step": 591
},
{
"epoch": 1.9283387622149837,
"grad_norm": 4.744357109069824,
"learning_rate": 1.42352983821629e-06,
"loss": 3.6764,
"step": 592
},
{
"epoch": 1.9315960912052117,
"grad_norm": 6.520962715148926,
"learning_rate": 1.4158394817401611e-06,
"loss": 3.179,
"step": 593
},
{
"epoch": 1.9348534201954397,
"grad_norm": 6.759045124053955,
"learning_rate": 1.408161739868901e-06,
"loss": 3.6262,
"step": 594
},
{
"epoch": 1.9381107491856677,
"grad_norm": 5.7115936279296875,
"learning_rate": 1.400496701935847e-06,
"loss": 3.6611,
"step": 595
},
{
"epoch": 1.9413680781758957,
"grad_norm": 7.379328727722168,
"learning_rate": 1.3928444571265262e-06,
"loss": 3.1594,
"step": 596
},
{
"epoch": 1.9446254071661238,
"grad_norm": 5.522282600402832,
"learning_rate": 1.3852050944776088e-06,
"loss": 3.6881,
"step": 597
},
{
"epoch": 1.9478827361563518,
"grad_norm": 6.506474494934082,
"learning_rate": 1.37757870287588e-06,
"loss": 3.5295,
"step": 598
},
{
"epoch": 1.9511400651465798,
"grad_norm": 5.814865589141846,
"learning_rate": 1.3699653710571987e-06,
"loss": 3.5468,
"step": 599
},
{
"epoch": 1.9543973941368078,
"grad_norm": 4.4180908203125,
"learning_rate": 1.362365187605471e-06,
"loss": 3.7826,
"step": 600
},
{
"epoch": 1.9576547231270358,
"grad_norm": 6.508097171783447,
"learning_rate": 1.354778240951617e-06,
"loss": 3.8519,
"step": 601
},
{
"epoch": 1.9609120521172638,
"grad_norm": 5.066211700439453,
"learning_rate": 1.3472046193725386e-06,
"loss": 3.6248,
"step": 602
},
{
"epoch": 1.9641693811074918,
"grad_norm": 5.988237380981445,
"learning_rate": 1.3396444109901008e-06,
"loss": 3.5285,
"step": 603
},
{
"epoch": 1.9674267100977199,
"grad_norm": 5.127301216125488,
"learning_rate": 1.3320977037700952e-06,
"loss": 3.7062,
"step": 604
},
{
"epoch": 1.9706840390879479,
"grad_norm": 5.3811798095703125,
"learning_rate": 1.324564585521228e-06,
"loss": 3.4126,
"step": 605
},
{
"epoch": 1.9739413680781759,
"grad_norm": 7.705234527587891,
"learning_rate": 1.3170451438940882e-06,
"loss": 3.2128,
"step": 606
},
{
"epoch": 1.977198697068404,
"grad_norm": 4.88129997253418,
"learning_rate": 1.3095394663801348e-06,
"loss": 3.7014,
"step": 607
},
{
"epoch": 1.980456026058632,
"grad_norm": 7.479042053222656,
"learning_rate": 1.302047640310677e-06,
"loss": 3.2829,
"step": 608
},
{
"epoch": 1.98371335504886,
"grad_norm": 4.332832336425781,
"learning_rate": 1.2945697528558542e-06,
"loss": 3.6514,
"step": 609
},
{
"epoch": 1.986970684039088,
"grad_norm": 4.368464946746826,
"learning_rate": 1.2871058910236293e-06,
"loss": 3.7944,
"step": 610
},
{
"epoch": 1.990228013029316,
"grad_norm": 6.625438213348389,
"learning_rate": 1.2796561416587666e-06,
"loss": 3.5045,
"step": 611
},
{
"epoch": 1.993485342019544,
"grad_norm": 4.590055465698242,
"learning_rate": 1.2722205914418318e-06,
"loss": 3.29,
"step": 612
},
{
"epoch": 1.996742671009772,
"grad_norm": 5.7786335945129395,
"learning_rate": 1.2647993268881744e-06,
"loss": 3.3603,
"step": 613
},
{
"epoch": 2.0,
"grad_norm": 5.448220252990723,
"learning_rate": 1.2573924343469274e-06,
"loss": 3.4126,
"step": 614
},
{
"epoch": 2.003257328990228,
"grad_norm": 4.431787490844727,
"learning_rate": 1.2500000000000007e-06,
"loss": 3.6297,
"step": 615
},
{
"epoch": 2.006514657980456,
"grad_norm": 7.178677558898926,
"learning_rate": 1.242622109861074e-06,
"loss": 3.1546,
"step": 616
},
{
"epoch": 2.009771986970684,
"grad_norm": 6.207696914672852,
"learning_rate": 1.2352588497746046e-06,
"loss": 3.1396,
"step": 617
},
{
"epoch": 2.013029315960912,
"grad_norm": 5.861324787139893,
"learning_rate": 1.2279103054148197e-06,
"loss": 3.0488,
"step": 618
},
{
"epoch": 2.01628664495114,
"grad_norm": 5.195066928863525,
"learning_rate": 1.2205765622847273e-06,
"loss": 3.2397,
"step": 619
},
{
"epoch": 2.019543973941368,
"grad_norm": 4.896651744842529,
"learning_rate": 1.2132577057151138e-06,
"loss": 3.5348,
"step": 620
},
{
"epoch": 2.022801302931596,
"grad_norm": 6.000244617462158,
"learning_rate": 1.2059538208635587e-06,
"loss": 3.1314,
"step": 621
},
{
"epoch": 2.026058631921824,
"grad_norm": 5.479099273681641,
"learning_rate": 1.1986649927134371e-06,
"loss": 3.6143,
"step": 622
},
{
"epoch": 2.029315960912052,
"grad_norm": 6.785484313964844,
"learning_rate": 1.1913913060729356e-06,
"loss": 2.8881,
"step": 623
},
{
"epoch": 2.03257328990228,
"grad_norm": 5.43539571762085,
"learning_rate": 1.1841328455740644e-06,
"loss": 3.297,
"step": 624
},
{
"epoch": 2.035830618892508,
"grad_norm": 6.371501445770264,
"learning_rate": 1.1768896956716693e-06,
"loss": 3.6874,
"step": 625
},
{
"epoch": 2.039087947882736,
"grad_norm": 5.3042707443237305,
"learning_rate": 1.169661940642455e-06,
"loss": 3.2485,
"step": 626
},
{
"epoch": 2.042345276872964,
"grad_norm": 5.197566032409668,
"learning_rate": 1.1624496645839975e-06,
"loss": 3.2668,
"step": 627
},
{
"epoch": 2.045602605863192,
"grad_norm": 7.447043418884277,
"learning_rate": 1.1552529514137734e-06,
"loss": 3.3702,
"step": 628
},
{
"epoch": 2.04885993485342,
"grad_norm": 4.471124172210693,
"learning_rate": 1.1480718848681752e-06,
"loss": 3.47,
"step": 629
},
{
"epoch": 2.0521172638436482,
"grad_norm": 6.143259048461914,
"learning_rate": 1.1409065485015445e-06,
"loss": 3.1034,
"step": 630
},
{
"epoch": 2.0553745928338762,
"grad_norm": 6.116540908813477,
"learning_rate": 1.1337570256851962e-06,
"loss": 3.4506,
"step": 631
},
{
"epoch": 2.0586319218241043,
"grad_norm": 6.157170295715332,
"learning_rate": 1.1266233996064457e-06,
"loss": 3.4515,
"step": 632
},
{
"epoch": 2.0618892508143323,
"grad_norm": 5.997113227844238,
"learning_rate": 1.1195057532676487e-06,
"loss": 3.0388,
"step": 633
},
{
"epoch": 2.0651465798045603,
"grad_norm": 5.394718647003174,
"learning_rate": 1.112404169485226e-06,
"loss": 3.0905,
"step": 634
},
{
"epoch": 2.0684039087947883,
"grad_norm": 8.713213920593262,
"learning_rate": 1.1053187308887087e-06,
"loss": 2.8543,
"step": 635
},
{
"epoch": 2.0716612377850163,
"grad_norm": 5.968896389007568,
"learning_rate": 1.098249519919769e-06,
"loss": 3.3605,
"step": 636
},
{
"epoch": 2.0749185667752443,
"grad_norm": 7.154015064239502,
"learning_rate": 1.091196618831268e-06,
"loss": 2.6968,
"step": 637
},
{
"epoch": 2.0781758957654723,
"grad_norm": 6.021311283111572,
"learning_rate": 1.084160109686295e-06,
"loss": 3.3583,
"step": 638
},
{
"epoch": 2.0814332247557004,
"grad_norm": 5.368682861328125,
"learning_rate": 1.07714007435721e-06,
"loss": 3.7121,
"step": 639
},
{
"epoch": 2.0846905537459284,
"grad_norm": 5.130136966705322,
"learning_rate": 1.070136594524698e-06,
"loss": 3.2675,
"step": 640
},
{
"epoch": 2.0879478827361564,
"grad_norm": 6.868380069732666,
"learning_rate": 1.0631497516768113e-06,
"loss": 3.0014,
"step": 641
},
{
"epoch": 2.0912052117263844,
"grad_norm": 7.281705379486084,
"learning_rate": 1.0561796271080283e-06,
"loss": 3.2664,
"step": 642
},
{
"epoch": 2.0944625407166124,
"grad_norm": 4.6429443359375,
"learning_rate": 1.0492263019183002e-06,
"loss": 3.177,
"step": 643
},
{
"epoch": 2.0977198697068404,
"grad_norm": 7.4768571853637695,
"learning_rate": 1.042289857012115e-06,
"loss": 2.9303,
"step": 644
},
{
"epoch": 2.1009771986970684,
"grad_norm": 5.8586745262146,
"learning_rate": 1.0353703730975493e-06,
"loss": 3.388,
"step": 645
},
{
"epoch": 2.1042345276872965,
"grad_norm": 4.739225387573242,
"learning_rate": 1.0284679306853343e-06,
"loss": 3.1248,
"step": 646
},
{
"epoch": 2.1074918566775245,
"grad_norm": 7.75331449508667,
"learning_rate": 1.0215826100879175e-06,
"loss": 2.3332,
"step": 647
},
{
"epoch": 2.1107491856677525,
"grad_norm": 4.79758882522583,
"learning_rate": 1.0147144914185253e-06,
"loss": 3.2238,
"step": 648
},
{
"epoch": 2.1140065146579805,
"grad_norm": 4.328355312347412,
"learning_rate": 1.0078636545902363e-06,
"loss": 3.3991,
"step": 649
},
{
"epoch": 2.1172638436482085,
"grad_norm": 4.247138023376465,
"learning_rate": 1.0010301793150456e-06,
"loss": 3.5185,
"step": 650
},
{
"epoch": 2.1205211726384365,
"grad_norm": 6.359057903289795,
"learning_rate": 9.942141451029436e-07,
"loss": 3.6669,
"step": 651
},
{
"epoch": 2.1237785016286646,
"grad_norm": 7.362981796264648,
"learning_rate": 9.874156312609837e-07,
"loss": 2.862,
"step": 652
},
{
"epoch": 2.1270358306188926,
"grad_norm": 8.532692909240723,
"learning_rate": 9.806347168923667e-07,
"loss": 2.494,
"step": 653
},
{
"epoch": 2.1302931596091206,
"grad_norm": 8.926546096801758,
"learning_rate": 9.738714808955167e-07,
"loss": 2.6494,
"step": 654
},
{
"epoch": 2.1335504885993486,
"grad_norm": 4.513092517852783,
"learning_rate": 9.671260019631603e-07,
"loss": 3.2468,
"step": 655
},
{
"epoch": 2.1368078175895766,
"grad_norm": 6.006774425506592,
"learning_rate": 9.603983585814188e-07,
"loss": 3.3401,
"step": 656
},
{
"epoch": 2.1400651465798046,
"grad_norm": 6.242079734802246,
"learning_rate": 9.53688629028886e-07,
"loss": 3.1088,
"step": 657
},
{
"epoch": 2.1433224755700326,
"grad_norm": 5.460093021392822,
"learning_rate": 9.469968913757254e-07,
"loss": 3.3652,
"step": 658
},
{
"epoch": 2.1465798045602607,
"grad_norm": 7.09798002243042,
"learning_rate": 9.403232234827548e-07,
"loss": 3.2013,
"step": 659
},
{
"epoch": 2.1498371335504887,
"grad_norm": 8.465774536132812,
"learning_rate": 9.336677030005459e-07,
"loss": 2.8236,
"step": 660
},
{
"epoch": 2.1530944625407167,
"grad_norm": 7.581086158752441,
"learning_rate": 9.270304073685193e-07,
"loss": 2.9084,
"step": 661
},
{
"epoch": 2.1563517915309447,
"grad_norm": 6.972714424133301,
"learning_rate": 9.2041141381404e-07,
"loss": 3.0302,
"step": 662
},
{
"epoch": 2.1596091205211727,
"grad_norm": 8.414934158325195,
"learning_rate": 9.138107993515244e-07,
"loss": 2.6214,
"step": 663
},
{
"epoch": 2.1628664495114007,
"grad_norm": 8.747556686401367,
"learning_rate": 9.07228640781539e-07,
"loss": 3.0217,
"step": 664
},
{
"epoch": 2.1661237785016287,
"grad_norm": 4.145984172821045,
"learning_rate": 9.006650146899121e-07,
"loss": 3.6553,
"step": 665
},
{
"epoch": 2.1693811074918568,
"grad_norm": 6.007801532745361,
"learning_rate": 8.941199974468362e-07,
"loss": 2.8565,
"step": 666
},
{
"epoch": 2.1726384364820848,
"grad_norm": 5.257616996765137,
"learning_rate": 8.875936652059872e-07,
"loss": 3.7403,
"step": 667
},
{
"epoch": 2.175895765472313,
"grad_norm": 5.72130823135376,
"learning_rate": 8.810860939036301e-07,
"loss": 3.1205,
"step": 668
},
{
"epoch": 2.179153094462541,
"grad_norm": 4.439971446990967,
"learning_rate": 8.745973592577417e-07,
"loss": 3.4111,
"step": 669
},
{
"epoch": 2.182410423452769,
"grad_norm": 5.781883239746094,
"learning_rate": 8.681275367671288e-07,
"loss": 3.0331,
"step": 670
},
{
"epoch": 2.185667752442997,
"grad_norm": 7.093985080718994,
"learning_rate": 8.616767017105443e-07,
"loss": 2.986,
"step": 671
},
{
"epoch": 2.188925081433225,
"grad_norm": 5.910228252410889,
"learning_rate": 8.552449291458198e-07,
"loss": 3.4674,
"step": 672
},
{
"epoch": 2.192182410423453,
"grad_norm": 6.024210453033447,
"learning_rate": 8.488322939089838e-07,
"loss": 3.3857,
"step": 673
},
{
"epoch": 2.195439739413681,
"grad_norm": 6.586373805999756,
"learning_rate": 8.424388706133984e-07,
"loss": 3.4031,
"step": 674
},
{
"epoch": 2.198697068403909,
"grad_norm": 7.641358852386475,
"learning_rate": 8.360647336488847e-07,
"loss": 2.7999,
"step": 675
},
{
"epoch": 2.201954397394137,
"grad_norm": 7.232587814331055,
"learning_rate": 8.297099571808626e-07,
"loss": 3.169,
"step": 676
},
{
"epoch": 2.205211726384365,
"grad_norm": 5.160323143005371,
"learning_rate": 8.233746151494856e-07,
"loss": 3.1441,
"step": 677
},
{
"epoch": 2.208469055374593,
"grad_norm": 6.639790058135986,
"learning_rate": 8.170587812687777e-07,
"loss": 3.3241,
"step": 678
},
{
"epoch": 2.211726384364821,
"grad_norm": 6.677544593811035,
"learning_rate": 8.10762529025782e-07,
"loss": 3.159,
"step": 679
},
{
"epoch": 2.214983713355049,
"grad_norm": 5.903120994567871,
"learning_rate": 8.044859316796988e-07,
"loss": 3.3389,
"step": 680
},
{
"epoch": 2.218241042345277,
"grad_norm": 4.54970121383667,
"learning_rate": 7.982290622610392e-07,
"loss": 3.665,
"step": 681
},
{
"epoch": 2.221498371335505,
"grad_norm": 6.834527969360352,
"learning_rate": 7.919919935707702e-07,
"loss": 2.8488,
"step": 682
},
{
"epoch": 2.224755700325733,
"grad_norm": 5.058896541595459,
"learning_rate": 7.857747981794717e-07,
"loss": 3.2633,
"step": 683
},
{
"epoch": 2.228013029315961,
"grad_norm": 5.747068405151367,
"learning_rate": 7.795775484264911e-07,
"loss": 3.2937,
"step": 684
},
{
"epoch": 2.231270358306189,
"grad_norm": 5.85347318649292,
"learning_rate": 7.734003164190984e-07,
"loss": 3.382,
"step": 685
},
{
"epoch": 2.234527687296417,
"grad_norm": 6.000707626342773,
"learning_rate": 7.672431740316527e-07,
"loss": 3.2161,
"step": 686
},
{
"epoch": 2.237785016286645,
"grad_norm": 7.966262340545654,
"learning_rate": 7.611061929047603e-07,
"loss": 3.3168,
"step": 687
},
{
"epoch": 2.241042345276873,
"grad_norm": 5.129033088684082,
"learning_rate": 7.54989444444447e-07,
"loss": 3.249,
"step": 688
},
{
"epoch": 2.244299674267101,
"grad_norm": 5.477846622467041,
"learning_rate": 7.488929998213202e-07,
"loss": 3.1673,
"step": 689
},
{
"epoch": 2.247557003257329,
"grad_norm": 6.5664472579956055,
"learning_rate": 7.42816929969748e-07,
"loss": 2.7741,
"step": 690
},
{
"epoch": 2.250814332247557,
"grad_norm": 5.1526970863342285,
"learning_rate": 7.367613055870302e-07,
"loss": 3.2483,
"step": 691
},
{
"epoch": 2.254071661237785,
"grad_norm": 4.2331719398498535,
"learning_rate": 7.307261971325733e-07,
"loss": 3.499,
"step": 692
},
{
"epoch": 2.257328990228013,
"grad_norm": 6.132472991943359,
"learning_rate": 7.247116748270774e-07,
"loss": 3.4622,
"step": 693
},
{
"epoch": 2.260586319218241,
"grad_norm": 6.091642379760742,
"learning_rate": 7.187178086517116e-07,
"loss": 3.1034,
"step": 694
},
{
"epoch": 2.263843648208469,
"grad_norm": 7.409130096435547,
"learning_rate": 7.127446683473066e-07,
"loss": 3.3242,
"step": 695
},
{
"epoch": 2.267100977198697,
"grad_norm": 4.992582321166992,
"learning_rate": 7.067923234135368e-07,
"loss": 3.578,
"step": 696
},
{
"epoch": 2.270358306188925,
"grad_norm": 7.707672595977783,
"learning_rate": 7.008608431081179e-07,
"loss": 3.3818,
"step": 697
},
{
"epoch": 2.273615635179153,
"grad_norm": 4.5212483406066895,
"learning_rate": 6.949502964459951e-07,
"loss": 3.4887,
"step": 698
},
{
"epoch": 2.2768729641693812,
"grad_norm": 5.691729545593262,
"learning_rate": 6.890607521985454e-07,
"loss": 3.2088,
"step": 699
},
{
"epoch": 2.2801302931596092,
"grad_norm": 5.936895847320557,
"learning_rate": 6.831922788927744e-07,
"loss": 3.0461,
"step": 700
},
{
"epoch": 2.2833876221498373,
"grad_norm": 5.542623043060303,
"learning_rate": 6.773449448105182e-07,
"loss": 3.5307,
"step": 701
},
{
"epoch": 2.2866449511400653,
"grad_norm": 6.767984867095947,
"learning_rate": 6.715188179876525e-07,
"loss": 2.976,
"step": 702
},
{
"epoch": 2.2899022801302933,
"grad_norm": 5.975040435791016,
"learning_rate": 6.657139662132961e-07,
"loss": 3.1286,
"step": 703
},
{
"epoch": 2.2931596091205213,
"grad_norm": 6.311220169067383,
"learning_rate": 6.59930457029028e-07,
"loss": 3.4112,
"step": 704
},
{
"epoch": 2.2964169381107493,
"grad_norm": 6.7899932861328125,
"learning_rate": 6.541683577280947e-07,
"loss": 3.283,
"step": 705
},
{
"epoch": 2.2996742671009773,
"grad_norm": 4.831838607788086,
"learning_rate": 6.484277353546342e-07,
"loss": 3.481,
"step": 706
},
{
"epoch": 2.3029315960912053,
"grad_norm": 8.137195587158203,
"learning_rate": 6.427086567028912e-07,
"loss": 2.7856,
"step": 707
},
{
"epoch": 2.3061889250814334,
"grad_norm": 5.096379280090332,
"learning_rate": 6.370111883164406e-07,
"loss": 3.2754,
"step": 708
},
{
"epoch": 2.3094462540716614,
"grad_norm": 4.889212608337402,
"learning_rate": 6.313353964874155e-07,
"loss": 3.4334,
"step": 709
},
{
"epoch": 2.3127035830618894,
"grad_norm": 5.239722728729248,
"learning_rate": 6.256813472557322e-07,
"loss": 3.4799,
"step": 710
},
{
"epoch": 2.3159609120521174,
"grad_norm": 5.829878807067871,
"learning_rate": 6.200491064083264e-07,
"loss": 3.2489,
"step": 711
},
{
"epoch": 2.3192182410423454,
"grad_norm": 6.974037170410156,
"learning_rate": 6.144387394783829e-07,
"loss": 3.3498,
"step": 712
},
{
"epoch": 2.3224755700325734,
"grad_norm": 4.331153869628906,
"learning_rate": 6.088503117445774e-07,
"loss": 3.3922,
"step": 713
},
{
"epoch": 2.3257328990228014,
"grad_norm": 6.132293224334717,
"learning_rate": 6.032838882303144e-07,
"loss": 3.5287,
"step": 714
},
{
"epoch": 2.3289902280130295,
"grad_norm": 4.317237377166748,
"learning_rate": 5.977395337029701e-07,
"loss": 3.4481,
"step": 715
},
{
"epoch": 2.3322475570032575,
"grad_norm": 6.361827373504639,
"learning_rate": 5.922173126731418e-07,
"loss": 3.3759,
"step": 716
},
{
"epoch": 2.3355048859934855,
"grad_norm": 6.041193008422852,
"learning_rate": 5.867172893938936e-07,
"loss": 3.0176,
"step": 717
},
{
"epoch": 2.3387622149837135,
"grad_norm": 7.7040228843688965,
"learning_rate": 5.812395278600127e-07,
"loss": 2.7771,
"step": 718
},
{
"epoch": 2.3420195439739415,
"grad_norm": 6.479687213897705,
"learning_rate": 5.757840918072601e-07,
"loss": 3.2959,
"step": 719
},
{
"epoch": 2.3452768729641695,
"grad_norm": 6.083302021026611,
"learning_rate": 5.703510447116351e-07,
"loss": 3.5096,
"step": 720
},
{
"epoch": 2.3485342019543975,
"grad_norm": 6.406294822692871,
"learning_rate": 5.64940449788629e-07,
"loss": 3.426,
"step": 721
},
{
"epoch": 2.3517915309446256,
"grad_norm": 8.7530517578125,
"learning_rate": 5.595523699924979e-07,
"loss": 3.1966,
"step": 722
},
{
"epoch": 2.3550488599348536,
"grad_norm": 6.54054069519043,
"learning_rate": 5.541868680155243e-07,
"loss": 3.5258,
"step": 723
},
{
"epoch": 2.3583061889250816,
"grad_norm": 5.455934524536133,
"learning_rate": 5.48844006287289e-07,
"loss": 3.0753,
"step": 724
},
{
"epoch": 2.3615635179153096,
"grad_norm": 6.472217559814453,
"learning_rate": 5.435238469739465e-07,
"loss": 3.6595,
"step": 725
},
{
"epoch": 2.3648208469055376,
"grad_norm": 5.907280445098877,
"learning_rate": 5.382264519774988e-07,
"loss": 3.126,
"step": 726
},
{
"epoch": 2.3680781758957656,
"grad_norm": 6.544402122497559,
"learning_rate": 5.329518829350788e-07,
"loss": 3.331,
"step": 727
},
{
"epoch": 2.3713355048859937,
"grad_norm": 6.544687271118164,
"learning_rate": 5.277002012182287e-07,
"loss": 2.8258,
"step": 728
},
{
"epoch": 2.3745928338762217,
"grad_norm": 5.569584369659424,
"learning_rate": 5.224714679321898e-07,
"loss": 3.6026,
"step": 729
},
{
"epoch": 2.3778501628664497,
"grad_norm": 5.201624393463135,
"learning_rate": 5.172657439151913e-07,
"loss": 3.3477,
"step": 730
},
{
"epoch": 2.3811074918566777,
"grad_norm": 6.729457855224609,
"learning_rate": 5.120830897377371e-07,
"loss": 2.9732,
"step": 731
},
{
"epoch": 2.3843648208469057,
"grad_norm": 6.149277210235596,
"learning_rate": 5.069235657019095e-07,
"loss": 2.8766,
"step": 732
},
{
"epoch": 2.3876221498371337,
"grad_norm": 6.09014368057251,
"learning_rate": 5.017872318406594e-07,
"loss": 3.522,
"step": 733
},
{
"epoch": 2.3908794788273617,
"grad_norm": 6.65205192565918,
"learning_rate": 4.966741479171147e-07,
"loss": 3.6316,
"step": 734
},
{
"epoch": 2.3941368078175898,
"grad_norm": 6.388942718505859,
"learning_rate": 4.915843734238789e-07,
"loss": 2.9395,
"step": 735
},
{
"epoch": 2.3973941368078178,
"grad_norm": 5.279838562011719,
"learning_rate": 4.865179675823442e-07,
"loss": 3.2396,
"step": 736
},
{
"epoch": 2.400651465798046,
"grad_norm": 5.323233127593994,
"learning_rate": 4.81474989341999e-07,
"loss": 3.6813,
"step": 737
},
{
"epoch": 2.403908794788274,
"grad_norm": 8.088998794555664,
"learning_rate": 4.764554973797417e-07,
"loss": 2.8468,
"step": 738
},
{
"epoch": 2.407166123778502,
"grad_norm": 4.269301414489746,
"learning_rate": 4.71459550099202e-07,
"loss": 3.512,
"step": 739
},
{
"epoch": 2.41042345276873,
"grad_norm": 4.38401985168457,
"learning_rate": 4.664872056300557e-07,
"loss": 3.0347,
"step": 740
},
{
"epoch": 2.413680781758958,
"grad_norm": 5.353303909301758,
"learning_rate": 4.6153852182735354e-07,
"loss": 3.5764,
"step": 741
},
{
"epoch": 2.416938110749186,
"grad_norm": 4.9428863525390625,
"learning_rate": 4.5661355627084375e-07,
"loss": 3.5818,
"step": 742
},
{
"epoch": 2.420195439739414,
"grad_norm": 6.521563529968262,
"learning_rate": 4.517123662643061e-07,
"loss": 2.7058,
"step": 743
},
{
"epoch": 2.423452768729642,
"grad_norm": 8.82436466217041,
"learning_rate": 4.468350088348811e-07,
"loss": 3.3391,
"step": 744
},
{
"epoch": 2.42671009771987,
"grad_norm": 6.4387078285217285,
"learning_rate": 4.419815407324102e-07,
"loss": 3.323,
"step": 745
},
{
"epoch": 2.429967426710098,
"grad_norm": 6.8446736335754395,
"learning_rate": 4.371520184287736e-07,
"loss": 2.9756,
"step": 746
},
{
"epoch": 2.433224755700326,
"grad_norm": 7.508485794067383,
"learning_rate": 4.323464981172315e-07,
"loss": 2.9233,
"step": 747
},
{
"epoch": 2.436482084690554,
"grad_norm": 4.2981791496276855,
"learning_rate": 4.275650357117747e-07,
"loss": 3.4482,
"step": 748
},
{
"epoch": 2.4397394136807815,
"grad_norm": 4.362144947052002,
"learning_rate": 4.228076868464695e-07,
"loss": 3.5062,
"step": 749
},
{
"epoch": 2.44299674267101,
"grad_norm": 4.476921081542969,
"learning_rate": 4.180745068748135e-07,
"loss": 3.6147,
"step": 750
},
{
"epoch": 2.4462540716612375,
"grad_norm": 4.931582450866699,
"learning_rate": 4.1336555086908895e-07,
"loss": 3.4297,
"step": 751
},
{
"epoch": 2.449511400651466,
"grad_norm": 5.8569207191467285,
"learning_rate": 4.086808736197254e-07,
"loss": 3.3144,
"step": 752
},
{
"epoch": 2.4527687296416936,
"grad_norm": 7.034333229064941,
"learning_rate": 4.0402052963465913e-07,
"loss": 3.6045,
"step": 753
},
{
"epoch": 2.456026058631922,
"grad_norm": 4.413856506347656,
"learning_rate": 3.9938457313869914e-07,
"loss": 3.3704,
"step": 754
},
{
"epoch": 2.4592833876221496,
"grad_norm": 5.120318412780762,
"learning_rate": 3.9477305807289895e-07,
"loss": 3.4678,
"step": 755
},
{
"epoch": 2.462540716612378,
"grad_norm": 6.234142780303955,
"learning_rate": 3.9018603809392484e-07,
"loss": 3.3536,
"step": 756
},
{
"epoch": 2.4657980456026056,
"grad_norm": 6.449676513671875,
"learning_rate": 3.856235665734359e-07,
"loss": 2.9766,
"step": 757
},
{
"epoch": 2.469055374592834,
"grad_norm": 4.504249095916748,
"learning_rate": 3.8108569659745907e-07,
"loss": 3.2879,
"step": 758
},
{
"epoch": 2.4723127035830617,
"grad_norm": 8.269672393798828,
"learning_rate": 3.7657248096577504e-07,
"loss": 2.8618,
"step": 759
},
{
"epoch": 2.47557003257329,
"grad_norm": 5.094267845153809,
"learning_rate": 3.720839721913011e-07,
"loss": 3.4205,
"step": 760
},
{
"epoch": 2.4788273615635177,
"grad_norm": 6.253308296203613,
"learning_rate": 3.67620222499481e-07,
"loss": 3.1134,
"step": 761
},
{
"epoch": 2.482084690553746,
"grad_norm": 5.233059406280518,
"learning_rate": 3.631812838276791e-07,
"loss": 3.4115,
"step": 762
},
{
"epoch": 2.4853420195439737,
"grad_norm": 5.0791826248168945,
"learning_rate": 3.587672078245716e-07,
"loss": 3.5843,
"step": 763
},
{
"epoch": 2.488599348534202,
"grad_norm": 6.180685520172119,
"learning_rate": 3.543780458495513e-07,
"loss": 3.4273,
"step": 764
},
{
"epoch": 2.4918566775244297,
"grad_norm": 8.303179740905762,
"learning_rate": 3.5001384897212556e-07,
"loss": 3.7289,
"step": 765
},
{
"epoch": 2.495114006514658,
"grad_norm": 7.029627799987793,
"learning_rate": 3.456746679713238e-07,
"loss": 2.7946,
"step": 766
},
{
"epoch": 2.4983713355048858,
"grad_norm": 5.279702663421631,
"learning_rate": 3.41360553335108e-07,
"loss": 3.1989,
"step": 767
},
{
"epoch": 2.5016286644951142,
"grad_norm": 7.8925933837890625,
"learning_rate": 3.3707155525978116e-07,
"loss": 3.0259,
"step": 768
},
{
"epoch": 2.504885993485342,
"grad_norm": 5.007329940795898,
"learning_rate": 3.328077236494087e-07,
"loss": 3.4699,
"step": 769
},
{
"epoch": 2.5081433224755703,
"grad_norm": 5.810083866119385,
"learning_rate": 3.2856910811523256e-07,
"loss": 3.4737,
"step": 770
},
{
"epoch": 2.511400651465798,
"grad_norm": 4.268073081970215,
"learning_rate": 3.243557579750986e-07,
"loss": 3.5634,
"step": 771
},
{
"epoch": 2.5146579804560263,
"grad_norm": 6.415236473083496,
"learning_rate": 3.2016772225287844e-07,
"loss": 3.0534,
"step": 772
},
{
"epoch": 2.517915309446254,
"grad_norm": 7.551133155822754,
"learning_rate": 3.16005049677903e-07,
"loss": 2.9791,
"step": 773
},
{
"epoch": 2.5211726384364823,
"grad_norm": 5.403765678405762,
"learning_rate": 3.118677886843921e-07,
"loss": 3.4118,
"step": 774
},
{
"epoch": 2.52442996742671,
"grad_norm": 6.152907371520996,
"learning_rate": 3.077559874108937e-07,
"loss": 3.1713,
"step": 775
},
{
"epoch": 2.5276872964169383,
"grad_norm": 7.548793315887451,
"learning_rate": 3.03669693699723e-07,
"loss": 2.837,
"step": 776
},
{
"epoch": 2.530944625407166,
"grad_norm": 6.374703407287598,
"learning_rate": 2.996089550964029e-07,
"loss": 3.3303,
"step": 777
},
{
"epoch": 2.5342019543973944,
"grad_norm": 4.437128067016602,
"learning_rate": 2.9557381884911667e-07,
"loss": 3.4547,
"step": 778
},
{
"epoch": 2.537459283387622,
"grad_norm": 5.333934307098389,
"learning_rate": 2.9156433190815155e-07,
"loss": 3.5414,
"step": 779
},
{
"epoch": 2.5407166123778504,
"grad_norm": 6.211347579956055,
"learning_rate": 2.875805409253582e-07,
"loss": 3.0307,
"step": 780
},
{
"epoch": 2.543973941368078,
"grad_norm": 5.27606725692749,
"learning_rate": 2.836224922536035e-07,
"loss": 3.1969,
"step": 781
},
{
"epoch": 2.5472312703583064,
"grad_norm": 6.280512809753418,
"learning_rate": 2.796902319462344e-07,
"loss": 2.9867,
"step": 782
},
{
"epoch": 2.550488599348534,
"grad_norm": 4.462002277374268,
"learning_rate": 2.7578380575654096e-07,
"loss": 3.4119,
"step": 783
},
{
"epoch": 2.5537459283387625,
"grad_norm": 5.325382709503174,
"learning_rate": 2.71903259137222e-07,
"loss": 3.1448,
"step": 784
},
{
"epoch": 2.55700325732899,
"grad_norm": 7.347408771514893,
"learning_rate": 2.680486372398605e-07,
"loss": 3.0755,
"step": 785
},
{
"epoch": 2.5602605863192185,
"grad_norm": 5.137568950653076,
"learning_rate": 2.642199849143937e-07,
"loss": 3.6602,
"step": 786
},
{
"epoch": 2.563517915309446,
"grad_norm": 5.19792366027832,
"learning_rate": 2.604173467085949e-07,
"loss": 3.1483,
"step": 787
},
{
"epoch": 2.5667752442996745,
"grad_norm": 5.868303298950195,
"learning_rate": 2.566407668675519e-07,
"loss": 3.1571,
"step": 788
},
{
"epoch": 2.570032573289902,
"grad_norm": 7.622596263885498,
"learning_rate": 2.5289028933315587e-07,
"loss": 2.7567,
"step": 789
},
{
"epoch": 2.5732899022801305,
"grad_norm": 5.980817794799805,
"learning_rate": 2.4916595774358704e-07,
"loss": 3.3892,
"step": 790
},
{
"epoch": 2.576547231270358,
"grad_norm": 5.690640926361084,
"learning_rate": 2.4546781543280716e-07,
"loss": 2.9866,
"step": 791
},
{
"epoch": 2.5798045602605866,
"grad_norm": 7.25706148147583,
"learning_rate": 2.4179590543005835e-07,
"loss": 3.3073,
"step": 792
},
{
"epoch": 2.583061889250814,
"grad_norm": 8.114654541015625,
"learning_rate": 2.3815027045935774e-07,
"loss": 2.6904,
"step": 793
},
{
"epoch": 2.5863192182410426,
"grad_norm": 6.251055717468262,
"learning_rate": 2.345309529390047e-07,
"loss": 3.1565,
"step": 794
},
{
"epoch": 2.58957654723127,
"grad_norm": 3.57120418548584,
"learning_rate": 2.3093799498108388e-07,
"loss": 3.4674,
"step": 795
},
{
"epoch": 2.5928338762214986,
"grad_norm": 5.681574821472168,
"learning_rate": 2.2737143839097893e-07,
"loss": 3.298,
"step": 796
},
{
"epoch": 2.596091205211726,
"grad_norm": 5.519050121307373,
"learning_rate": 2.238313246668808e-07,
"loss": 2.9638,
"step": 797
},
{
"epoch": 2.5993485342019547,
"grad_norm": 5.44046688079834,
"learning_rate": 2.2031769499931105e-07,
"loss": 3.4083,
"step": 798
},
{
"epoch": 2.6026058631921822,
"grad_norm": 4.957735538482666,
"learning_rate": 2.168305902706383e-07,
"loss": 3.4802,
"step": 799
},
{
"epoch": 2.6058631921824107,
"grad_norm": 10.058741569519043,
"learning_rate": 2.13370051054603e-07,
"loss": 2.2093,
"step": 800
},
{
"epoch": 2.6091205211726383,
"grad_norm": 5.969457626342773,
"learning_rate": 2.0993611761584765e-07,
"loss": 3.3213,
"step": 801
},
{
"epoch": 2.6123778501628667,
"grad_norm": 6.594350814819336,
"learning_rate": 2.0652882990944535e-07,
"loss": 2.8912,
"step": 802
},
{
"epoch": 2.6156351791530943,
"grad_norm": 7.069642543792725,
"learning_rate": 2.031482275804375e-07,
"loss": 3.3256,
"step": 803
},
{
"epoch": 2.6188925081433228,
"grad_norm": 4.171195983886719,
"learning_rate": 1.9979434996337005e-07,
"loss": 3.3647,
"step": 804
},
{
"epoch": 2.6221498371335503,
"grad_norm": 5.1055378913879395,
"learning_rate": 1.964672360818387e-07,
"loss": 3.5159,
"step": 805
},
{
"epoch": 2.6254071661237783,
"grad_norm": 6.381522178649902,
"learning_rate": 1.9316692464803276e-07,
"loss": 3.4417,
"step": 806
},
{
"epoch": 2.6286644951140063,
"grad_norm": 5.769819259643555,
"learning_rate": 1.898934540622846e-07,
"loss": 3.4975,
"step": 807
},
{
"epoch": 2.6319218241042344,
"grad_norm": 8.080145835876465,
"learning_rate": 1.866468624126236e-07,
"loss": 2.8327,
"step": 808
},
{
"epoch": 2.6351791530944624,
"grad_norm": 5.503101825714111,
"learning_rate": 1.834271874743332e-07,
"loss": 3.3785,
"step": 809
},
{
"epoch": 2.6384364820846904,
"grad_norm": 5.769625663757324,
"learning_rate": 1.802344667095113e-07,
"loss": 3.5412,
"step": 810
},
{
"epoch": 2.6416938110749184,
"grad_norm": 7.4825005531311035,
"learning_rate": 1.7706873726663383e-07,
"loss": 2.9187,
"step": 811
},
{
"epoch": 2.6449511400651464,
"grad_norm": 5.458498001098633,
"learning_rate": 1.7393003598012243e-07,
"loss": 3.3779,
"step": 812
},
{
"epoch": 2.6482084690553744,
"grad_norm": 5.304836750030518,
"learning_rate": 1.7081839936991724e-07,
"loss": 3.5122,
"step": 813
},
{
"epoch": 2.6514657980456025,
"grad_norm": 5.748223304748535,
"learning_rate": 1.6773386364104972e-07,
"loss": 3.061,
"step": 814
},
{
"epoch": 2.6547231270358305,
"grad_norm": 4.899912357330322,
"learning_rate": 1.6467646468322358e-07,
"loss": 3.3704,
"step": 815
},
{
"epoch": 2.6579804560260585,
"grad_norm": 7.006373882293701,
"learning_rate": 1.6164623807039538e-07,
"loss": 3.1052,
"step": 816
},
{
"epoch": 2.6612377850162865,
"grad_norm": 6.537415981292725,
"learning_rate": 1.586432190603626e-07,
"loss": 3.1061,
"step": 817
},
{
"epoch": 2.6644951140065145,
"grad_norm": 6.889453411102295,
"learning_rate": 1.556674425943519e-07,
"loss": 2.9351,
"step": 818
},
{
"epoch": 2.6677524429967425,
"grad_norm": 6.120772361755371,
"learning_rate": 1.5271894329661223e-07,
"loss": 3.5839,
"step": 819
},
{
"epoch": 2.6710097719869705,
"grad_norm": 3.951160192489624,
"learning_rate": 1.4979775547401376e-07,
"loss": 3.4196,
"step": 820
},
{
"epoch": 2.6742671009771986,
"grad_norm": 6.393338203430176,
"learning_rate": 1.469039131156466e-07,
"loss": 3.1414,
"step": 821
},
{
"epoch": 2.6775244299674266,
"grad_norm": 4.9038896560668945,
"learning_rate": 1.440374498924277e-07,
"loss": 3.5466,
"step": 822
},
{
"epoch": 2.6807817589576546,
"grad_norm": 7.4714484214782715,
"learning_rate": 1.4119839915670563e-07,
"loss": 2.9009,
"step": 823
},
{
"epoch": 2.6840390879478826,
"grad_norm": 8.087553977966309,
"learning_rate": 1.3838679394187705e-07,
"loss": 2.9414,
"step": 824
},
{
"epoch": 2.6872964169381106,
"grad_norm": 5.134854793548584,
"learning_rate": 1.3560266696199864e-07,
"loss": 3.4754,
"step": 825
},
{
"epoch": 2.6905537459283386,
"grad_norm": 5.320379734039307,
"learning_rate": 1.3284605061140764e-07,
"loss": 3.5239,
"step": 826
},
{
"epoch": 2.6938110749185666,
"grad_norm": 5.643535614013672,
"learning_rate": 1.3011697696434565e-07,
"loss": 3.3429,
"step": 827
},
{
"epoch": 2.6970684039087947,
"grad_norm": 7.376565933227539,
"learning_rate": 1.274154777745837e-07,
"loss": 3.3408,
"step": 828
},
{
"epoch": 2.7003257328990227,
"grad_norm": 6.498621940612793,
"learning_rate": 1.24741584475056e-07,
"loss": 3.0349,
"step": 829
},
{
"epoch": 2.7035830618892507,
"grad_norm": 6.7927751541137695,
"learning_rate": 1.220953281774895e-07,
"loss": 3.3748,
"step": 830
},
{
"epoch": 2.7068403908794787,
"grad_norm": 5.855138301849365,
"learning_rate": 1.1947673967204643e-07,
"loss": 3.5825,
"step": 831
},
{
"epoch": 2.7100977198697067,
"grad_norm": 6.454092979431152,
"learning_rate": 1.1688584942696369e-07,
"loss": 3.4411,
"step": 832
},
{
"epoch": 2.7133550488599347,
"grad_norm": 7.363604545593262,
"learning_rate": 1.1432268758819809e-07,
"loss": 3.3443,
"step": 833
},
{
"epoch": 2.7166123778501627,
"grad_norm": 4.227643966674805,
"learning_rate": 1.1178728397907734e-07,
"loss": 3.3621,
"step": 834
},
{
"epoch": 2.7198697068403908,
"grad_norm": 4.559384346008301,
"learning_rate": 1.0927966809995084e-07,
"loss": 3.3748,
"step": 835
},
{
"epoch": 2.7231270358306188,
"grad_norm": 6.832982063293457,
"learning_rate": 1.0679986912784879e-07,
"loss": 3.1433,
"step": 836
},
{
"epoch": 2.726384364820847,
"grad_norm": 7.431112289428711,
"learning_rate": 1.043479159161398e-07,
"loss": 3.0649,
"step": 837
},
{
"epoch": 2.729641693811075,
"grad_norm": 7.875669479370117,
"learning_rate": 1.019238369941991e-07,
"loss": 2.4024,
"step": 838
},
{
"epoch": 2.732899022801303,
"grad_norm": 8.269497871398926,
"learning_rate": 9.952766056707225e-08,
"loss": 2.7252,
"step": 839
},
{
"epoch": 2.736156351791531,
"grad_norm": 7.376986503601074,
"learning_rate": 9.715941451515027e-08,
"loss": 3.1179,
"step": 840
},
{
"epoch": 2.739413680781759,
"grad_norm": 4.784783840179443,
"learning_rate": 9.481912639384388e-08,
"loss": 3.629,
"step": 841
},
{
"epoch": 2.742671009771987,
"grad_norm": 7.07935905456543,
"learning_rate": 9.25068234332624e-08,
"loss": 2.742,
"step": 842
},
{
"epoch": 2.745928338762215,
"grad_norm": 6.771409034729004,
"learning_rate": 9.02225325378986e-08,
"loss": 3.0037,
"step": 843
},
{
"epoch": 2.749185667752443,
"grad_norm": 6.435185432434082,
"learning_rate": 8.796628028631321e-08,
"loss": 2.9888,
"step": 844
},
{
"epoch": 2.752442996742671,
"grad_norm": 6.06053352355957,
"learning_rate": 8.57380929308288e-08,
"loss": 3.5483,
"step": 845
},
{
"epoch": 2.755700325732899,
"grad_norm": 6.068765640258789,
"learning_rate": 8.353799639722076e-08,
"loss": 3.0495,
"step": 846
},
{
"epoch": 2.758957654723127,
"grad_norm": 4.580106258392334,
"learning_rate": 8.136601628441876e-08,
"loss": 3.6901,
"step": 847
},
{
"epoch": 2.762214983713355,
"grad_norm": 4.707021713256836,
"learning_rate": 7.922217786420772e-08,
"loss": 3.622,
"step": 848
},
{
"epoch": 2.765472312703583,
"grad_norm": 6.918179988861084,
"learning_rate": 7.710650608093257e-08,
"loss": 3.1714,
"step": 849
},
{
"epoch": 2.768729641693811,
"grad_norm": 7.90523624420166,
"learning_rate": 7.501902555120982e-08,
"loss": 2.7148,
"step": 850
},
{
"epoch": 2.771986970684039,
"grad_norm": 5.939011096954346,
"learning_rate": 7.295976056364034e-08,
"loss": 3.4794,
"step": 851
},
{
"epoch": 2.775244299674267,
"grad_norm": 6.008779525756836,
"learning_rate": 7.092873507852676e-08,
"loss": 3.116,
"step": 852
},
{
"epoch": 2.778501628664495,
"grad_norm": 4.6474409103393555,
"learning_rate": 6.892597272759483e-08,
"loss": 3.323,
"step": 853
},
{
"epoch": 2.781758957654723,
"grad_norm": 7.1137213706970215,
"learning_rate": 6.695149681371804e-08,
"loss": 3.0716,
"step": 854
},
{
"epoch": 2.785016286644951,
"grad_norm": 8.155078887939453,
"learning_rate": 6.500533031064737e-08,
"loss": 2.8788,
"step": 855
},
{
"epoch": 2.788273615635179,
"grad_norm": 6.032046318054199,
"learning_rate": 6.30874958627431e-08,
"loss": 3.2304,
"step": 856
},
{
"epoch": 2.791530944625407,
"grad_norm": 6.509833812713623,
"learning_rate": 6.119801578471196e-08,
"loss": 3.0426,
"step": 857
},
{
"epoch": 2.794788273615635,
"grad_norm": 5.527968406677246,
"learning_rate": 5.9336912061346284e-08,
"loss": 3.3721,
"step": 858
},
{
"epoch": 2.798045602605863,
"grad_norm": 5.313396453857422,
"learning_rate": 5.750420634727083e-08,
"loss": 3.2306,
"step": 859
},
{
"epoch": 2.801302931596091,
"grad_norm": 6.423414707183838,
"learning_rate": 5.5699919966686886e-08,
"loss": 2.9182,
"step": 860
},
{
"epoch": 2.804560260586319,
"grad_norm": 5.296036243438721,
"learning_rate": 5.3924073913128874e-08,
"loss": 3.2474,
"step": 861
},
{
"epoch": 2.807817589576547,
"grad_norm": 5.368740081787109,
"learning_rate": 5.217668884921506e-08,
"loss": 3.5761,
"step": 862
},
{
"epoch": 2.811074918566775,
"grad_norm": 4.735677242279053,
"learning_rate": 5.0457785106411414e-08,
"loss": 3.2216,
"step": 863
},
{
"epoch": 2.814332247557003,
"grad_norm": 7.0894575119018555,
"learning_rate": 4.876738268479342e-08,
"loss": 2.87,
"step": 864
},
{
"epoch": 2.817589576547231,
"grad_norm": 4.9213948249816895,
"learning_rate": 4.710550125281155e-08,
"loss": 3.1997,
"step": 865
},
{
"epoch": 2.820846905537459,
"grad_norm": 6.972508430480957,
"learning_rate": 4.54721601470659e-08,
"loss": 3.2063,
"step": 866
},
{
"epoch": 2.824104234527687,
"grad_norm": 8.271316528320312,
"learning_rate": 4.3867378372078604e-08,
"loss": 2.9345,
"step": 867
},
{
"epoch": 2.8273615635179152,
"grad_norm": 4.559358596801758,
"learning_rate": 4.2291174600073425e-08,
"loss": 3.5209,
"step": 868
},
{
"epoch": 2.8306188925081432,
"grad_norm": 4.966446876525879,
"learning_rate": 4.074356717075845e-08,
"loss": 3.3913,
"step": 869
},
{
"epoch": 2.8338762214983713,
"grad_norm": 4.800805568695068,
"learning_rate": 3.9224574091113745e-08,
"loss": 3.4585,
"step": 870
},
{
"epoch": 2.8371335504885993,
"grad_norm": 5.908380508422852,
"learning_rate": 3.773421303518043e-08,
"loss": 3.2263,
"step": 871
},
{
"epoch": 2.8403908794788273,
"grad_norm": 6.047445297241211,
"learning_rate": 3.627250134385474e-08,
"loss": 3.0944,
"step": 872
},
{
"epoch": 2.8436482084690553,
"grad_norm": 6.993686199188232,
"learning_rate": 3.4839456024688686e-08,
"loss": 3.0605,
"step": 873
},
{
"epoch": 2.8469055374592833,
"grad_norm": 5.375525951385498,
"learning_rate": 3.343509375168863e-08,
"loss": 3.5609,
"step": 874
},
{
"epoch": 2.8501628664495113,
"grad_norm": 5.510409355163574,
"learning_rate": 3.205943086512508e-08,
"loss": 3.0868,
"step": 875
},
{
"epoch": 2.8534201954397393,
"grad_norm": 7.239731788635254,
"learning_rate": 3.0712483371339306e-08,
"loss": 2.7588,
"step": 876
},
{
"epoch": 2.8566775244299674,
"grad_norm": 8.21719741821289,
"learning_rate": 2.939426694255898e-08,
"loss": 2.9941,
"step": 877
},
{
"epoch": 2.8599348534201954,
"grad_norm": 7.529686450958252,
"learning_rate": 2.8104796916715304e-08,
"loss": 3.0337,
"step": 878
},
{
"epoch": 2.8631921824104234,
"grad_norm": 4.830805778503418,
"learning_rate": 2.6844088297264258e-08,
"loss": 3.1156,
"step": 879
},
{
"epoch": 2.8664495114006514,
"grad_norm": 4.539928913116455,
"learning_rate": 2.5612155753013125e-08,
"loss": 3.342,
"step": 880
},
{
"epoch": 2.8697068403908794,
"grad_norm": 7.2940874099731445,
"learning_rate": 2.4409013617947842e-08,
"loss": 2.9825,
"step": 881
},
{
"epoch": 2.8729641693811074,
"grad_norm": 5.533882141113281,
"learning_rate": 2.3234675891068147e-08,
"loss": 3.2988,
"step": 882
},
{
"epoch": 2.8762214983713354,
"grad_norm": 5.131170749664307,
"learning_rate": 2.2089156236224096e-08,
"loss": 3.7499,
"step": 883
},
{
"epoch": 2.8794788273615635,
"grad_norm": 6.615204811096191,
"learning_rate": 2.097246798195618e-08,
"loss": 3.1587,
"step": 884
},
{
"epoch": 2.8827361563517915,
"grad_norm": 5.77875280380249,
"learning_rate": 1.988462412134129e-08,
"loss": 3.1013,
"step": 885
},
{
"epoch": 2.8859934853420195,
"grad_norm": 7.753470420837402,
"learning_rate": 1.8825637311841727e-08,
"loss": 2.9051,
"step": 886
},
{
"epoch": 2.8892508143322475,
"grad_norm": 6.783411026000977,
"learning_rate": 1.7795519875157262e-08,
"loss": 3.0544,
"step": 887
},
{
"epoch": 2.8925081433224755,
"grad_norm": 6.783967971801758,
"learning_rate": 1.6794283797080813e-08,
"loss": 3.123,
"step": 888
},
{
"epoch": 2.8957654723127035,
"grad_norm": 5.180431842803955,
"learning_rate": 1.5821940727361874e-08,
"loss": 3.2508,
"step": 889
},
{
"epoch": 2.8990228013029316,
"grad_norm": 4.254175186157227,
"learning_rate": 1.487850197956775e-08,
"loss": 3.6052,
"step": 890
},
{
"epoch": 2.9022801302931596,
"grad_norm": 6.317099571228027,
"learning_rate": 1.3963978530954491e-08,
"loss": 3.1259,
"step": 891
},
{
"epoch": 2.9055374592833876,
"grad_norm": 6.046303749084473,
"learning_rate": 1.3078381022336717e-08,
"loss": 3.0762,
"step": 892
},
{
"epoch": 2.9087947882736156,
"grad_norm": 4.559985160827637,
"learning_rate": 1.2221719757966877e-08,
"loss": 3.5281,
"step": 893
},
{
"epoch": 2.9120521172638436,
"grad_norm": 6.569419860839844,
"learning_rate": 1.139400470541202e-08,
"loss": 3.0477,
"step": 894
},
{
"epoch": 2.9153094462540716,
"grad_norm": 6.211750507354736,
"learning_rate": 1.0595245495439999e-08,
"loss": 3.3027,
"step": 895
},
{
"epoch": 2.9185667752442996,
"grad_norm": 7.381767749786377,
"learning_rate": 9.825451421907328e-09,
"loss": 3.2725,
"step": 896
},
{
"epoch": 2.9218241042345277,
"grad_norm": 7.202872276306152,
"learning_rate": 9.084631441649837e-09,
"loss": 3.1016,
"step": 897
},
{
"epoch": 2.9250814332247557,
"grad_norm": 4.793407440185547,
"learning_rate": 8.372794174379418e-09,
"loss": 3.5427,
"step": 898
},
{
"epoch": 2.9283387622149837,
"grad_norm": 6.582172870635986,
"learning_rate": 7.689947902583816e-09,
"loss": 3.0381,
"step": 899
},
{
"epoch": 2.9315960912052117,
"grad_norm": 5.219425201416016,
"learning_rate": 7.03610057142895e-09,
"loss": 3.5185,
"step": 900
},
{
"epoch": 2.9348534201954397,
"grad_norm": 5.58571195602417,
"learning_rate": 6.411259788668967e-09,
"loss": 3.4211,
"step": 901
},
{
"epoch": 2.9381107491856677,
"grad_norm": 4.745430946350098,
"learning_rate": 5.815432824554379e-09,
"loss": 3.595,
"step": 902
},
{
"epoch": 2.9413680781758957,
"grad_norm": 6.919724941253662,
"learning_rate": 5.2486266117510176e-09,
"loss": 3.5303,
"step": 903
},
{
"epoch": 2.9446254071661238,
"grad_norm": 6.697390556335449,
"learning_rate": 4.710847745256209e-09,
"loss": 3.3494,
"step": 904
},
{
"epoch": 2.9478827361563518,
"grad_norm": 4.164398670196533,
"learning_rate": 4.202102482324666e-09,
"loss": 3.5631,
"step": 905
},
{
"epoch": 2.95114006514658,
"grad_norm": 9.10593032836914,
"learning_rate": 3.7223967423935524e-09,
"loss": 3.1123,
"step": 906
},
{
"epoch": 2.954397394136808,
"grad_norm": 4.9842634201049805,
"learning_rate": 3.271736107015033e-09,
"loss": 3.534,
"step": 907
},
{
"epoch": 2.957654723127036,
"grad_norm": 5.11368989944458,
"learning_rate": 2.850125819790772e-09,
"loss": 3.0846,
"step": 908
},
{
"epoch": 2.960912052117264,
"grad_norm": 6.25923490524292,
"learning_rate": 2.45757078631087e-09,
"loss": 3.0751,
"step": 909
},
{
"epoch": 2.964169381107492,
"grad_norm": 5.586625576019287,
"learning_rate": 2.0940755740969654e-09,
"loss": 3.3112,
"step": 910
},
{
"epoch": 2.96742671009772,
"grad_norm": 4.510610580444336,
"learning_rate": 1.7596444125489442e-09,
"loss": 3.4386,
"step": 911
},
{
"epoch": 2.970684039087948,
"grad_norm": 6.0876617431640625,
"learning_rate": 1.4542811928963673e-09,
"loss": 3.3032,
"step": 912
},
{
"epoch": 2.973941368078176,
"grad_norm": 7.20369815826416,
"learning_rate": 1.1779894681515635e-09,
"loss": 3.3911,
"step": 913
},
{
"epoch": 2.977198697068404,
"grad_norm": 5.171374797821045,
"learning_rate": 9.307724530702166e-10,
"loss": 3.2326,
"step": 914
},
{
"epoch": 2.980456026058632,
"grad_norm": 5.716080665588379,
"learning_rate": 7.12633024113063e-10,
"loss": 3.4967,
"step": 915
},
{
"epoch": 2.98371335504886,
"grad_norm": 7.97988748550415,
"learning_rate": 5.235737194120294e-10,
"loss": 2.6134,
"step": 916
},
{
"epoch": 2.986970684039088,
"grad_norm": 4.841028213500977,
"learning_rate": 3.6359673874164505e-10,
"loss": 3.2698,
"step": 917
},
{
"epoch": 2.990228013029316,
"grad_norm": 6.722901821136475,
"learning_rate": 2.3270394349267367e-10,
"loss": 2.7767,
"step": 918
},
{
"epoch": 2.993485342019544,
"grad_norm": 6.788003921508789,
"learning_rate": 1.3089685665046426e-10,
"loss": 2.9311,
"step": 919
},
{
"epoch": 2.996742671009772,
"grad_norm": 5.636250019073486,
"learning_rate": 5.817666277802003e-11,
"loss": 3.2762,
"step": 920
},
{
"epoch": 3.0,
"grad_norm": 4.332888126373291,
"learning_rate": 1.4544208001288085e-11,
"loss": 3.3124,
"step": 921
}
],
"logging_steps": 1,
"max_steps": 921,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8679148719759360.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}