Llama-3.2-1B-Instruct_bma_v00.01 / trainer_state.json
Commit 6be1479 ("Model save", verified)
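Each entry in log_history below records epoch, grad_norm, learning_rate, loss, num_tokens, and step for one logging step of the run. A minimal sketch for inspecting it (not part of the original file), assuming the JSON has been downloaded locally as trainer_state.json:

import json

import matplotlib.pyplot as plt

# Load the Trainer state; field names (log_history, step, loss, global_step,
# epoch) follow the Hugging Face Trainer schema visible in this file.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any, differ).
history = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in history]
losses = [e["loss"] for e in history]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"global_step={state['global_step']}, epoch={state['epoch']}")
plt.show()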
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 882,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011337868480725624,
"grad_norm": 74.33029174804688,
"learning_rate": 0.0,
"loss": 2.7802,
"num_tokens": 1145.0,
"step": 1
},
{
"epoch": 0.0022675736961451248,
"grad_norm": 90.74027252197266,
"learning_rate": 7.407407407407407e-07,
"loss": 3.001,
"num_tokens": 2223.0,
"step": 2
},
{
"epoch": 0.003401360544217687,
"grad_norm": 84.00408172607422,
"learning_rate": 1.4814814814814815e-06,
"loss": 3.0741,
"num_tokens": 3315.0,
"step": 3
},
{
"epoch": 0.0045351473922902496,
"grad_norm": 74.34835052490234,
"learning_rate": 2.222222222222222e-06,
"loss": 2.9966,
"num_tokens": 4429.0,
"step": 4
},
{
"epoch": 0.005668934240362812,
"grad_norm": 74.42596435546875,
"learning_rate": 2.962962962962963e-06,
"loss": 2.8384,
"num_tokens": 5594.0,
"step": 5
},
{
"epoch": 0.006802721088435374,
"grad_norm": 86.56283569335938,
"learning_rate": 3.7037037037037037e-06,
"loss": 2.9764,
"num_tokens": 6751.0,
"step": 6
},
{
"epoch": 0.007936507936507936,
"grad_norm": 83.2591323852539,
"learning_rate": 4.444444444444444e-06,
"loss": 3.067,
"num_tokens": 7768.0,
"step": 7
},
{
"epoch": 0.009070294784580499,
"grad_norm": 79.88240814208984,
"learning_rate": 5.185185185185185e-06,
"loss": 2.7835,
"num_tokens": 8885.0,
"step": 8
},
{
"epoch": 0.01020408163265306,
"grad_norm": 62.56035232543945,
"learning_rate": 5.925925925925926e-06,
"loss": 2.8879,
"num_tokens": 10034.0,
"step": 9
},
{
"epoch": 0.011337868480725623,
"grad_norm": 65.7270278930664,
"learning_rate": 6.666666666666667e-06,
"loss": 2.7762,
"num_tokens": 11110.0,
"step": 10
},
{
"epoch": 0.012471655328798186,
"grad_norm": 66.46719360351562,
"learning_rate": 7.4074074074074075e-06,
"loss": 2.9042,
"num_tokens": 12163.0,
"step": 11
},
{
"epoch": 0.013605442176870748,
"grad_norm": 54.481658935546875,
"learning_rate": 8.148148148148148e-06,
"loss": 2.8072,
"num_tokens": 13214.0,
"step": 12
},
{
"epoch": 0.01473922902494331,
"grad_norm": 47.543304443359375,
"learning_rate": 8.888888888888888e-06,
"loss": 2.5354,
"num_tokens": 14332.0,
"step": 13
},
{
"epoch": 0.015873015873015872,
"grad_norm": 43.516944885253906,
"learning_rate": 9.62962962962963e-06,
"loss": 2.6121,
"num_tokens": 15486.0,
"step": 14
},
{
"epoch": 0.017006802721088437,
"grad_norm": 51.404109954833984,
"learning_rate": 1.037037037037037e-05,
"loss": 2.4756,
"num_tokens": 16505.0,
"step": 15
},
{
"epoch": 0.018140589569160998,
"grad_norm": 49.24734878540039,
"learning_rate": 1.1111111111111113e-05,
"loss": 2.537,
"num_tokens": 17529.0,
"step": 16
},
{
"epoch": 0.01927437641723356,
"grad_norm": 44.85908508300781,
"learning_rate": 1.1851851851851852e-05,
"loss": 2.5237,
"num_tokens": 18625.0,
"step": 17
},
{
"epoch": 0.02040816326530612,
"grad_norm": 42.729736328125,
"learning_rate": 1.2592592592592593e-05,
"loss": 2.3821,
"num_tokens": 19757.0,
"step": 18
},
{
"epoch": 0.021541950113378686,
"grad_norm": 42.92080307006836,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.3942,
"num_tokens": 20818.0,
"step": 19
},
{
"epoch": 0.022675736961451247,
"grad_norm": 38.561309814453125,
"learning_rate": 1.4074074074074075e-05,
"loss": 2.4325,
"num_tokens": 21939.0,
"step": 20
},
{
"epoch": 0.023809523809523808,
"grad_norm": 36.45574188232422,
"learning_rate": 1.4814814814814815e-05,
"loss": 2.1893,
"num_tokens": 23014.0,
"step": 21
},
{
"epoch": 0.024943310657596373,
"grad_norm": 31.59442901611328,
"learning_rate": 1.555555555555556e-05,
"loss": 2.2218,
"num_tokens": 24169.0,
"step": 22
},
{
"epoch": 0.026077097505668934,
"grad_norm": 32.13104248046875,
"learning_rate": 1.6296296296296297e-05,
"loss": 2.2421,
"num_tokens": 25249.0,
"step": 23
},
{
"epoch": 0.027210884353741496,
"grad_norm": 30.995559692382812,
"learning_rate": 1.7037037037037038e-05,
"loss": 2.0935,
"num_tokens": 26313.0,
"step": 24
},
{
"epoch": 0.02834467120181406,
"grad_norm": 33.38172912597656,
"learning_rate": 1.7777777777777777e-05,
"loss": 2.1663,
"num_tokens": 27351.0,
"step": 25
},
{
"epoch": 0.02947845804988662,
"grad_norm": 30.938949584960938,
"learning_rate": 1.851851851851852e-05,
"loss": 1.9916,
"num_tokens": 28367.0,
"step": 26
},
{
"epoch": 0.030612244897959183,
"grad_norm": 31.419017791748047,
"learning_rate": 1.925925925925926e-05,
"loss": 1.9991,
"num_tokens": 29437.0,
"step": 27
},
{
"epoch": 0.031746031746031744,
"grad_norm": 28.316110610961914,
"learning_rate": 2e-05,
"loss": 1.9932,
"num_tokens": 30576.0,
"step": 28
},
{
"epoch": 0.032879818594104306,
"grad_norm": 28.506546020507812,
"learning_rate": 1.9976608187134504e-05,
"loss": 1.9384,
"num_tokens": 31772.0,
"step": 29
},
{
"epoch": 0.034013605442176874,
"grad_norm": 30.74509048461914,
"learning_rate": 1.9953216374269007e-05,
"loss": 1.897,
"num_tokens": 32868.0,
"step": 30
},
{
"epoch": 0.035147392290249435,
"grad_norm": 29.190940856933594,
"learning_rate": 1.992982456140351e-05,
"loss": 2.0675,
"num_tokens": 34025.0,
"step": 31
},
{
"epoch": 0.036281179138321996,
"grad_norm": 27.97025489807129,
"learning_rate": 1.9906432748538015e-05,
"loss": 1.9301,
"num_tokens": 35175.0,
"step": 32
},
{
"epoch": 0.03741496598639456,
"grad_norm": 26.162158966064453,
"learning_rate": 1.9883040935672515e-05,
"loss": 1.8323,
"num_tokens": 36242.0,
"step": 33
},
{
"epoch": 0.03854875283446712,
"grad_norm": 23.444116592407227,
"learning_rate": 1.9859649122807017e-05,
"loss": 1.9189,
"num_tokens": 37352.0,
"step": 34
},
{
"epoch": 0.03968253968253968,
"grad_norm": 24.441686630249023,
"learning_rate": 1.9836257309941523e-05,
"loss": 1.8367,
"num_tokens": 38425.0,
"step": 35
},
{
"epoch": 0.04081632653061224,
"grad_norm": 23.87506675720215,
"learning_rate": 1.9812865497076026e-05,
"loss": 1.7329,
"num_tokens": 39617.0,
"step": 36
},
{
"epoch": 0.04195011337868481,
"grad_norm": 24.666210174560547,
"learning_rate": 1.9789473684210528e-05,
"loss": 1.8583,
"num_tokens": 40651.0,
"step": 37
},
{
"epoch": 0.04308390022675737,
"grad_norm": 21.076745986938477,
"learning_rate": 1.976608187134503e-05,
"loss": 1.7904,
"num_tokens": 41788.0,
"step": 38
},
{
"epoch": 0.04421768707482993,
"grad_norm": 18.543474197387695,
"learning_rate": 1.9742690058479533e-05,
"loss": 1.807,
"num_tokens": 42935.0,
"step": 39
},
{
"epoch": 0.045351473922902494,
"grad_norm": 19.128664016723633,
"learning_rate": 1.9719298245614036e-05,
"loss": 1.6529,
"num_tokens": 44074.0,
"step": 40
},
{
"epoch": 0.046485260770975055,
"grad_norm": 19.04387855529785,
"learning_rate": 1.969590643274854e-05,
"loss": 1.9339,
"num_tokens": 45240.0,
"step": 41
},
{
"epoch": 0.047619047619047616,
"grad_norm": 18.83653450012207,
"learning_rate": 1.9672514619883044e-05,
"loss": 1.7952,
"num_tokens": 46381.0,
"step": 42
},
{
"epoch": 0.048752834467120185,
"grad_norm": 20.276226043701172,
"learning_rate": 1.9649122807017544e-05,
"loss": 1.7274,
"num_tokens": 47452.0,
"step": 43
},
{
"epoch": 0.049886621315192746,
"grad_norm": 18.112844467163086,
"learning_rate": 1.962573099415205e-05,
"loss": 1.7377,
"num_tokens": 48609.0,
"step": 44
},
{
"epoch": 0.05102040816326531,
"grad_norm": 18.350265502929688,
"learning_rate": 1.9602339181286552e-05,
"loss": 1.4819,
"num_tokens": 49744.0,
"step": 45
},
{
"epoch": 0.05215419501133787,
"grad_norm": 19.062835693359375,
"learning_rate": 1.9578947368421055e-05,
"loss": 1.8832,
"num_tokens": 50850.0,
"step": 46
},
{
"epoch": 0.05328798185941043,
"grad_norm": 21.639110565185547,
"learning_rate": 1.9555555555555557e-05,
"loss": 1.7446,
"num_tokens": 51963.0,
"step": 47
},
{
"epoch": 0.05442176870748299,
"grad_norm": 18.648723602294922,
"learning_rate": 1.953216374269006e-05,
"loss": 1.6512,
"num_tokens": 53070.0,
"step": 48
},
{
"epoch": 0.05555555555555555,
"grad_norm": 21.16545867919922,
"learning_rate": 1.9508771929824562e-05,
"loss": 1.7065,
"num_tokens": 54154.0,
"step": 49
},
{
"epoch": 0.05668934240362812,
"grad_norm": 18.10296630859375,
"learning_rate": 1.9485380116959065e-05,
"loss": 1.5725,
"num_tokens": 55278.0,
"step": 50
},
{
"epoch": 0.05782312925170068,
"grad_norm": 18.74391746520996,
"learning_rate": 1.9461988304093568e-05,
"loss": 1.8522,
"num_tokens": 56381.0,
"step": 51
},
{
"epoch": 0.05895691609977324,
"grad_norm": 20.406055450439453,
"learning_rate": 1.9438596491228074e-05,
"loss": 1.6082,
"num_tokens": 57413.0,
"step": 52
},
{
"epoch": 0.060090702947845805,
"grad_norm": 19.050594329833984,
"learning_rate": 1.9415204678362573e-05,
"loss": 1.7722,
"num_tokens": 58555.0,
"step": 53
},
{
"epoch": 0.061224489795918366,
"grad_norm": 19.197467803955078,
"learning_rate": 1.939181286549708e-05,
"loss": 1.6229,
"num_tokens": 59581.0,
"step": 54
},
{
"epoch": 0.06235827664399093,
"grad_norm": 20.112733840942383,
"learning_rate": 1.936842105263158e-05,
"loss": 1.71,
"num_tokens": 60776.0,
"step": 55
},
{
"epoch": 0.06349206349206349,
"grad_norm": 19.704727172851562,
"learning_rate": 1.9345029239766084e-05,
"loss": 1.6764,
"num_tokens": 61957.0,
"step": 56
},
{
"epoch": 0.06462585034013606,
"grad_norm": 19.901477813720703,
"learning_rate": 1.9321637426900586e-05,
"loss": 1.6584,
"num_tokens": 63043.0,
"step": 57
},
{
"epoch": 0.06575963718820861,
"grad_norm": 19.40325927734375,
"learning_rate": 1.929824561403509e-05,
"loss": 1.7776,
"num_tokens": 64129.0,
"step": 58
},
{
"epoch": 0.06689342403628118,
"grad_norm": 19.251150131225586,
"learning_rate": 1.927485380116959e-05,
"loss": 1.5785,
"num_tokens": 65256.0,
"step": 59
},
{
"epoch": 0.06802721088435375,
"grad_norm": 18.591737747192383,
"learning_rate": 1.9251461988304094e-05,
"loss": 1.6411,
"num_tokens": 66376.0,
"step": 60
},
{
"epoch": 0.0691609977324263,
"grad_norm": 18.51482582092285,
"learning_rate": 1.9228070175438597e-05,
"loss": 1.6408,
"num_tokens": 67572.0,
"step": 61
},
{
"epoch": 0.07029478458049887,
"grad_norm": 20.782194137573242,
"learning_rate": 1.9204678362573103e-05,
"loss": 1.5661,
"num_tokens": 68652.0,
"step": 62
},
{
"epoch": 0.07142857142857142,
"grad_norm": 20.952245712280273,
"learning_rate": 1.9181286549707602e-05,
"loss": 1.6445,
"num_tokens": 69693.0,
"step": 63
},
{
"epoch": 0.07256235827664399,
"grad_norm": 19.55459213256836,
"learning_rate": 1.9157894736842108e-05,
"loss": 1.6929,
"num_tokens": 70855.0,
"step": 64
},
{
"epoch": 0.07369614512471655,
"grad_norm": 19.258163452148438,
"learning_rate": 1.913450292397661e-05,
"loss": 1.5356,
"num_tokens": 71899.0,
"step": 65
},
{
"epoch": 0.07482993197278912,
"grad_norm": 19.81553840637207,
"learning_rate": 1.9111111111111113e-05,
"loss": 1.5472,
"num_tokens": 73092.0,
"step": 66
},
{
"epoch": 0.07596371882086168,
"grad_norm": 18.654260635375977,
"learning_rate": 1.9087719298245616e-05,
"loss": 1.6884,
"num_tokens": 74232.0,
"step": 67
},
{
"epoch": 0.07709750566893424,
"grad_norm": 20.07318878173828,
"learning_rate": 1.9064327485380118e-05,
"loss": 1.7197,
"num_tokens": 75347.0,
"step": 68
},
{
"epoch": 0.0782312925170068,
"grad_norm": 20.34659767150879,
"learning_rate": 1.904093567251462e-05,
"loss": 1.736,
"num_tokens": 76455.0,
"step": 69
},
{
"epoch": 0.07936507936507936,
"grad_norm": 18.60862159729004,
"learning_rate": 1.9017543859649123e-05,
"loss": 1.7258,
"num_tokens": 77648.0,
"step": 70
},
{
"epoch": 0.08049886621315193,
"grad_norm": 19.377056121826172,
"learning_rate": 1.8994152046783626e-05,
"loss": 1.7971,
"num_tokens": 78688.0,
"step": 71
},
{
"epoch": 0.08163265306122448,
"grad_norm": 18.83735466003418,
"learning_rate": 1.8970760233918132e-05,
"loss": 1.4687,
"num_tokens": 79760.0,
"step": 72
},
{
"epoch": 0.08276643990929705,
"grad_norm": 19.88797950744629,
"learning_rate": 1.894736842105263e-05,
"loss": 1.6935,
"num_tokens": 80889.0,
"step": 73
},
{
"epoch": 0.08390022675736962,
"grad_norm": 20.648218154907227,
"learning_rate": 1.8923976608187137e-05,
"loss": 1.5982,
"num_tokens": 81898.0,
"step": 74
},
{
"epoch": 0.08503401360544217,
"grad_norm": 20.697324752807617,
"learning_rate": 1.890058479532164e-05,
"loss": 1.7866,
"num_tokens": 83019.0,
"step": 75
},
{
"epoch": 0.08616780045351474,
"grad_norm": 21.66934585571289,
"learning_rate": 1.8877192982456142e-05,
"loss": 1.7473,
"num_tokens": 84034.0,
"step": 76
},
{
"epoch": 0.0873015873015873,
"grad_norm": 18.273752212524414,
"learning_rate": 1.8853801169590645e-05,
"loss": 1.6144,
"num_tokens": 85221.0,
"step": 77
},
{
"epoch": 0.08843537414965986,
"grad_norm": 21.30136489868164,
"learning_rate": 1.8830409356725147e-05,
"loss": 1.6525,
"num_tokens": 86291.0,
"step": 78
},
{
"epoch": 0.08956916099773243,
"grad_norm": 21.55545425415039,
"learning_rate": 1.880701754385965e-05,
"loss": 1.7933,
"num_tokens": 87273.0,
"step": 79
},
{
"epoch": 0.09070294784580499,
"grad_norm": 20.109617233276367,
"learning_rate": 1.8783625730994152e-05,
"loss": 1.6552,
"num_tokens": 88353.0,
"step": 80
},
{
"epoch": 0.09183673469387756,
"grad_norm": 20.001720428466797,
"learning_rate": 1.8760233918128655e-05,
"loss": 1.6845,
"num_tokens": 89389.0,
"step": 81
},
{
"epoch": 0.09297052154195011,
"grad_norm": 20.805374145507812,
"learning_rate": 1.873684210526316e-05,
"loss": 1.597,
"num_tokens": 90434.0,
"step": 82
},
{
"epoch": 0.09410430839002268,
"grad_norm": 18.642532348632812,
"learning_rate": 1.871345029239766e-05,
"loss": 1.5591,
"num_tokens": 91566.0,
"step": 83
},
{
"epoch": 0.09523809523809523,
"grad_norm": 20.010791778564453,
"learning_rate": 1.8690058479532166e-05,
"loss": 1.6093,
"num_tokens": 92699.0,
"step": 84
},
{
"epoch": 0.0963718820861678,
"grad_norm": 21.23207664489746,
"learning_rate": 1.866666666666667e-05,
"loss": 1.7362,
"num_tokens": 93770.0,
"step": 85
},
{
"epoch": 0.09750566893424037,
"grad_norm": 22.148468017578125,
"learning_rate": 1.864327485380117e-05,
"loss": 1.6206,
"num_tokens": 94927.0,
"step": 86
},
{
"epoch": 0.09863945578231292,
"grad_norm": 19.68358039855957,
"learning_rate": 1.8619883040935674e-05,
"loss": 1.7617,
"num_tokens": 96064.0,
"step": 87
},
{
"epoch": 0.09977324263038549,
"grad_norm": 20.17274284362793,
"learning_rate": 1.8596491228070176e-05,
"loss": 1.6107,
"num_tokens": 97169.0,
"step": 88
},
{
"epoch": 0.10090702947845805,
"grad_norm": 19.35859489440918,
"learning_rate": 1.857309941520468e-05,
"loss": 1.5928,
"num_tokens": 98366.0,
"step": 89
},
{
"epoch": 0.10204081632653061,
"grad_norm": 21.101551055908203,
"learning_rate": 1.854970760233918e-05,
"loss": 1.6669,
"num_tokens": 99457.0,
"step": 90
},
{
"epoch": 0.10317460317460317,
"grad_norm": 21.70389175415039,
"learning_rate": 1.8526315789473684e-05,
"loss": 1.7066,
"num_tokens": 100599.0,
"step": 91
},
{
"epoch": 0.10430839002267574,
"grad_norm": 19.57090950012207,
"learning_rate": 1.850292397660819e-05,
"loss": 1.4318,
"num_tokens": 101813.0,
"step": 92
},
{
"epoch": 0.1054421768707483,
"grad_norm": 20.19015884399414,
"learning_rate": 1.847953216374269e-05,
"loss": 1.5283,
"num_tokens": 102920.0,
"step": 93
},
{
"epoch": 0.10657596371882086,
"grad_norm": 21.393760681152344,
"learning_rate": 1.8456140350877195e-05,
"loss": 1.574,
"num_tokens": 104083.0,
"step": 94
},
{
"epoch": 0.10770975056689343,
"grad_norm": 19.35205078125,
"learning_rate": 1.8432748538011698e-05,
"loss": 1.4985,
"num_tokens": 105252.0,
"step": 95
},
{
"epoch": 0.10884353741496598,
"grad_norm": 22.61254119873047,
"learning_rate": 1.84093567251462e-05,
"loss": 1.5807,
"num_tokens": 106354.0,
"step": 96
},
{
"epoch": 0.10997732426303855,
"grad_norm": 21.425067901611328,
"learning_rate": 1.8385964912280703e-05,
"loss": 1.6194,
"num_tokens": 107532.0,
"step": 97
},
{
"epoch": 0.1111111111111111,
"grad_norm": 20.971893310546875,
"learning_rate": 1.8362573099415205e-05,
"loss": 1.6884,
"num_tokens": 108667.0,
"step": 98
},
{
"epoch": 0.11224489795918367,
"grad_norm": 22.08867073059082,
"learning_rate": 1.833918128654971e-05,
"loss": 1.6521,
"num_tokens": 109714.0,
"step": 99
},
{
"epoch": 0.11337868480725624,
"grad_norm": 19.163150787353516,
"learning_rate": 1.831578947368421e-05,
"loss": 1.7947,
"num_tokens": 110894.0,
"step": 100
},
{
"epoch": 0.1145124716553288,
"grad_norm": 20.42814826965332,
"learning_rate": 1.8292397660818713e-05,
"loss": 1.6677,
"num_tokens": 111990.0,
"step": 101
},
{
"epoch": 0.11564625850340136,
"grad_norm": 20.525667190551758,
"learning_rate": 1.826900584795322e-05,
"loss": 1.5593,
"num_tokens": 113085.0,
"step": 102
},
{
"epoch": 0.11678004535147392,
"grad_norm": 21.172090530395508,
"learning_rate": 1.824561403508772e-05,
"loss": 1.6892,
"num_tokens": 114209.0,
"step": 103
},
{
"epoch": 0.11791383219954649,
"grad_norm": 19.004167556762695,
"learning_rate": 1.8222222222222224e-05,
"loss": 1.6172,
"num_tokens": 115268.0,
"step": 104
},
{
"epoch": 0.11904761904761904,
"grad_norm": 21.883514404296875,
"learning_rate": 1.8198830409356727e-05,
"loss": 1.5572,
"num_tokens": 116382.0,
"step": 105
},
{
"epoch": 0.12018140589569161,
"grad_norm": 20.1040096282959,
"learning_rate": 1.817543859649123e-05,
"loss": 1.6517,
"num_tokens": 117504.0,
"step": 106
},
{
"epoch": 0.12131519274376418,
"grad_norm": 19.666257858276367,
"learning_rate": 1.8152046783625732e-05,
"loss": 1.6365,
"num_tokens": 118698.0,
"step": 107
},
{
"epoch": 0.12244897959183673,
"grad_norm": 21.657516479492188,
"learning_rate": 1.8128654970760235e-05,
"loss": 1.7495,
"num_tokens": 119791.0,
"step": 108
},
{
"epoch": 0.1235827664399093,
"grad_norm": 20.718935012817383,
"learning_rate": 1.810526315789474e-05,
"loss": 1.7333,
"num_tokens": 120996.0,
"step": 109
},
{
"epoch": 0.12471655328798185,
"grad_norm": 20.30824851989746,
"learning_rate": 1.808187134502924e-05,
"loss": 1.5361,
"num_tokens": 122066.0,
"step": 110
},
{
"epoch": 0.12585034013605442,
"grad_norm": 21.735916137695312,
"learning_rate": 1.8058479532163746e-05,
"loss": 1.5749,
"num_tokens": 123250.0,
"step": 111
},
{
"epoch": 0.12698412698412698,
"grad_norm": 22.563383102416992,
"learning_rate": 1.8035087719298248e-05,
"loss": 1.562,
"num_tokens": 124337.0,
"step": 112
},
{
"epoch": 0.12811791383219956,
"grad_norm": 22.8643856048584,
"learning_rate": 1.8011695906432747e-05,
"loss": 1.5494,
"num_tokens": 125423.0,
"step": 113
},
{
"epoch": 0.1292517006802721,
"grad_norm": 22.234722137451172,
"learning_rate": 1.7988304093567253e-05,
"loss": 1.7195,
"num_tokens": 126544.0,
"step": 114
},
{
"epoch": 0.13038548752834467,
"grad_norm": 21.315336227416992,
"learning_rate": 1.7964912280701756e-05,
"loss": 1.7304,
"num_tokens": 127718.0,
"step": 115
},
{
"epoch": 0.13151927437641722,
"grad_norm": 22.4656925201416,
"learning_rate": 1.794152046783626e-05,
"loss": 1.6433,
"num_tokens": 128897.0,
"step": 116
},
{
"epoch": 0.1326530612244898,
"grad_norm": 22.329336166381836,
"learning_rate": 1.791812865497076e-05,
"loss": 1.6484,
"num_tokens": 130034.0,
"step": 117
},
{
"epoch": 0.13378684807256236,
"grad_norm": 20.504932403564453,
"learning_rate": 1.7894736842105264e-05,
"loss": 1.5951,
"num_tokens": 131196.0,
"step": 118
},
{
"epoch": 0.1349206349206349,
"grad_norm": 20.69579315185547,
"learning_rate": 1.787134502923977e-05,
"loss": 1.589,
"num_tokens": 132362.0,
"step": 119
},
{
"epoch": 0.1360544217687075,
"grad_norm": 21.02338218688965,
"learning_rate": 1.784795321637427e-05,
"loss": 1.6319,
"num_tokens": 133454.0,
"step": 120
},
{
"epoch": 0.13718820861678005,
"grad_norm": 21.11552619934082,
"learning_rate": 1.7824561403508775e-05,
"loss": 1.6886,
"num_tokens": 134552.0,
"step": 121
},
{
"epoch": 0.1383219954648526,
"grad_norm": 19.719133377075195,
"learning_rate": 1.7801169590643277e-05,
"loss": 1.6521,
"num_tokens": 135670.0,
"step": 122
},
{
"epoch": 0.13945578231292516,
"grad_norm": 25.066865921020508,
"learning_rate": 1.7777777777777777e-05,
"loss": 1.743,
"num_tokens": 136682.0,
"step": 123
},
{
"epoch": 0.14058956916099774,
"grad_norm": 21.543785095214844,
"learning_rate": 1.7754385964912283e-05,
"loss": 1.6218,
"num_tokens": 137759.0,
"step": 124
},
{
"epoch": 0.1417233560090703,
"grad_norm": 21.717317581176758,
"learning_rate": 1.7730994152046785e-05,
"loss": 1.6657,
"num_tokens": 138859.0,
"step": 125
},
{
"epoch": 0.14285714285714285,
"grad_norm": 21.426856994628906,
"learning_rate": 1.7707602339181288e-05,
"loss": 1.4688,
"num_tokens": 140019.0,
"step": 126
},
{
"epoch": 0.14399092970521543,
"grad_norm": 23.49262237548828,
"learning_rate": 1.768421052631579e-05,
"loss": 1.5918,
"num_tokens": 141070.0,
"step": 127
},
{
"epoch": 0.14512471655328799,
"grad_norm": 23.134132385253906,
"learning_rate": 1.7660818713450293e-05,
"loss": 1.5323,
"num_tokens": 142190.0,
"step": 128
},
{
"epoch": 0.14625850340136054,
"grad_norm": 21.138839721679688,
"learning_rate": 1.76374269005848e-05,
"loss": 1.4775,
"num_tokens": 143354.0,
"step": 129
},
{
"epoch": 0.1473922902494331,
"grad_norm": 25.536008834838867,
"learning_rate": 1.7614035087719298e-05,
"loss": 1.5466,
"num_tokens": 144426.0,
"step": 130
},
{
"epoch": 0.14852607709750568,
"grad_norm": 23.146469116210938,
"learning_rate": 1.7590643274853804e-05,
"loss": 1.5849,
"num_tokens": 145548.0,
"step": 131
},
{
"epoch": 0.14965986394557823,
"grad_norm": 25.74847412109375,
"learning_rate": 1.7567251461988307e-05,
"loss": 1.5719,
"num_tokens": 146672.0,
"step": 132
},
{
"epoch": 0.15079365079365079,
"grad_norm": 27.166261672973633,
"learning_rate": 1.754385964912281e-05,
"loss": 1.5535,
"num_tokens": 147779.0,
"step": 133
},
{
"epoch": 0.15192743764172337,
"grad_norm": 36.15412139892578,
"learning_rate": 1.752046783625731e-05,
"loss": 1.5972,
"num_tokens": 148689.0,
"step": 134
},
{
"epoch": 0.15306122448979592,
"grad_norm": 27.721406936645508,
"learning_rate": 1.7497076023391814e-05,
"loss": 1.7406,
"num_tokens": 149900.0,
"step": 135
},
{
"epoch": 0.15419501133786848,
"grad_norm": 28.549758911132812,
"learning_rate": 1.7473684210526317e-05,
"loss": 1.5852,
"num_tokens": 151079.0,
"step": 136
},
{
"epoch": 0.15532879818594103,
"grad_norm": 29.91474151611328,
"learning_rate": 1.745029239766082e-05,
"loss": 1.6233,
"num_tokens": 152168.0,
"step": 137
},
{
"epoch": 0.1564625850340136,
"grad_norm": 29.763633728027344,
"learning_rate": 1.7426900584795322e-05,
"loss": 1.4513,
"num_tokens": 153262.0,
"step": 138
},
{
"epoch": 0.15759637188208617,
"grad_norm": 30.80873680114746,
"learning_rate": 1.7403508771929828e-05,
"loss": 1.4502,
"num_tokens": 154398.0,
"step": 139
},
{
"epoch": 0.15873015873015872,
"grad_norm": 32.4648323059082,
"learning_rate": 1.7380116959064327e-05,
"loss": 1.515,
"num_tokens": 155438.0,
"step": 140
},
{
"epoch": 0.1598639455782313,
"grad_norm": 30.387754440307617,
"learning_rate": 1.7356725146198833e-05,
"loss": 1.5156,
"num_tokens": 156535.0,
"step": 141
},
{
"epoch": 0.16099773242630386,
"grad_norm": 26.93579864501953,
"learning_rate": 1.7333333333333336e-05,
"loss": 1.5558,
"num_tokens": 157676.0,
"step": 142
},
{
"epoch": 0.1621315192743764,
"grad_norm": 23.415470123291016,
"learning_rate": 1.7309941520467838e-05,
"loss": 1.5061,
"num_tokens": 158849.0,
"step": 143
},
{
"epoch": 0.16326530612244897,
"grad_norm": 25.518234252929688,
"learning_rate": 1.728654970760234e-05,
"loss": 1.7354,
"num_tokens": 159943.0,
"step": 144
},
{
"epoch": 0.16439909297052155,
"grad_norm": 20.67085838317871,
"learning_rate": 1.7263157894736843e-05,
"loss": 1.7097,
"num_tokens": 161152.0,
"step": 145
},
{
"epoch": 0.1655328798185941,
"grad_norm": 20.089345932006836,
"learning_rate": 1.7239766081871346e-05,
"loss": 1.5232,
"num_tokens": 162350.0,
"step": 146
},
{
"epoch": 0.16666666666666666,
"grad_norm": 22.614852905273438,
"learning_rate": 1.721637426900585e-05,
"loss": 1.3706,
"num_tokens": 163374.0,
"step": 147
},
{
"epoch": 0.16780045351473924,
"grad_norm": 21.45526695251465,
"learning_rate": 1.719298245614035e-05,
"loss": 1.5264,
"num_tokens": 164537.0,
"step": 148
},
{
"epoch": 0.1689342403628118,
"grad_norm": 23.220937728881836,
"learning_rate": 1.7169590643274857e-05,
"loss": 1.4362,
"num_tokens": 165585.0,
"step": 149
},
{
"epoch": 0.17006802721088435,
"grad_norm": 24.269975662231445,
"learning_rate": 1.7146198830409356e-05,
"loss": 1.5262,
"num_tokens": 166600.0,
"step": 150
},
{
"epoch": 0.1712018140589569,
"grad_norm": 23.387495040893555,
"learning_rate": 1.7122807017543862e-05,
"loss": 1.4802,
"num_tokens": 167686.0,
"step": 151
},
{
"epoch": 0.17233560090702948,
"grad_norm": 23.54978370666504,
"learning_rate": 1.7099415204678365e-05,
"loss": 1.5707,
"num_tokens": 168793.0,
"step": 152
},
{
"epoch": 0.17346938775510204,
"grad_norm": 25.489028930664062,
"learning_rate": 1.7076023391812867e-05,
"loss": 1.5879,
"num_tokens": 169843.0,
"step": 153
},
{
"epoch": 0.1746031746031746,
"grad_norm": 23.096094131469727,
"learning_rate": 1.705263157894737e-05,
"loss": 1.4339,
"num_tokens": 170851.0,
"step": 154
},
{
"epoch": 0.17573696145124718,
"grad_norm": 22.319326400756836,
"learning_rate": 1.7029239766081872e-05,
"loss": 1.6513,
"num_tokens": 171960.0,
"step": 155
},
{
"epoch": 0.17687074829931973,
"grad_norm": 20.970462799072266,
"learning_rate": 1.7005847953216375e-05,
"loss": 1.5231,
"num_tokens": 173095.0,
"step": 156
},
{
"epoch": 0.17800453514739228,
"grad_norm": 22.09793472290039,
"learning_rate": 1.6982456140350878e-05,
"loss": 1.5093,
"num_tokens": 174221.0,
"step": 157
},
{
"epoch": 0.17913832199546487,
"grad_norm": 21.099365234375,
"learning_rate": 1.695906432748538e-05,
"loss": 1.5163,
"num_tokens": 175405.0,
"step": 158
},
{
"epoch": 0.18027210884353742,
"grad_norm": 24.84507179260254,
"learning_rate": 1.6935672514619886e-05,
"loss": 1.4498,
"num_tokens": 176445.0,
"step": 159
},
{
"epoch": 0.18140589569160998,
"grad_norm": 20.663394927978516,
"learning_rate": 1.6912280701754385e-05,
"loss": 1.4815,
"num_tokens": 177648.0,
"step": 160
},
{
"epoch": 0.18253968253968253,
"grad_norm": 20.654090881347656,
"learning_rate": 1.688888888888889e-05,
"loss": 1.534,
"num_tokens": 178831.0,
"step": 161
},
{
"epoch": 0.1836734693877551,
"grad_norm": 21.591787338256836,
"learning_rate": 1.6865497076023394e-05,
"loss": 1.4529,
"num_tokens": 179986.0,
"step": 162
},
{
"epoch": 0.18480725623582767,
"grad_norm": 21.668750762939453,
"learning_rate": 1.6842105263157896e-05,
"loss": 1.5613,
"num_tokens": 181201.0,
"step": 163
},
{
"epoch": 0.18594104308390022,
"grad_norm": 23.887989044189453,
"learning_rate": 1.68187134502924e-05,
"loss": 1.654,
"num_tokens": 182310.0,
"step": 164
},
{
"epoch": 0.1870748299319728,
"grad_norm": 21.20265007019043,
"learning_rate": 1.67953216374269e-05,
"loss": 1.561,
"num_tokens": 183432.0,
"step": 165
},
{
"epoch": 0.18820861678004536,
"grad_norm": 22.128807067871094,
"learning_rate": 1.6771929824561408e-05,
"loss": 1.6709,
"num_tokens": 184597.0,
"step": 166
},
{
"epoch": 0.1893424036281179,
"grad_norm": 24.24042320251465,
"learning_rate": 1.6748538011695907e-05,
"loss": 1.5693,
"num_tokens": 185668.0,
"step": 167
},
{
"epoch": 0.19047619047619047,
"grad_norm": 21.286191940307617,
"learning_rate": 1.672514619883041e-05,
"loss": 1.4641,
"num_tokens": 186778.0,
"step": 168
},
{
"epoch": 0.19160997732426305,
"grad_norm": 21.220508575439453,
"learning_rate": 1.6701754385964915e-05,
"loss": 1.6059,
"num_tokens": 187963.0,
"step": 169
},
{
"epoch": 0.1927437641723356,
"grad_norm": 20.513689041137695,
"learning_rate": 1.6678362573099414e-05,
"loss": 1.496,
"num_tokens": 189061.0,
"step": 170
},
{
"epoch": 0.19387755102040816,
"grad_norm": 22.597923278808594,
"learning_rate": 1.665497076023392e-05,
"loss": 1.5342,
"num_tokens": 190184.0,
"step": 171
},
{
"epoch": 0.19501133786848074,
"grad_norm": 22.039106369018555,
"learning_rate": 1.6631578947368423e-05,
"loss": 1.4233,
"num_tokens": 191228.0,
"step": 172
},
{
"epoch": 0.1961451247165533,
"grad_norm": 21.834428787231445,
"learning_rate": 1.6608187134502926e-05,
"loss": 1.461,
"num_tokens": 192388.0,
"step": 173
},
{
"epoch": 0.19727891156462585,
"grad_norm": 20.996660232543945,
"learning_rate": 1.6584795321637428e-05,
"loss": 1.5707,
"num_tokens": 193528.0,
"step": 174
},
{
"epoch": 0.1984126984126984,
"grad_norm": 21.746488571166992,
"learning_rate": 1.656140350877193e-05,
"loss": 1.4698,
"num_tokens": 194550.0,
"step": 175
},
{
"epoch": 0.19954648526077098,
"grad_norm": 22.856365203857422,
"learning_rate": 1.6538011695906437e-05,
"loss": 1.4731,
"num_tokens": 195677.0,
"step": 176
},
{
"epoch": 0.20068027210884354,
"grad_norm": 21.368350982666016,
"learning_rate": 1.6514619883040936e-05,
"loss": 1.481,
"num_tokens": 196912.0,
"step": 177
},
{
"epoch": 0.2018140589569161,
"grad_norm": 20.245771408081055,
"learning_rate": 1.649122807017544e-05,
"loss": 1.4977,
"num_tokens": 198043.0,
"step": 178
},
{
"epoch": 0.20294784580498867,
"grad_norm": 25.99681282043457,
"learning_rate": 1.6467836257309944e-05,
"loss": 1.7099,
"num_tokens": 199157.0,
"step": 179
},
{
"epoch": 0.20408163265306123,
"grad_norm": 22.558382034301758,
"learning_rate": 1.6444444444444444e-05,
"loss": 1.5767,
"num_tokens": 200255.0,
"step": 180
},
{
"epoch": 0.20521541950113378,
"grad_norm": 22.796449661254883,
"learning_rate": 1.642105263157895e-05,
"loss": 1.4589,
"num_tokens": 201297.0,
"step": 181
},
{
"epoch": 0.20634920634920634,
"grad_norm": 20.285388946533203,
"learning_rate": 1.6397660818713452e-05,
"loss": 1.4589,
"num_tokens": 202492.0,
"step": 182
},
{
"epoch": 0.20748299319727892,
"grad_norm": 20.92437744140625,
"learning_rate": 1.6374269005847955e-05,
"loss": 1.5125,
"num_tokens": 203650.0,
"step": 183
},
{
"epoch": 0.20861678004535147,
"grad_norm": 22.35742950439453,
"learning_rate": 1.6350877192982457e-05,
"loss": 1.442,
"num_tokens": 204723.0,
"step": 184
},
{
"epoch": 0.20975056689342403,
"grad_norm": 24.2554931640625,
"learning_rate": 1.632748538011696e-05,
"loss": 1.4875,
"num_tokens": 205792.0,
"step": 185
},
{
"epoch": 0.2108843537414966,
"grad_norm": 23.882658004760742,
"learning_rate": 1.6304093567251466e-05,
"loss": 1.6708,
"num_tokens": 206878.0,
"step": 186
},
{
"epoch": 0.21201814058956917,
"grad_norm": 22.755496978759766,
"learning_rate": 1.6280701754385965e-05,
"loss": 1.5278,
"num_tokens": 208071.0,
"step": 187
},
{
"epoch": 0.21315192743764172,
"grad_norm": 22.447935104370117,
"learning_rate": 1.625730994152047e-05,
"loss": 1.4872,
"num_tokens": 209227.0,
"step": 188
},
{
"epoch": 0.21428571428571427,
"grad_norm": 22.08466339111328,
"learning_rate": 1.6233918128654974e-05,
"loss": 1.5934,
"num_tokens": 210372.0,
"step": 189
},
{
"epoch": 0.21541950113378686,
"grad_norm": 21.285282135009766,
"learning_rate": 1.6210526315789473e-05,
"loss": 1.4961,
"num_tokens": 211604.0,
"step": 190
},
{
"epoch": 0.2165532879818594,
"grad_norm": 21.292037963867188,
"learning_rate": 1.618713450292398e-05,
"loss": 1.503,
"num_tokens": 212855.0,
"step": 191
},
{
"epoch": 0.21768707482993196,
"grad_norm": 21.15336036682129,
"learning_rate": 1.616374269005848e-05,
"loss": 1.4699,
"num_tokens": 213956.0,
"step": 192
},
{
"epoch": 0.21882086167800455,
"grad_norm": 22.280893325805664,
"learning_rate": 1.6140350877192984e-05,
"loss": 1.6332,
"num_tokens": 215120.0,
"step": 193
},
{
"epoch": 0.2199546485260771,
"grad_norm": 22.338592529296875,
"learning_rate": 1.6116959064327486e-05,
"loss": 1.4444,
"num_tokens": 216206.0,
"step": 194
},
{
"epoch": 0.22108843537414966,
"grad_norm": 20.818641662597656,
"learning_rate": 1.609356725146199e-05,
"loss": 1.4245,
"num_tokens": 217303.0,
"step": 195
},
{
"epoch": 0.2222222222222222,
"grad_norm": 23.549814224243164,
"learning_rate": 1.6070175438596495e-05,
"loss": 1.5753,
"num_tokens": 218327.0,
"step": 196
},
{
"epoch": 0.2233560090702948,
"grad_norm": 22.222686767578125,
"learning_rate": 1.6046783625730994e-05,
"loss": 1.3668,
"num_tokens": 219485.0,
"step": 197
},
{
"epoch": 0.22448979591836735,
"grad_norm": 19.54814910888672,
"learning_rate": 1.60233918128655e-05,
"loss": 1.6546,
"num_tokens": 220708.0,
"step": 198
},
{
"epoch": 0.2256235827664399,
"grad_norm": 20.543609619140625,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.5096,
"num_tokens": 221781.0,
"step": 199
},
{
"epoch": 0.22675736961451248,
"grad_norm": 21.107006072998047,
"learning_rate": 1.5976608187134505e-05,
"loss": 1.5088,
"num_tokens": 222829.0,
"step": 200
},
{
"epoch": 0.22789115646258504,
"grad_norm": 23.751380920410156,
"learning_rate": 1.5953216374269008e-05,
"loss": 1.4793,
"num_tokens": 224071.0,
"step": 201
},
{
"epoch": 0.2290249433106576,
"grad_norm": 22.424638748168945,
"learning_rate": 1.592982456140351e-05,
"loss": 1.6096,
"num_tokens": 225179.0,
"step": 202
},
{
"epoch": 0.23015873015873015,
"grad_norm": 22.971948623657227,
"learning_rate": 1.5906432748538013e-05,
"loss": 1.4364,
"num_tokens": 226250.0,
"step": 203
},
{
"epoch": 0.23129251700680273,
"grad_norm": 21.397802352905273,
"learning_rate": 1.5883040935672516e-05,
"loss": 1.5396,
"num_tokens": 227380.0,
"step": 204
},
{
"epoch": 0.23242630385487528,
"grad_norm": 46.37774658203125,
"learning_rate": 1.5859649122807018e-05,
"loss": 1.43,
"num_tokens": 228565.0,
"step": 205
},
{
"epoch": 0.23356009070294784,
"grad_norm": 24.68637466430664,
"learning_rate": 1.583625730994152e-05,
"loss": 1.683,
"num_tokens": 229639.0,
"step": 206
},
{
"epoch": 0.23469387755102042,
"grad_norm": 21.283252716064453,
"learning_rate": 1.5812865497076023e-05,
"loss": 1.5629,
"num_tokens": 230753.0,
"step": 207
},
{
"epoch": 0.23582766439909297,
"grad_norm": 22.06300926208496,
"learning_rate": 1.578947368421053e-05,
"loss": 1.5683,
"num_tokens": 231866.0,
"step": 208
},
{
"epoch": 0.23696145124716553,
"grad_norm": 21.9282283782959,
"learning_rate": 1.5766081871345032e-05,
"loss": 1.5273,
"num_tokens": 233001.0,
"step": 209
},
{
"epoch": 0.23809523809523808,
"grad_norm": 21.07894515991211,
"learning_rate": 1.5742690058479534e-05,
"loss": 1.5438,
"num_tokens": 234113.0,
"step": 210
},
{
"epoch": 0.23922902494331066,
"grad_norm": 19.786283493041992,
"learning_rate": 1.5719298245614037e-05,
"loss": 1.3995,
"num_tokens": 235294.0,
"step": 211
},
{
"epoch": 0.24036281179138322,
"grad_norm": 23.547483444213867,
"learning_rate": 1.569590643274854e-05,
"loss": 1.4131,
"num_tokens": 236392.0,
"step": 212
},
{
"epoch": 0.24149659863945577,
"grad_norm": 20.67225456237793,
"learning_rate": 1.5672514619883042e-05,
"loss": 1.4733,
"num_tokens": 237522.0,
"step": 213
},
{
"epoch": 0.24263038548752835,
"grad_norm": 22.142709732055664,
"learning_rate": 1.5649122807017545e-05,
"loss": 1.5663,
"num_tokens": 238635.0,
"step": 214
},
{
"epoch": 0.2437641723356009,
"grad_norm": 23.121814727783203,
"learning_rate": 1.5625730994152047e-05,
"loss": 1.3922,
"num_tokens": 239690.0,
"step": 215
},
{
"epoch": 0.24489795918367346,
"grad_norm": 25.34332847595215,
"learning_rate": 1.560233918128655e-05,
"loss": 1.529,
"num_tokens": 240712.0,
"step": 216
},
{
"epoch": 0.24603174603174602,
"grad_norm": 20.340042114257812,
"learning_rate": 1.5578947368421052e-05,
"loss": 1.4748,
"num_tokens": 241899.0,
"step": 217
},
{
"epoch": 0.2471655328798186,
"grad_norm": 21.851383209228516,
"learning_rate": 1.555555555555556e-05,
"loss": 1.537,
"num_tokens": 242975.0,
"step": 218
},
{
"epoch": 0.24829931972789115,
"grad_norm": 22.453187942504883,
"learning_rate": 1.553216374269006e-05,
"loss": 1.5451,
"num_tokens": 244129.0,
"step": 219
},
{
"epoch": 0.2494331065759637,
"grad_norm": 22.221010208129883,
"learning_rate": 1.5508771929824563e-05,
"loss": 1.5423,
"num_tokens": 245290.0,
"step": 220
},
{
"epoch": 0.25056689342403626,
"grad_norm": 22.524154663085938,
"learning_rate": 1.5485380116959066e-05,
"loss": 1.6421,
"num_tokens": 246386.0,
"step": 221
},
{
"epoch": 0.25170068027210885,
"grad_norm": 21.442996978759766,
"learning_rate": 1.546198830409357e-05,
"loss": 1.4855,
"num_tokens": 247440.0,
"step": 222
},
{
"epoch": 0.2528344671201814,
"grad_norm": 24.744312286376953,
"learning_rate": 1.543859649122807e-05,
"loss": 1.3662,
"num_tokens": 248514.0,
"step": 223
},
{
"epoch": 0.25396825396825395,
"grad_norm": 23.523515701293945,
"learning_rate": 1.5415204678362574e-05,
"loss": 1.4826,
"num_tokens": 249642.0,
"step": 224
},
{
"epoch": 0.25510204081632654,
"grad_norm": 23.982515335083008,
"learning_rate": 1.5391812865497076e-05,
"loss": 1.3806,
"num_tokens": 250712.0,
"step": 225
},
{
"epoch": 0.2562358276643991,
"grad_norm": 22.8784236907959,
"learning_rate": 1.536842105263158e-05,
"loss": 1.4332,
"num_tokens": 251897.0,
"step": 226
},
{
"epoch": 0.25736961451247165,
"grad_norm": 22.019994735717773,
"learning_rate": 1.534502923976608e-05,
"loss": 1.418,
"num_tokens": 253031.0,
"step": 227
},
{
"epoch": 0.2585034013605442,
"grad_norm": 22.86067008972168,
"learning_rate": 1.5321637426900587e-05,
"loss": 1.6491,
"num_tokens": 254172.0,
"step": 228
},
{
"epoch": 0.25963718820861675,
"grad_norm": 22.04460334777832,
"learning_rate": 1.529824561403509e-05,
"loss": 1.373,
"num_tokens": 255350.0,
"step": 229
},
{
"epoch": 0.26077097505668934,
"grad_norm": 21.948341369628906,
"learning_rate": 1.5274853801169593e-05,
"loss": 1.441,
"num_tokens": 256554.0,
"step": 230
},
{
"epoch": 0.2619047619047619,
"grad_norm": 22.99373435974121,
"learning_rate": 1.5251461988304095e-05,
"loss": 1.5293,
"num_tokens": 257729.0,
"step": 231
},
{
"epoch": 0.26303854875283444,
"grad_norm": 25.595821380615234,
"learning_rate": 1.5228070175438598e-05,
"loss": 1.4453,
"num_tokens": 258806.0,
"step": 232
},
{
"epoch": 0.264172335600907,
"grad_norm": 22.947647094726562,
"learning_rate": 1.52046783625731e-05,
"loss": 1.5168,
"num_tokens": 259874.0,
"step": 233
},
{
"epoch": 0.2653061224489796,
"grad_norm": 25.86823081970215,
"learning_rate": 1.5181286549707603e-05,
"loss": 1.5426,
"num_tokens": 260959.0,
"step": 234
},
{
"epoch": 0.26643990929705214,
"grad_norm": 20.40091896057129,
"learning_rate": 1.5157894736842107e-05,
"loss": 1.7117,
"num_tokens": 262189.0,
"step": 235
},
{
"epoch": 0.2675736961451247,
"grad_norm": 21.5074462890625,
"learning_rate": 1.5134502923976608e-05,
"loss": 1.4967,
"num_tokens": 263381.0,
"step": 236
},
{
"epoch": 0.2687074829931973,
"grad_norm": 20.92792320251465,
"learning_rate": 1.5111111111111112e-05,
"loss": 1.5214,
"num_tokens": 264514.0,
"step": 237
},
{
"epoch": 0.2698412698412698,
"grad_norm": 22.8386287689209,
"learning_rate": 1.5087719298245615e-05,
"loss": 1.5155,
"num_tokens": 265629.0,
"step": 238
},
{
"epoch": 0.2709750566893424,
"grad_norm": 21.064926147460938,
"learning_rate": 1.5064327485380119e-05,
"loss": 1.44,
"num_tokens": 266750.0,
"step": 239
},
{
"epoch": 0.272108843537415,
"grad_norm": 22.04184913635254,
"learning_rate": 1.504093567251462e-05,
"loss": 1.5298,
"num_tokens": 267781.0,
"step": 240
},
{
"epoch": 0.2732426303854875,
"grad_norm": 24.02085304260254,
"learning_rate": 1.5017543859649124e-05,
"loss": 1.425,
"num_tokens": 268921.0,
"step": 241
},
{
"epoch": 0.2743764172335601,
"grad_norm": 22.363149642944336,
"learning_rate": 1.4994152046783627e-05,
"loss": 1.4996,
"num_tokens": 269985.0,
"step": 242
},
{
"epoch": 0.2755102040816326,
"grad_norm": 23.14397430419922,
"learning_rate": 1.497076023391813e-05,
"loss": 1.5728,
"num_tokens": 271085.0,
"step": 243
},
{
"epoch": 0.2766439909297052,
"grad_norm": 21.870580673217773,
"learning_rate": 1.4947368421052632e-05,
"loss": 1.5506,
"num_tokens": 272207.0,
"step": 244
},
{
"epoch": 0.2777777777777778,
"grad_norm": 23.548410415649414,
"learning_rate": 1.4923976608187136e-05,
"loss": 1.4119,
"num_tokens": 273195.0,
"step": 245
},
{
"epoch": 0.2789115646258503,
"grad_norm": 22.918581008911133,
"learning_rate": 1.4900584795321637e-05,
"loss": 1.593,
"num_tokens": 274395.0,
"step": 246
},
{
"epoch": 0.2800453514739229,
"grad_norm": 24.52320671081543,
"learning_rate": 1.4877192982456141e-05,
"loss": 1.592,
"num_tokens": 275413.0,
"step": 247
},
{
"epoch": 0.2811791383219955,
"grad_norm": 25.395458221435547,
"learning_rate": 1.4853801169590644e-05,
"loss": 1.3833,
"num_tokens": 276425.0,
"step": 248
},
{
"epoch": 0.282312925170068,
"grad_norm": 20.522045135498047,
"learning_rate": 1.4830409356725148e-05,
"loss": 1.4719,
"num_tokens": 277674.0,
"step": 249
},
{
"epoch": 0.2834467120181406,
"grad_norm": 22.4309024810791,
"learning_rate": 1.4807017543859649e-05,
"loss": 1.3931,
"num_tokens": 278761.0,
"step": 250
},
{
"epoch": 0.28458049886621317,
"grad_norm": 23.054508209228516,
"learning_rate": 1.4783625730994153e-05,
"loss": 1.6357,
"num_tokens": 279900.0,
"step": 251
},
{
"epoch": 0.2857142857142857,
"grad_norm": 22.669353485107422,
"learning_rate": 1.4760233918128658e-05,
"loss": 1.5954,
"num_tokens": 281152.0,
"step": 252
},
{
"epoch": 0.2868480725623583,
"grad_norm": 24.546659469604492,
"learning_rate": 1.4736842105263159e-05,
"loss": 1.5536,
"num_tokens": 282236.0,
"step": 253
},
{
"epoch": 0.28798185941043086,
"grad_norm": 22.03496551513672,
"learning_rate": 1.4713450292397661e-05,
"loss": 1.4711,
"num_tokens": 283294.0,
"step": 254
},
{
"epoch": 0.2891156462585034,
"grad_norm": 23.8269100189209,
"learning_rate": 1.4690058479532165e-05,
"loss": 1.5215,
"num_tokens": 284345.0,
"step": 255
},
{
"epoch": 0.29024943310657597,
"grad_norm": 21.356388092041016,
"learning_rate": 1.4666666666666666e-05,
"loss": 1.6076,
"num_tokens": 285419.0,
"step": 256
},
{
"epoch": 0.29138321995464855,
"grad_norm": 20.89931869506836,
"learning_rate": 1.464327485380117e-05,
"loss": 1.4129,
"num_tokens": 286535.0,
"step": 257
},
{
"epoch": 0.2925170068027211,
"grad_norm": 21.1955623626709,
"learning_rate": 1.4619883040935675e-05,
"loss": 1.469,
"num_tokens": 287631.0,
"step": 258
},
{
"epoch": 0.29365079365079366,
"grad_norm": 19.76694107055664,
"learning_rate": 1.4596491228070177e-05,
"loss": 1.6142,
"num_tokens": 288754.0,
"step": 259
},
{
"epoch": 0.2947845804988662,
"grad_norm": 21.243698120117188,
"learning_rate": 1.4573099415204678e-05,
"loss": 1.5581,
"num_tokens": 289958.0,
"step": 260
},
{
"epoch": 0.29591836734693877,
"grad_norm": 22.617021560668945,
"learning_rate": 1.4549707602339183e-05,
"loss": 1.4423,
"num_tokens": 291137.0,
"step": 261
},
{
"epoch": 0.29705215419501135,
"grad_norm": 20.3275146484375,
"learning_rate": 1.4526315789473687e-05,
"loss": 1.5287,
"num_tokens": 292312.0,
"step": 262
},
{
"epoch": 0.2981859410430839,
"grad_norm": 26.21241569519043,
"learning_rate": 1.4502923976608188e-05,
"loss": 1.4952,
"num_tokens": 293355.0,
"step": 263
},
{
"epoch": 0.29931972789115646,
"grad_norm": 23.586990356445312,
"learning_rate": 1.447953216374269e-05,
"loss": 1.4412,
"num_tokens": 294480.0,
"step": 264
},
{
"epoch": 0.30045351473922904,
"grad_norm": 23.049999237060547,
"learning_rate": 1.4456140350877195e-05,
"loss": 1.5597,
"num_tokens": 295543.0,
"step": 265
},
{
"epoch": 0.30158730158730157,
"grad_norm": 22.225637435913086,
"learning_rate": 1.4432748538011695e-05,
"loss": 1.5182,
"num_tokens": 296682.0,
"step": 266
},
{
"epoch": 0.30272108843537415,
"grad_norm": 22.19669532775879,
"learning_rate": 1.44093567251462e-05,
"loss": 1.5603,
"num_tokens": 297762.0,
"step": 267
},
{
"epoch": 0.30385487528344673,
"grad_norm": 20.865692138671875,
"learning_rate": 1.4385964912280704e-05,
"loss": 1.4716,
"num_tokens": 298916.0,
"step": 268
},
{
"epoch": 0.30498866213151926,
"grad_norm": 24.449691772460938,
"learning_rate": 1.4362573099415207e-05,
"loss": 1.3971,
"num_tokens": 299936.0,
"step": 269
},
{
"epoch": 0.30612244897959184,
"grad_norm": 21.59602165222168,
"learning_rate": 1.4339181286549707e-05,
"loss": 1.5539,
"num_tokens": 301074.0,
"step": 270
},
{
"epoch": 0.3072562358276644,
"grad_norm": 23.660263061523438,
"learning_rate": 1.4315789473684212e-05,
"loss": 1.4892,
"num_tokens": 302125.0,
"step": 271
},
{
"epoch": 0.30839002267573695,
"grad_norm": 22.853479385375977,
"learning_rate": 1.4292397660818716e-05,
"loss": 1.4667,
"num_tokens": 303263.0,
"step": 272
},
{
"epoch": 0.30952380952380953,
"grad_norm": 23.96630859375,
"learning_rate": 1.4269005847953217e-05,
"loss": 1.4877,
"num_tokens": 304463.0,
"step": 273
},
{
"epoch": 0.31065759637188206,
"grad_norm": 21.596799850463867,
"learning_rate": 1.4245614035087721e-05,
"loss": 1.4767,
"num_tokens": 305660.0,
"step": 274
},
{
"epoch": 0.31179138321995464,
"grad_norm": 23.900022506713867,
"learning_rate": 1.4222222222222224e-05,
"loss": 1.4882,
"num_tokens": 306684.0,
"step": 275
},
{
"epoch": 0.3129251700680272,
"grad_norm": 21.063495635986328,
"learning_rate": 1.4198830409356725e-05,
"loss": 1.4495,
"num_tokens": 307813.0,
"step": 276
},
{
"epoch": 0.31405895691609975,
"grad_norm": 23.05027198791504,
"learning_rate": 1.4175438596491229e-05,
"loss": 1.4677,
"num_tokens": 308876.0,
"step": 277
},
{
"epoch": 0.31519274376417233,
"grad_norm": 20.81597900390625,
"learning_rate": 1.4152046783625733e-05,
"loss": 1.4085,
"num_tokens": 310012.0,
"step": 278
},
{
"epoch": 0.3163265306122449,
"grad_norm": 23.965967178344727,
"learning_rate": 1.4128654970760236e-05,
"loss": 1.4615,
"num_tokens": 311070.0,
"step": 279
},
{
"epoch": 0.31746031746031744,
"grad_norm": 22.640148162841797,
"learning_rate": 1.4105263157894738e-05,
"loss": 1.3822,
"num_tokens": 312154.0,
"step": 280
},
{
"epoch": 0.31859410430839,
"grad_norm": 22.867446899414062,
"learning_rate": 1.408187134502924e-05,
"loss": 1.6723,
"num_tokens": 313253.0,
"step": 281
},
{
"epoch": 0.3197278911564626,
"grad_norm": 22.407011032104492,
"learning_rate": 1.4058479532163745e-05,
"loss": 1.4687,
"num_tokens": 314325.0,
"step": 282
},
{
"epoch": 0.32086167800453513,
"grad_norm": 21.54814338684082,
"learning_rate": 1.4035087719298246e-05,
"loss": 1.5139,
"num_tokens": 315399.0,
"step": 283
},
{
"epoch": 0.3219954648526077,
"grad_norm": 21.578617095947266,
"learning_rate": 1.401169590643275e-05,
"loss": 1.3751,
"num_tokens": 316543.0,
"step": 284
},
{
"epoch": 0.3231292517006803,
"grad_norm": 21.266693115234375,
"learning_rate": 1.3988304093567253e-05,
"loss": 1.4847,
"num_tokens": 317658.0,
"step": 285
},
{
"epoch": 0.3242630385487528,
"grad_norm": 23.686180114746094,
"learning_rate": 1.3964912280701755e-05,
"loss": 1.491,
"num_tokens": 318773.0,
"step": 286
},
{
"epoch": 0.3253968253968254,
"grad_norm": 22.65009117126465,
"learning_rate": 1.3941520467836258e-05,
"loss": 1.5428,
"num_tokens": 319995.0,
"step": 287
},
{
"epoch": 0.32653061224489793,
"grad_norm": 20.74267578125,
"learning_rate": 1.3918128654970762e-05,
"loss": 1.3828,
"num_tokens": 321078.0,
"step": 288
},
{
"epoch": 0.3276643990929705,
"grad_norm": 22.6479434967041,
"learning_rate": 1.3894736842105265e-05,
"loss": 1.4634,
"num_tokens": 322140.0,
"step": 289
},
{
"epoch": 0.3287981859410431,
"grad_norm": 23.637678146362305,
"learning_rate": 1.3871345029239767e-05,
"loss": 1.4539,
"num_tokens": 323326.0,
"step": 290
},
{
"epoch": 0.3299319727891156,
"grad_norm": 20.191329956054688,
"learning_rate": 1.384795321637427e-05,
"loss": 1.3562,
"num_tokens": 324391.0,
"step": 291
},
{
"epoch": 0.3310657596371882,
"grad_norm": 24.18254852294922,
"learning_rate": 1.3824561403508774e-05,
"loss": 1.407,
"num_tokens": 325423.0,
"step": 292
},
{
"epoch": 0.3321995464852608,
"grad_norm": 22.894956588745117,
"learning_rate": 1.3801169590643275e-05,
"loss": 1.4151,
"num_tokens": 326471.0,
"step": 293
},
{
"epoch": 0.3333333333333333,
"grad_norm": 25.60346221923828,
"learning_rate": 1.377777777777778e-05,
"loss": 1.5366,
"num_tokens": 327570.0,
"step": 294
},
{
"epoch": 0.3344671201814059,
"grad_norm": 23.408321380615234,
"learning_rate": 1.3754385964912282e-05,
"loss": 1.5041,
"num_tokens": 328672.0,
"step": 295
},
{
"epoch": 0.3356009070294785,
"grad_norm": 23.32335662841797,
"learning_rate": 1.3730994152046784e-05,
"loss": 1.5033,
"num_tokens": 329696.0,
"step": 296
},
{
"epoch": 0.336734693877551,
"grad_norm": 20.227418899536133,
"learning_rate": 1.3707602339181287e-05,
"loss": 1.3922,
"num_tokens": 330810.0,
"step": 297
},
{
"epoch": 0.3378684807256236,
"grad_norm": 20.30182647705078,
"learning_rate": 1.3684210526315791e-05,
"loss": 1.4642,
"num_tokens": 331998.0,
"step": 298
},
{
"epoch": 0.33900226757369617,
"grad_norm": 22.34268569946289,
"learning_rate": 1.3660818713450294e-05,
"loss": 1.3948,
"num_tokens": 333011.0,
"step": 299
},
{
"epoch": 0.3401360544217687,
"grad_norm": 21.91162872314453,
"learning_rate": 1.3637426900584796e-05,
"loss": 1.4822,
"num_tokens": 334104.0,
"step": 300
},
{
"epoch": 0.3412698412698413,
"grad_norm": 22.55190658569336,
"learning_rate": 1.3614035087719299e-05,
"loss": 1.6097,
"num_tokens": 335283.0,
"step": 301
},
{
"epoch": 0.3424036281179138,
"grad_norm": 21.861995697021484,
"learning_rate": 1.3590643274853803e-05,
"loss": 1.588,
"num_tokens": 336363.0,
"step": 302
},
{
"epoch": 0.3435374149659864,
"grad_norm": 21.788677215576172,
"learning_rate": 1.3567251461988304e-05,
"loss": 1.582,
"num_tokens": 337576.0,
"step": 303
},
{
"epoch": 0.34467120181405897,
"grad_norm": 20.740257263183594,
"learning_rate": 1.3543859649122808e-05,
"loss": 1.3907,
"num_tokens": 338641.0,
"step": 304
},
{
"epoch": 0.3458049886621315,
"grad_norm": 21.97249412536621,
"learning_rate": 1.3520467836257311e-05,
"loss": 1.5256,
"num_tokens": 339795.0,
"step": 305
},
{
"epoch": 0.3469387755102041,
"grad_norm": 21.80385971069336,
"learning_rate": 1.3497076023391814e-05,
"loss": 1.4808,
"num_tokens": 340889.0,
"step": 306
},
{
"epoch": 0.34807256235827666,
"grad_norm": 22.424528121948242,
"learning_rate": 1.3473684210526316e-05,
"loss": 1.575,
"num_tokens": 342007.0,
"step": 307
},
{
"epoch": 0.3492063492063492,
"grad_norm": 22.06599998474121,
"learning_rate": 1.345029239766082e-05,
"loss": 1.307,
"num_tokens": 343151.0,
"step": 308
},
{
"epoch": 0.35034013605442177,
"grad_norm": 24.721797943115234,
"learning_rate": 1.3426900584795323e-05,
"loss": 1.5736,
"num_tokens": 344208.0,
"step": 309
},
{
"epoch": 0.35147392290249435,
"grad_norm": 22.60153579711914,
"learning_rate": 1.3403508771929826e-05,
"loss": 1.5604,
"num_tokens": 345374.0,
"step": 310
},
{
"epoch": 0.3526077097505669,
"grad_norm": 21.573318481445312,
"learning_rate": 1.3380116959064328e-05,
"loss": 1.524,
"num_tokens": 346608.0,
"step": 311
},
{
"epoch": 0.35374149659863946,
"grad_norm": 25.30203628540039,
"learning_rate": 1.3356725146198832e-05,
"loss": 1.6416,
"num_tokens": 347714.0,
"step": 312
},
{
"epoch": 0.35487528344671204,
"grad_norm": 22.905746459960938,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.5275,
"num_tokens": 348930.0,
"step": 313
},
{
"epoch": 0.35600907029478457,
"grad_norm": 23.923063278198242,
"learning_rate": 1.3309941520467838e-05,
"loss": 1.5784,
"num_tokens": 349999.0,
"step": 314
},
{
"epoch": 0.35714285714285715,
"grad_norm": 22.065505981445312,
"learning_rate": 1.328654970760234e-05,
"loss": 1.5701,
"num_tokens": 351123.0,
"step": 315
},
{
"epoch": 0.35827664399092973,
"grad_norm": 22.438655853271484,
"learning_rate": 1.3263157894736843e-05,
"loss": 1.5035,
"num_tokens": 352245.0,
"step": 316
},
{
"epoch": 0.35941043083900226,
"grad_norm": 21.77728271484375,
"learning_rate": 1.3239766081871345e-05,
"loss": 1.4594,
"num_tokens": 353380.0,
"step": 317
},
{
"epoch": 0.36054421768707484,
"grad_norm": 26.78565788269043,
"learning_rate": 1.321637426900585e-05,
"loss": 1.6554,
"num_tokens": 354375.0,
"step": 318
},
{
"epoch": 0.36167800453514737,
"grad_norm": 22.74375343322754,
"learning_rate": 1.3192982456140354e-05,
"loss": 1.4583,
"num_tokens": 355502.0,
"step": 319
},
{
"epoch": 0.36281179138321995,
"grad_norm": 22.99059295654297,
"learning_rate": 1.3169590643274855e-05,
"loss": 1.4056,
"num_tokens": 356615.0,
"step": 320
},
{
"epoch": 0.36394557823129253,
"grad_norm": 19.973115921020508,
"learning_rate": 1.3146198830409357e-05,
"loss": 1.4359,
"num_tokens": 357688.0,
"step": 321
},
{
"epoch": 0.36507936507936506,
"grad_norm": 20.598600387573242,
"learning_rate": 1.3122807017543862e-05,
"loss": 1.3685,
"num_tokens": 358832.0,
"step": 322
},
{
"epoch": 0.36621315192743764,
"grad_norm": 23.178176879882812,
"learning_rate": 1.3099415204678362e-05,
"loss": 1.4443,
"num_tokens": 359904.0,
"step": 323
},
{
"epoch": 0.3673469387755102,
"grad_norm": 23.088056564331055,
"learning_rate": 1.3076023391812867e-05,
"loss": 1.3505,
"num_tokens": 361014.0,
"step": 324
},
{
"epoch": 0.36848072562358275,
"grad_norm": 22.677352905273438,
"learning_rate": 1.305263157894737e-05,
"loss": 1.432,
"num_tokens": 362163.0,
"step": 325
},
{
"epoch": 0.36961451247165533,
"grad_norm": 22.128244400024414,
"learning_rate": 1.3029239766081872e-05,
"loss": 1.4475,
"num_tokens": 363222.0,
"step": 326
},
{
"epoch": 0.3707482993197279,
"grad_norm": 20.658218383789062,
"learning_rate": 1.3005847953216374e-05,
"loss": 1.4307,
"num_tokens": 364332.0,
"step": 327
},
{
"epoch": 0.37188208616780044,
"grad_norm": 24.494672775268555,
"learning_rate": 1.2982456140350879e-05,
"loss": 1.5444,
"num_tokens": 365433.0,
"step": 328
},
{
"epoch": 0.373015873015873,
"grad_norm": 24.10431671142578,
"learning_rate": 1.2959064327485383e-05,
"loss": 1.4431,
"num_tokens": 366487.0,
"step": 329
},
{
"epoch": 0.3741496598639456,
"grad_norm": 20.076684951782227,
"learning_rate": 1.2935672514619884e-05,
"loss": 1.4626,
"num_tokens": 367618.0,
"step": 330
},
{
"epoch": 0.37528344671201813,
"grad_norm": 22.610557556152344,
"learning_rate": 1.2912280701754386e-05,
"loss": 1.4713,
"num_tokens": 368738.0,
"step": 331
},
{
"epoch": 0.3764172335600907,
"grad_norm": 25.21763801574707,
"learning_rate": 1.288888888888889e-05,
"loss": 1.5092,
"num_tokens": 369720.0,
"step": 332
},
{
"epoch": 0.37755102040816324,
"grad_norm": 20.842025756835938,
"learning_rate": 1.2865497076023392e-05,
"loss": 1.4863,
"num_tokens": 370885.0,
"step": 333
},
{
"epoch": 0.3786848072562358,
"grad_norm": 21.969791412353516,
"learning_rate": 1.2842105263157896e-05,
"loss": 1.3838,
"num_tokens": 371958.0,
"step": 334
},
{
"epoch": 0.3798185941043084,
"grad_norm": 23.09484100341797,
"learning_rate": 1.28187134502924e-05,
"loss": 1.6291,
"num_tokens": 373085.0,
"step": 335
},
{
"epoch": 0.38095238095238093,
"grad_norm": 20.638565063476562,
"learning_rate": 1.2795321637426901e-05,
"loss": 1.4701,
"num_tokens": 374210.0,
"step": 336
},
{
"epoch": 0.3820861678004535,
"grad_norm": 21.330707550048828,
"learning_rate": 1.2771929824561404e-05,
"loss": 1.4856,
"num_tokens": 375310.0,
"step": 337
},
{
"epoch": 0.3832199546485261,
"grad_norm": 22.242534637451172,
"learning_rate": 1.2748538011695908e-05,
"loss": 1.4001,
"num_tokens": 376372.0,
"step": 338
},
{
"epoch": 0.3843537414965986,
"grad_norm": 22.377954483032227,
"learning_rate": 1.2725146198830412e-05,
"loss": 1.6322,
"num_tokens": 377527.0,
"step": 339
},
{
"epoch": 0.3854875283446712,
"grad_norm": 22.19904327392578,
"learning_rate": 1.2701754385964913e-05,
"loss": 1.3738,
"num_tokens": 378604.0,
"step": 340
},
{
"epoch": 0.3866213151927438,
"grad_norm": 21.403820037841797,
"learning_rate": 1.2678362573099417e-05,
"loss": 1.3584,
"num_tokens": 379678.0,
"step": 341
},
{
"epoch": 0.3877551020408163,
"grad_norm": 21.75101089477539,
"learning_rate": 1.265497076023392e-05,
"loss": 1.4217,
"num_tokens": 380729.0,
"step": 342
},
{
"epoch": 0.3888888888888889,
"grad_norm": 23.82843589782715,
"learning_rate": 1.263157894736842e-05,
"loss": 1.447,
"num_tokens": 381876.0,
"step": 343
},
{
"epoch": 0.3900226757369615,
"grad_norm": 23.757530212402344,
"learning_rate": 1.2608187134502925e-05,
"loss": 1.4956,
"num_tokens": 382962.0,
"step": 344
},
{
"epoch": 0.391156462585034,
"grad_norm": 23.748046875,
"learning_rate": 1.258479532163743e-05,
"loss": 1.5339,
"num_tokens": 384096.0,
"step": 345
},
{
"epoch": 0.3922902494331066,
"grad_norm": 21.567293167114258,
"learning_rate": 1.256140350877193e-05,
"loss": 1.3744,
"num_tokens": 385175.0,
"step": 346
},
{
"epoch": 0.3934240362811791,
"grad_norm": 22.490840911865234,
"learning_rate": 1.2538011695906434e-05,
"loss": 1.4251,
"num_tokens": 386358.0,
"step": 347
},
{
"epoch": 0.3945578231292517,
"grad_norm": 21.747709274291992,
"learning_rate": 1.2514619883040937e-05,
"loss": 1.4746,
"num_tokens": 387482.0,
"step": 348
},
{
"epoch": 0.3956916099773243,
"grad_norm": 19.943635940551758,
"learning_rate": 1.2491228070175441e-05,
"loss": 1.4278,
"num_tokens": 388662.0,
"step": 349
},
{
"epoch": 0.3968253968253968,
"grad_norm": 22.539836883544922,
"learning_rate": 1.2467836257309942e-05,
"loss": 1.6124,
"num_tokens": 389731.0,
"step": 350
},
{
"epoch": 0.3979591836734694,
"grad_norm": 21.41621208190918,
"learning_rate": 1.2444444444444446e-05,
"loss": 1.4527,
"num_tokens": 390921.0,
"step": 351
},
{
"epoch": 0.39909297052154197,
"grad_norm": 21.026447296142578,
"learning_rate": 1.2421052631578949e-05,
"loss": 1.5012,
"num_tokens": 392035.0,
"step": 352
},
{
"epoch": 0.4002267573696145,
"grad_norm": 25.59100914001465,
"learning_rate": 1.239766081871345e-05,
"loss": 1.5081,
"num_tokens": 393093.0,
"step": 353
},
{
"epoch": 0.4013605442176871,
"grad_norm": 22.007505416870117,
"learning_rate": 1.2374269005847954e-05,
"loss": 1.5663,
"num_tokens": 394239.0,
"step": 354
},
{
"epoch": 0.40249433106575966,
"grad_norm": 23.461524963378906,
"learning_rate": 1.2350877192982458e-05,
"loss": 1.4302,
"num_tokens": 395291.0,
"step": 355
},
{
"epoch": 0.4036281179138322,
"grad_norm": 20.203760147094727,
"learning_rate": 1.232748538011696e-05,
"loss": 1.3541,
"num_tokens": 396485.0,
"step": 356
},
{
"epoch": 0.40476190476190477,
"grad_norm": 19.759798049926758,
"learning_rate": 1.2304093567251463e-05,
"loss": 1.4233,
"num_tokens": 397644.0,
"step": 357
},
{
"epoch": 0.40589569160997735,
"grad_norm": 22.704025268554688,
"learning_rate": 1.2280701754385966e-05,
"loss": 1.3426,
"num_tokens": 398789.0,
"step": 358
},
{
"epoch": 0.4070294784580499,
"grad_norm": 21.620922088623047,
"learning_rate": 1.225730994152047e-05,
"loss": 1.5188,
"num_tokens": 399878.0,
"step": 359
},
{
"epoch": 0.40816326530612246,
"grad_norm": 19.23693084716797,
"learning_rate": 1.2233918128654971e-05,
"loss": 1.3182,
"num_tokens": 400987.0,
"step": 360
},
{
"epoch": 0.409297052154195,
"grad_norm": 21.466594696044922,
"learning_rate": 1.2210526315789475e-05,
"loss": 1.5335,
"num_tokens": 402056.0,
"step": 361
},
{
"epoch": 0.41043083900226757,
"grad_norm": 21.63371467590332,
"learning_rate": 1.2187134502923978e-05,
"loss": 1.4244,
"num_tokens": 403197.0,
"step": 362
},
{
"epoch": 0.41156462585034015,
"grad_norm": 20.83876609802246,
"learning_rate": 1.216374269005848e-05,
"loss": 1.6196,
"num_tokens": 404342.0,
"step": 363
},
{
"epoch": 0.4126984126984127,
"grad_norm": 21.863889694213867,
"learning_rate": 1.2140350877192983e-05,
"loss": 1.5207,
"num_tokens": 405485.0,
"step": 364
},
{
"epoch": 0.41383219954648526,
"grad_norm": 20.582901000976562,
"learning_rate": 1.2116959064327487e-05,
"loss": 1.4857,
"num_tokens": 406682.0,
"step": 365
},
{
"epoch": 0.41496598639455784,
"grad_norm": 22.900217056274414,
"learning_rate": 1.2093567251461988e-05,
"loss": 1.453,
"num_tokens": 407731.0,
"step": 366
},
{
"epoch": 0.41609977324263037,
"grad_norm": 24.468358993530273,
"learning_rate": 1.2070175438596493e-05,
"loss": 1.5036,
"num_tokens": 408773.0,
"step": 367
},
{
"epoch": 0.41723356009070295,
"grad_norm": 22.844940185546875,
"learning_rate": 1.2046783625730995e-05,
"loss": 1.3458,
"num_tokens": 409839.0,
"step": 368
},
{
"epoch": 0.41836734693877553,
"grad_norm": 23.0582332611084,
"learning_rate": 1.20233918128655e-05,
"loss": 1.4995,
"num_tokens": 410856.0,
"step": 369
},
{
"epoch": 0.41950113378684806,
"grad_norm": 20.938154220581055,
"learning_rate": 1.2e-05,
"loss": 1.4394,
"num_tokens": 411957.0,
"step": 370
},
{
"epoch": 0.42063492063492064,
"grad_norm": 21.037168502807617,
"learning_rate": 1.1976608187134505e-05,
"loss": 1.3752,
"num_tokens": 413070.0,
"step": 371
},
{
"epoch": 0.4217687074829932,
"grad_norm": 23.35750961303711,
"learning_rate": 1.1953216374269007e-05,
"loss": 1.4844,
"num_tokens": 414204.0,
"step": 372
},
{
"epoch": 0.42290249433106575,
"grad_norm": 20.994701385498047,
"learning_rate": 1.192982456140351e-05,
"loss": 1.4312,
"num_tokens": 415467.0,
"step": 373
},
{
"epoch": 0.42403628117913833,
"grad_norm": 21.293333053588867,
"learning_rate": 1.1906432748538012e-05,
"loss": 1.4493,
"num_tokens": 416638.0,
"step": 374
},
{
"epoch": 0.42517006802721086,
"grad_norm": 21.202180862426758,
"learning_rate": 1.1883040935672517e-05,
"loss": 1.4468,
"num_tokens": 417771.0,
"step": 375
},
{
"epoch": 0.42630385487528344,
"grad_norm": 22.729934692382812,
"learning_rate": 1.1859649122807017e-05,
"loss": 1.5065,
"num_tokens": 418899.0,
"step": 376
},
{
"epoch": 0.427437641723356,
"grad_norm": 20.654706954956055,
"learning_rate": 1.1836257309941522e-05,
"loss": 1.453,
"num_tokens": 420087.0,
"step": 377
},
{
"epoch": 0.42857142857142855,
"grad_norm": 22.81949806213379,
"learning_rate": 1.1812865497076024e-05,
"loss": 1.4696,
"num_tokens": 421175.0,
"step": 378
},
{
"epoch": 0.42970521541950113,
"grad_norm": 22.954544067382812,
"learning_rate": 1.1789473684210527e-05,
"loss": 1.4957,
"num_tokens": 422301.0,
"step": 379
},
{
"epoch": 0.4308390022675737,
"grad_norm": 20.20991325378418,
"learning_rate": 1.176608187134503e-05,
"loss": 1.3547,
"num_tokens": 423479.0,
"step": 380
},
{
"epoch": 0.43197278911564624,
"grad_norm": 21.98236846923828,
"learning_rate": 1.1742690058479534e-05,
"loss": 1.5336,
"num_tokens": 424599.0,
"step": 381
},
{
"epoch": 0.4331065759637188,
"grad_norm": 22.076784133911133,
"learning_rate": 1.1719298245614036e-05,
"loss": 1.4168,
"num_tokens": 425638.0,
"step": 382
},
{
"epoch": 0.4342403628117914,
"grad_norm": 20.801210403442383,
"learning_rate": 1.1695906432748539e-05,
"loss": 1.5112,
"num_tokens": 426831.0,
"step": 383
},
{
"epoch": 0.43537414965986393,
"grad_norm": 21.711530685424805,
"learning_rate": 1.1672514619883041e-05,
"loss": 1.5072,
"num_tokens": 428008.0,
"step": 384
},
{
"epoch": 0.4365079365079365,
"grad_norm": 20.649227142333984,
"learning_rate": 1.1649122807017546e-05,
"loss": 1.3347,
"num_tokens": 429150.0,
"step": 385
},
{
"epoch": 0.4376417233560091,
"grad_norm": 24.207242965698242,
"learning_rate": 1.1625730994152047e-05,
"loss": 1.4249,
"num_tokens": 430401.0,
"step": 386
},
{
"epoch": 0.4387755102040816,
"grad_norm": 22.359968185424805,
"learning_rate": 1.160233918128655e-05,
"loss": 1.4618,
"num_tokens": 431572.0,
"step": 387
},
{
"epoch": 0.4399092970521542,
"grad_norm": 23.305208206176758,
"learning_rate": 1.1578947368421053e-05,
"loss": 1.6135,
"num_tokens": 432748.0,
"step": 388
},
{
"epoch": 0.4410430839002268,
"grad_norm": 23.788869857788086,
"learning_rate": 1.1555555555555556e-05,
"loss": 1.4025,
"num_tokens": 433673.0,
"step": 389
},
{
"epoch": 0.4421768707482993,
"grad_norm": 23.072362899780273,
"learning_rate": 1.1532163742690059e-05,
"loss": 1.4262,
"num_tokens": 434803.0,
"step": 390
},
{
"epoch": 0.4433106575963719,
"grad_norm": 24.782913208007812,
"learning_rate": 1.1508771929824563e-05,
"loss": 1.5596,
"num_tokens": 435895.0,
"step": 391
},
{
"epoch": 0.4444444444444444,
"grad_norm": 22.748624801635742,
"learning_rate": 1.1485380116959065e-05,
"loss": 1.5855,
"num_tokens": 436983.0,
"step": 392
},
{
"epoch": 0.445578231292517,
"grad_norm": 24.416845321655273,
"learning_rate": 1.1461988304093568e-05,
"loss": 1.5917,
"num_tokens": 438073.0,
"step": 393
},
{
"epoch": 0.4467120181405896,
"grad_norm": 21.77821922302246,
"learning_rate": 1.143859649122807e-05,
"loss": 1.3617,
"num_tokens": 439139.0,
"step": 394
},
{
"epoch": 0.4478458049886621,
"grad_norm": 22.38814926147461,
"learning_rate": 1.1415204678362575e-05,
"loss": 1.4778,
"num_tokens": 440259.0,
"step": 395
},
{
"epoch": 0.4489795918367347,
"grad_norm": 21.56502342224121,
"learning_rate": 1.1391812865497076e-05,
"loss": 1.5352,
"num_tokens": 441416.0,
"step": 396
},
{
"epoch": 0.4501133786848073,
"grad_norm": 20.565168380737305,
"learning_rate": 1.136842105263158e-05,
"loss": 1.5067,
"num_tokens": 442549.0,
"step": 397
},
{
"epoch": 0.4512471655328798,
"grad_norm": 19.611265182495117,
"learning_rate": 1.1345029239766083e-05,
"loss": 1.4137,
"num_tokens": 443711.0,
"step": 398
},
{
"epoch": 0.4523809523809524,
"grad_norm": 20.92315101623535,
"learning_rate": 1.1321637426900585e-05,
"loss": 1.4697,
"num_tokens": 444840.0,
"step": 399
},
{
"epoch": 0.45351473922902497,
"grad_norm": 19.84340476989746,
"learning_rate": 1.1298245614035088e-05,
"loss": 1.5124,
"num_tokens": 445920.0,
"step": 400
},
{
"epoch": 0.4546485260770975,
"grad_norm": 20.4117431640625,
"learning_rate": 1.1274853801169592e-05,
"loss": 1.5706,
"num_tokens": 447140.0,
"step": 401
},
{
"epoch": 0.4557823129251701,
"grad_norm": 20.32288360595703,
"learning_rate": 1.1251461988304096e-05,
"loss": 1.3753,
"num_tokens": 448247.0,
"step": 402
},
{
"epoch": 0.45691609977324266,
"grad_norm": 20.550880432128906,
"learning_rate": 1.1228070175438597e-05,
"loss": 1.3749,
"num_tokens": 449442.0,
"step": 403
},
{
"epoch": 0.4580498866213152,
"grad_norm": 22.589542388916016,
"learning_rate": 1.12046783625731e-05,
"loss": 1.4055,
"num_tokens": 450561.0,
"step": 404
},
{
"epoch": 0.45918367346938777,
"grad_norm": 20.559473037719727,
"learning_rate": 1.1181286549707604e-05,
"loss": 1.5184,
"num_tokens": 451707.0,
"step": 405
},
{
"epoch": 0.4603174603174603,
"grad_norm": 19.94198226928711,
"learning_rate": 1.1157894736842105e-05,
"loss": 1.4678,
"num_tokens": 452849.0,
"step": 406
},
{
"epoch": 0.4614512471655329,
"grad_norm": 21.226221084594727,
"learning_rate": 1.1134502923976609e-05,
"loss": 1.358,
"num_tokens": 453908.0,
"step": 407
},
{
"epoch": 0.46258503401360546,
"grad_norm": 20.742549896240234,
"learning_rate": 1.1111111111111113e-05,
"loss": 1.3297,
"num_tokens": 455072.0,
"step": 408
},
{
"epoch": 0.463718820861678,
"grad_norm": 20.73780059814453,
"learning_rate": 1.1087719298245614e-05,
"loss": 1.5569,
"num_tokens": 456203.0,
"step": 409
},
{
"epoch": 0.46485260770975056,
"grad_norm": 21.602121353149414,
"learning_rate": 1.1064327485380117e-05,
"loss": 1.512,
"num_tokens": 457306.0,
"step": 410
},
{
"epoch": 0.46598639455782315,
"grad_norm": 22.79952049255371,
"learning_rate": 1.1040935672514621e-05,
"loss": 1.5161,
"num_tokens": 458435.0,
"step": 411
},
{
"epoch": 0.4671201814058957,
"grad_norm": 22.42172622680664,
"learning_rate": 1.1017543859649125e-05,
"loss": 1.2366,
"num_tokens": 459465.0,
"step": 412
},
{
"epoch": 0.46825396825396826,
"grad_norm": 22.881622314453125,
"learning_rate": 1.0994152046783626e-05,
"loss": 1.5695,
"num_tokens": 460630.0,
"step": 413
},
{
"epoch": 0.46938775510204084,
"grad_norm": 21.594409942626953,
"learning_rate": 1.0970760233918129e-05,
"loss": 1.446,
"num_tokens": 461789.0,
"step": 414
},
{
"epoch": 0.47052154195011336,
"grad_norm": 23.080078125,
"learning_rate": 1.0947368421052633e-05,
"loss": 1.4408,
"num_tokens": 462919.0,
"step": 415
},
{
"epoch": 0.47165532879818595,
"grad_norm": 22.40519142150879,
"learning_rate": 1.0923976608187134e-05,
"loss": 1.3175,
"num_tokens": 463976.0,
"step": 416
},
{
"epoch": 0.47278911564625853,
"grad_norm": 23.810379028320312,
"learning_rate": 1.0900584795321638e-05,
"loss": 1.5058,
"num_tokens": 465105.0,
"step": 417
},
{
"epoch": 0.47392290249433106,
"grad_norm": 21.764375686645508,
"learning_rate": 1.0877192982456142e-05,
"loss": 1.4464,
"num_tokens": 466236.0,
"step": 418
},
{
"epoch": 0.47505668934240364,
"grad_norm": 21.064970016479492,
"learning_rate": 1.0853801169590643e-05,
"loss": 1.4898,
"num_tokens": 467384.0,
"step": 419
},
{
"epoch": 0.47619047619047616,
"grad_norm": 21.655441284179688,
"learning_rate": 1.0830409356725146e-05,
"loss": 1.3508,
"num_tokens": 468501.0,
"step": 420
},
{
"epoch": 0.47732426303854875,
"grad_norm": 23.890254974365234,
"learning_rate": 1.080701754385965e-05,
"loss": 1.597,
"num_tokens": 469514.0,
"step": 421
},
{
"epoch": 0.47845804988662133,
"grad_norm": 22.981708526611328,
"learning_rate": 1.0783625730994154e-05,
"loss": 1.5223,
"num_tokens": 470570.0,
"step": 422
},
{
"epoch": 0.47959183673469385,
"grad_norm": 23.430938720703125,
"learning_rate": 1.0760233918128655e-05,
"loss": 1.4387,
"num_tokens": 471783.0,
"step": 423
},
{
"epoch": 0.48072562358276644,
"grad_norm": 22.37723159790039,
"learning_rate": 1.073684210526316e-05,
"loss": 1.4399,
"num_tokens": 472934.0,
"step": 424
},
{
"epoch": 0.481859410430839,
"grad_norm": 22.111480712890625,
"learning_rate": 1.0713450292397662e-05,
"loss": 1.5395,
"num_tokens": 474128.0,
"step": 425
},
{
"epoch": 0.48299319727891155,
"grad_norm": 22.287321090698242,
"learning_rate": 1.0690058479532163e-05,
"loss": 1.419,
"num_tokens": 475240.0,
"step": 426
},
{
"epoch": 0.48412698412698413,
"grad_norm": 21.262020111083984,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.5711,
"num_tokens": 476354.0,
"step": 427
},
{
"epoch": 0.4852607709750567,
"grad_norm": 22.599716186523438,
"learning_rate": 1.0643274853801172e-05,
"loss": 1.5905,
"num_tokens": 477408.0,
"step": 428
},
{
"epoch": 0.48639455782312924,
"grad_norm": 23.220651626586914,
"learning_rate": 1.0619883040935672e-05,
"loss": 1.5256,
"num_tokens": 478458.0,
"step": 429
},
{
"epoch": 0.4875283446712018,
"grad_norm": 22.43808937072754,
"learning_rate": 1.0596491228070177e-05,
"loss": 1.4177,
"num_tokens": 479577.0,
"step": 430
},
{
"epoch": 0.4886621315192744,
"grad_norm": 23.958772659301758,
"learning_rate": 1.057309941520468e-05,
"loss": 1.4556,
"num_tokens": 480596.0,
"step": 431
},
{
"epoch": 0.4897959183673469,
"grad_norm": 23.579402923583984,
"learning_rate": 1.0549707602339184e-05,
"loss": 1.54,
"num_tokens": 481636.0,
"step": 432
},
{
"epoch": 0.4909297052154195,
"grad_norm": 23.185216903686523,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.5177,
"num_tokens": 482749.0,
"step": 433
},
{
"epoch": 0.49206349206349204,
"grad_norm": 21.95652961730957,
"learning_rate": 1.0502923976608189e-05,
"loss": 1.4486,
"num_tokens": 483894.0,
"step": 434
},
{
"epoch": 0.4931972789115646,
"grad_norm": 21.265581130981445,
"learning_rate": 1.0479532163742691e-05,
"loss": 1.5921,
"num_tokens": 485099.0,
"step": 435
},
{
"epoch": 0.4943310657596372,
"grad_norm": 21.50389289855957,
"learning_rate": 1.0456140350877194e-05,
"loss": 1.5166,
"num_tokens": 486238.0,
"step": 436
},
{
"epoch": 0.4954648526077097,
"grad_norm": 22.350412368774414,
"learning_rate": 1.0432748538011696e-05,
"loss": 1.4719,
"num_tokens": 487264.0,
"step": 437
},
{
"epoch": 0.4965986394557823,
"grad_norm": 21.480653762817383,
"learning_rate": 1.04093567251462e-05,
"loss": 1.4648,
"num_tokens": 488457.0,
"step": 438
},
{
"epoch": 0.4977324263038549,
"grad_norm": 23.51890754699707,
"learning_rate": 1.0385964912280702e-05,
"loss": 1.5191,
"num_tokens": 489594.0,
"step": 439
},
{
"epoch": 0.4988662131519274,
"grad_norm": 21.588485717773438,
"learning_rate": 1.0362573099415206e-05,
"loss": 1.5039,
"num_tokens": 490762.0,
"step": 440
},
{
"epoch": 0.5,
"grad_norm": 21.730770111083984,
"learning_rate": 1.0339181286549708e-05,
"loss": 1.4574,
"num_tokens": 491869.0,
"step": 441
},
{
"epoch": 0.5011337868480725,
"grad_norm": 21.45746612548828,
"learning_rate": 1.0315789473684213e-05,
"loss": 1.5759,
"num_tokens": 492996.0,
"step": 442
},
{
"epoch": 0.5022675736961452,
"grad_norm": 25.636009216308594,
"learning_rate": 1.0292397660818714e-05,
"loss": 1.5797,
"num_tokens": 494026.0,
"step": 443
},
{
"epoch": 0.5034013605442177,
"grad_norm": 20.464859008789062,
"learning_rate": 1.0269005847953218e-05,
"loss": 1.469,
"num_tokens": 495187.0,
"step": 444
},
{
"epoch": 0.5045351473922902,
"grad_norm": 21.59822654724121,
"learning_rate": 1.024561403508772e-05,
"loss": 1.4327,
"num_tokens": 496377.0,
"step": 445
},
{
"epoch": 0.5056689342403629,
"grad_norm": 22.208587646484375,
"learning_rate": 1.0222222222222223e-05,
"loss": 1.5198,
"num_tokens": 497515.0,
"step": 446
},
{
"epoch": 0.5068027210884354,
"grad_norm": 20.97501564025879,
"learning_rate": 1.0198830409356726e-05,
"loss": 1.3439,
"num_tokens": 498679.0,
"step": 447
},
{
"epoch": 0.5079365079365079,
"grad_norm": 22.33463478088379,
"learning_rate": 1.017543859649123e-05,
"loss": 1.4406,
"num_tokens": 499695.0,
"step": 448
},
{
"epoch": 0.5090702947845805,
"grad_norm": 23.04416275024414,
"learning_rate": 1.015204678362573e-05,
"loss": 1.397,
"num_tokens": 500855.0,
"step": 449
},
{
"epoch": 0.5102040816326531,
"grad_norm": 21.69202995300293,
"learning_rate": 1.0128654970760235e-05,
"loss": 1.5077,
"num_tokens": 501968.0,
"step": 450
},
{
"epoch": 0.5113378684807256,
"grad_norm": 24.2927303314209,
"learning_rate": 1.0105263157894738e-05,
"loss": 1.4229,
"num_tokens": 502939.0,
"step": 451
},
{
"epoch": 0.5124716553287982,
"grad_norm": 22.449220657348633,
"learning_rate": 1.0081871345029242e-05,
"loss": 1.3924,
"num_tokens": 503993.0,
"step": 452
},
{
"epoch": 0.5136054421768708,
"grad_norm": 23.610368728637695,
"learning_rate": 1.0058479532163743e-05,
"loss": 1.4355,
"num_tokens": 505157.0,
"step": 453
},
{
"epoch": 0.5147392290249433,
"grad_norm": 21.155065536499023,
"learning_rate": 1.0035087719298247e-05,
"loss": 1.4111,
"num_tokens": 506313.0,
"step": 454
},
{
"epoch": 0.5158730158730159,
"grad_norm": 23.355241775512695,
"learning_rate": 1.001169590643275e-05,
"loss": 1.4867,
"num_tokens": 507464.0,
"step": 455
},
{
"epoch": 0.5170068027210885,
"grad_norm": 22.317737579345703,
"learning_rate": 9.988304093567252e-06,
"loss": 1.5623,
"num_tokens": 508542.0,
"step": 456
},
{
"epoch": 0.518140589569161,
"grad_norm": 20.3128662109375,
"learning_rate": 9.964912280701755e-06,
"loss": 1.4996,
"num_tokens": 509696.0,
"step": 457
},
{
"epoch": 0.5192743764172335,
"grad_norm": 21.98442268371582,
"learning_rate": 9.941520467836257e-06,
"loss": 1.5096,
"num_tokens": 510844.0,
"step": 458
},
{
"epoch": 0.5204081632653061,
"grad_norm": 22.291845321655273,
"learning_rate": 9.918128654970762e-06,
"loss": 1.5076,
"num_tokens": 511905.0,
"step": 459
},
{
"epoch": 0.5215419501133787,
"grad_norm": 21.11138153076172,
"learning_rate": 9.894736842105264e-06,
"loss": 1.4024,
"num_tokens": 513025.0,
"step": 460
},
{
"epoch": 0.5226757369614512,
"grad_norm": 21.474123001098633,
"learning_rate": 9.871345029239767e-06,
"loss": 1.3739,
"num_tokens": 514096.0,
"step": 461
},
{
"epoch": 0.5238095238095238,
"grad_norm": 22.77983283996582,
"learning_rate": 9.84795321637427e-06,
"loss": 1.537,
"num_tokens": 515114.0,
"step": 462
},
{
"epoch": 0.5249433106575964,
"grad_norm": 22.172109603881836,
"learning_rate": 9.824561403508772e-06,
"loss": 1.4446,
"num_tokens": 516218.0,
"step": 463
},
{
"epoch": 0.5260770975056689,
"grad_norm": 21.138439178466797,
"learning_rate": 9.801169590643276e-06,
"loss": 1.4733,
"num_tokens": 517329.0,
"step": 464
},
{
"epoch": 0.5272108843537415,
"grad_norm": 22.17593765258789,
"learning_rate": 9.777777777777779e-06,
"loss": 1.5316,
"num_tokens": 518435.0,
"step": 465
},
{
"epoch": 0.528344671201814,
"grad_norm": 22.072914123535156,
"learning_rate": 9.754385964912281e-06,
"loss": 1.3249,
"num_tokens": 519509.0,
"step": 466
},
{
"epoch": 0.5294784580498866,
"grad_norm": 22.987707138061523,
"learning_rate": 9.730994152046784e-06,
"loss": 1.5979,
"num_tokens": 520620.0,
"step": 467
},
{
"epoch": 0.5306122448979592,
"grad_norm": 20.49030876159668,
"learning_rate": 9.707602339181286e-06,
"loss": 1.3421,
"num_tokens": 521806.0,
"step": 468
},
{
"epoch": 0.5317460317460317,
"grad_norm": 22.049985885620117,
"learning_rate": 9.68421052631579e-06,
"loss": 1.5731,
"num_tokens": 522945.0,
"step": 469
},
{
"epoch": 0.5328798185941043,
"grad_norm": 21.49173927307129,
"learning_rate": 9.660818713450293e-06,
"loss": 1.4883,
"num_tokens": 524157.0,
"step": 470
},
{
"epoch": 0.5340136054421769,
"grad_norm": 22.7564697265625,
"learning_rate": 9.637426900584796e-06,
"loss": 1.4914,
"num_tokens": 525245.0,
"step": 471
},
{
"epoch": 0.5351473922902494,
"grad_norm": 21.756540298461914,
"learning_rate": 9.614035087719298e-06,
"loss": 1.4791,
"num_tokens": 526398.0,
"step": 472
},
{
"epoch": 0.536281179138322,
"grad_norm": 20.813621520996094,
"learning_rate": 9.590643274853801e-06,
"loss": 1.5602,
"num_tokens": 527591.0,
"step": 473
},
{
"epoch": 0.5374149659863946,
"grad_norm": 20.991981506347656,
"learning_rate": 9.567251461988305e-06,
"loss": 1.4139,
"num_tokens": 528722.0,
"step": 474
},
{
"epoch": 0.5385487528344671,
"grad_norm": 21.36113166809082,
"learning_rate": 9.543859649122808e-06,
"loss": 1.3992,
"num_tokens": 529926.0,
"step": 475
},
{
"epoch": 0.5396825396825397,
"grad_norm": 23.542648315429688,
"learning_rate": 9.52046783625731e-06,
"loss": 1.462,
"num_tokens": 531002.0,
"step": 476
},
{
"epoch": 0.5408163265306123,
"grad_norm": 23.813064575195312,
"learning_rate": 9.497076023391813e-06,
"loss": 1.5113,
"num_tokens": 532063.0,
"step": 477
},
{
"epoch": 0.5419501133786848,
"grad_norm": 23.0479736328125,
"learning_rate": 9.473684210526315e-06,
"loss": 1.3654,
"num_tokens": 533122.0,
"step": 478
},
{
"epoch": 0.5430839002267573,
"grad_norm": 23.56155014038086,
"learning_rate": 9.45029239766082e-06,
"loss": 1.4703,
"num_tokens": 534354.0,
"step": 479
},
{
"epoch": 0.54421768707483,
"grad_norm": 21.863500595092773,
"learning_rate": 9.426900584795322e-06,
"loss": 1.3612,
"num_tokens": 535447.0,
"step": 480
},
{
"epoch": 0.5453514739229025,
"grad_norm": 20.762577056884766,
"learning_rate": 9.403508771929825e-06,
"loss": 1.5177,
"num_tokens": 536573.0,
"step": 481
},
{
"epoch": 0.546485260770975,
"grad_norm": 21.422998428344727,
"learning_rate": 9.380116959064327e-06,
"loss": 1.435,
"num_tokens": 537707.0,
"step": 482
},
{
"epoch": 0.5476190476190477,
"grad_norm": 21.787960052490234,
"learning_rate": 9.35672514619883e-06,
"loss": 1.5013,
"num_tokens": 538818.0,
"step": 483
},
{
"epoch": 0.5487528344671202,
"grad_norm": 23.668500900268555,
"learning_rate": 9.333333333333334e-06,
"loss": 1.5828,
"num_tokens": 539908.0,
"step": 484
},
{
"epoch": 0.5498866213151927,
"grad_norm": 22.385894775390625,
"learning_rate": 9.309941520467837e-06,
"loss": 1.4457,
"num_tokens": 541045.0,
"step": 485
},
{
"epoch": 0.5510204081632653,
"grad_norm": 20.410974502563477,
"learning_rate": 9.28654970760234e-06,
"loss": 1.4135,
"num_tokens": 542124.0,
"step": 486
},
{
"epoch": 0.5521541950113379,
"grad_norm": 22.673994064331055,
"learning_rate": 9.263157894736842e-06,
"loss": 1.4219,
"num_tokens": 543220.0,
"step": 487
},
{
"epoch": 0.5532879818594104,
"grad_norm": 19.593944549560547,
"learning_rate": 9.239766081871345e-06,
"loss": 1.4269,
"num_tokens": 544354.0,
"step": 488
},
{
"epoch": 0.5544217687074829,
"grad_norm": 23.19577407836914,
"learning_rate": 9.216374269005849e-06,
"loss": 1.5597,
"num_tokens": 545379.0,
"step": 489
},
{
"epoch": 0.5555555555555556,
"grad_norm": 20.372406005859375,
"learning_rate": 9.192982456140351e-06,
"loss": 1.3499,
"num_tokens": 546460.0,
"step": 490
},
{
"epoch": 0.5566893424036281,
"grad_norm": 18.678747177124023,
"learning_rate": 9.169590643274856e-06,
"loss": 1.3365,
"num_tokens": 547604.0,
"step": 491
},
{
"epoch": 0.5578231292517006,
"grad_norm": 23.01070785522461,
"learning_rate": 9.146198830409357e-06,
"loss": 1.4461,
"num_tokens": 548661.0,
"step": 492
},
{
"epoch": 0.5589569160997733,
"grad_norm": 23.752355575561523,
"learning_rate": 9.12280701754386e-06,
"loss": 1.5772,
"num_tokens": 549763.0,
"step": 493
},
{
"epoch": 0.5600907029478458,
"grad_norm": 22.22798728942871,
"learning_rate": 9.099415204678363e-06,
"loss": 1.3951,
"num_tokens": 550930.0,
"step": 494
},
{
"epoch": 0.5612244897959183,
"grad_norm": 21.435941696166992,
"learning_rate": 9.076023391812866e-06,
"loss": 1.5268,
"num_tokens": 552031.0,
"step": 495
},
{
"epoch": 0.562358276643991,
"grad_norm": 22.89337730407715,
"learning_rate": 9.05263157894737e-06,
"loss": 1.4932,
"num_tokens": 553188.0,
"step": 496
},
{
"epoch": 0.5634920634920635,
"grad_norm": 20.6674747467041,
"learning_rate": 9.029239766081873e-06,
"loss": 1.3984,
"num_tokens": 554282.0,
"step": 497
},
{
"epoch": 0.564625850340136,
"grad_norm": 20.50768280029297,
"learning_rate": 9.005847953216374e-06,
"loss": 1.4644,
"num_tokens": 555422.0,
"step": 498
},
{
"epoch": 0.5657596371882087,
"grad_norm": 20.704288482666016,
"learning_rate": 8.982456140350878e-06,
"loss": 1.6067,
"num_tokens": 556575.0,
"step": 499
},
{
"epoch": 0.5668934240362812,
"grad_norm": 22.014612197875977,
"learning_rate": 8.95906432748538e-06,
"loss": 1.5485,
"num_tokens": 557680.0,
"step": 500
},
{
"epoch": 0.5680272108843537,
"grad_norm": 23.033823013305664,
"learning_rate": 8.935672514619885e-06,
"loss": 1.4399,
"num_tokens": 558791.0,
"step": 501
},
{
"epoch": 0.5691609977324263,
"grad_norm": 22.817472457885742,
"learning_rate": 8.912280701754387e-06,
"loss": 1.239,
"num_tokens": 559835.0,
"step": 502
},
{
"epoch": 0.5702947845804989,
"grad_norm": 21.822277069091797,
"learning_rate": 8.888888888888888e-06,
"loss": 1.4371,
"num_tokens": 560975.0,
"step": 503
},
{
"epoch": 0.5714285714285714,
"grad_norm": 22.173824310302734,
"learning_rate": 8.865497076023393e-06,
"loss": 1.3944,
"num_tokens": 562132.0,
"step": 504
},
{
"epoch": 0.572562358276644,
"grad_norm": 22.091264724731445,
"learning_rate": 8.842105263157895e-06,
"loss": 1.3903,
"num_tokens": 563198.0,
"step": 505
},
{
"epoch": 0.5736961451247166,
"grad_norm": 20.748226165771484,
"learning_rate": 8.8187134502924e-06,
"loss": 1.3665,
"num_tokens": 564323.0,
"step": 506
},
{
"epoch": 0.5748299319727891,
"grad_norm": 25.668231964111328,
"learning_rate": 8.795321637426902e-06,
"loss": 1.6003,
"num_tokens": 565410.0,
"step": 507
},
{
"epoch": 0.5759637188208617,
"grad_norm": 21.842506408691406,
"learning_rate": 8.771929824561405e-06,
"loss": 1.4388,
"num_tokens": 566583.0,
"step": 508
},
{
"epoch": 0.5770975056689343,
"grad_norm": 21.225013732910156,
"learning_rate": 8.748538011695907e-06,
"loss": 1.3724,
"num_tokens": 567705.0,
"step": 509
},
{
"epoch": 0.5782312925170068,
"grad_norm": 22.67068862915039,
"learning_rate": 8.72514619883041e-06,
"loss": 1.3551,
"num_tokens": 568800.0,
"step": 510
},
{
"epoch": 0.5793650793650794,
"grad_norm": 21.34926986694336,
"learning_rate": 8.701754385964914e-06,
"loss": 1.3908,
"num_tokens": 569913.0,
"step": 511
},
{
"epoch": 0.5804988662131519,
"grad_norm": 22.283945083618164,
"learning_rate": 8.678362573099417e-06,
"loss": 1.4563,
"num_tokens": 571017.0,
"step": 512
},
{
"epoch": 0.5816326530612245,
"grad_norm": 20.993438720703125,
"learning_rate": 8.654970760233919e-06,
"loss": 1.3512,
"num_tokens": 572126.0,
"step": 513
},
{
"epoch": 0.5827664399092971,
"grad_norm": 22.38656234741211,
"learning_rate": 8.631578947368422e-06,
"loss": 1.5402,
"num_tokens": 573264.0,
"step": 514
},
{
"epoch": 0.5839002267573696,
"grad_norm": 23.537073135375977,
"learning_rate": 8.608187134502924e-06,
"loss": 1.3946,
"num_tokens": 574289.0,
"step": 515
},
{
"epoch": 0.5850340136054422,
"grad_norm": 22.79361915588379,
"learning_rate": 8.584795321637429e-06,
"loss": 1.5559,
"num_tokens": 575350.0,
"step": 516
},
{
"epoch": 0.5861678004535147,
"grad_norm": 21.405969619750977,
"learning_rate": 8.561403508771931e-06,
"loss": 1.4803,
"num_tokens": 576453.0,
"step": 517
},
{
"epoch": 0.5873015873015873,
"grad_norm": 23.284671783447266,
"learning_rate": 8.538011695906434e-06,
"loss": 1.5008,
"num_tokens": 577497.0,
"step": 518
},
{
"epoch": 0.5884353741496599,
"grad_norm": 21.37405776977539,
"learning_rate": 8.514619883040936e-06,
"loss": 1.4739,
"num_tokens": 578563.0,
"step": 519
},
{
"epoch": 0.5895691609977324,
"grad_norm": 24.321304321289062,
"learning_rate": 8.491228070175439e-06,
"loss": 1.5819,
"num_tokens": 579721.0,
"step": 520
},
{
"epoch": 0.590702947845805,
"grad_norm": 21.786588668823242,
"learning_rate": 8.467836257309943e-06,
"loss": 1.4846,
"num_tokens": 580835.0,
"step": 521
},
{
"epoch": 0.5918367346938775,
"grad_norm": 22.301950454711914,
"learning_rate": 8.444444444444446e-06,
"loss": 1.4472,
"num_tokens": 582031.0,
"step": 522
},
{
"epoch": 0.5929705215419501,
"grad_norm": 22.732772827148438,
"learning_rate": 8.421052631578948e-06,
"loss": 1.4803,
"num_tokens": 583095.0,
"step": 523
},
{
"epoch": 0.5941043083900227,
"grad_norm": 21.661388397216797,
"learning_rate": 8.39766081871345e-06,
"loss": 1.3937,
"num_tokens": 584233.0,
"step": 524
},
{
"epoch": 0.5952380952380952,
"grad_norm": 20.343976974487305,
"learning_rate": 8.374269005847953e-06,
"loss": 1.353,
"num_tokens": 585302.0,
"step": 525
},
{
"epoch": 0.5963718820861678,
"grad_norm": 22.84857940673828,
"learning_rate": 8.350877192982458e-06,
"loss": 1.4354,
"num_tokens": 586371.0,
"step": 526
},
{
"epoch": 0.5975056689342404,
"grad_norm": 22.94053077697754,
"learning_rate": 8.32748538011696e-06,
"loss": 1.5107,
"num_tokens": 587512.0,
"step": 527
},
{
"epoch": 0.5986394557823129,
"grad_norm": 22.8999080657959,
"learning_rate": 8.304093567251463e-06,
"loss": 1.6091,
"num_tokens": 588596.0,
"step": 528
},
{
"epoch": 0.5997732426303855,
"grad_norm": 23.222049713134766,
"learning_rate": 8.280701754385965e-06,
"loss": 1.4075,
"num_tokens": 589699.0,
"step": 529
},
{
"epoch": 0.6009070294784581,
"grad_norm": 20.573713302612305,
"learning_rate": 8.257309941520468e-06,
"loss": 1.5905,
"num_tokens": 590891.0,
"step": 530
},
{
"epoch": 0.6020408163265306,
"grad_norm": 21.398988723754883,
"learning_rate": 8.233918128654972e-06,
"loss": 1.4535,
"num_tokens": 592028.0,
"step": 531
},
{
"epoch": 0.6031746031746031,
"grad_norm": 22.95913314819336,
"learning_rate": 8.210526315789475e-06,
"loss": 1.371,
"num_tokens": 593213.0,
"step": 532
},
{
"epoch": 0.6043083900226758,
"grad_norm": 21.470468521118164,
"learning_rate": 8.187134502923977e-06,
"loss": 1.6771,
"num_tokens": 594442.0,
"step": 533
},
{
"epoch": 0.6054421768707483,
"grad_norm": 22.546823501586914,
"learning_rate": 8.16374269005848e-06,
"loss": 1.5234,
"num_tokens": 595601.0,
"step": 534
},
{
"epoch": 0.6065759637188208,
"grad_norm": 25.772891998291016,
"learning_rate": 8.140350877192983e-06,
"loss": 1.4929,
"num_tokens": 596802.0,
"step": 535
},
{
"epoch": 0.6077097505668935,
"grad_norm": 22.497135162353516,
"learning_rate": 8.116959064327487e-06,
"loss": 1.4492,
"num_tokens": 597920.0,
"step": 536
},
{
"epoch": 0.608843537414966,
"grad_norm": 24.795047760009766,
"learning_rate": 8.09356725146199e-06,
"loss": 1.3766,
"num_tokens": 598998.0,
"step": 537
},
{
"epoch": 0.6099773242630385,
"grad_norm": 20.771310806274414,
"learning_rate": 8.070175438596492e-06,
"loss": 1.6282,
"num_tokens": 600212.0,
"step": 538
},
{
"epoch": 0.6111111111111112,
"grad_norm": 24.573957443237305,
"learning_rate": 8.046783625730994e-06,
"loss": 1.3345,
"num_tokens": 601248.0,
"step": 539
},
{
"epoch": 0.6122448979591837,
"grad_norm": 20.914003372192383,
"learning_rate": 8.023391812865497e-06,
"loss": 1.4553,
"num_tokens": 602367.0,
"step": 540
},
{
"epoch": 0.6133786848072562,
"grad_norm": 22.030210494995117,
"learning_rate": 8.000000000000001e-06,
"loss": 1.4759,
"num_tokens": 603524.0,
"step": 541
},
{
"epoch": 0.6145124716553289,
"grad_norm": 22.657468795776367,
"learning_rate": 7.976608187134504e-06,
"loss": 1.4916,
"num_tokens": 604618.0,
"step": 542
},
{
"epoch": 0.6156462585034014,
"grad_norm": 23.671598434448242,
"learning_rate": 7.953216374269006e-06,
"loss": 1.3016,
"num_tokens": 605667.0,
"step": 543
},
{
"epoch": 0.6167800453514739,
"grad_norm": 22.971860885620117,
"learning_rate": 7.929824561403509e-06,
"loss": 1.5056,
"num_tokens": 606768.0,
"step": 544
},
{
"epoch": 0.6179138321995464,
"grad_norm": 22.38102912902832,
"learning_rate": 7.906432748538012e-06,
"loss": 1.5457,
"num_tokens": 607836.0,
"step": 545
},
{
"epoch": 0.6190476190476191,
"grad_norm": 21.6705379486084,
"learning_rate": 7.883040935672516e-06,
"loss": 1.4587,
"num_tokens": 608915.0,
"step": 546
},
{
"epoch": 0.6201814058956916,
"grad_norm": 22.290454864501953,
"learning_rate": 7.859649122807018e-06,
"loss": 1.5118,
"num_tokens": 609979.0,
"step": 547
},
{
"epoch": 0.6213151927437641,
"grad_norm": 22.28474998474121,
"learning_rate": 7.836257309941521e-06,
"loss": 1.5042,
"num_tokens": 611160.0,
"step": 548
},
{
"epoch": 0.6224489795918368,
"grad_norm": 23.13262176513672,
"learning_rate": 7.812865497076024e-06,
"loss": 1.4265,
"num_tokens": 612235.0,
"step": 549
},
{
"epoch": 0.6235827664399093,
"grad_norm": 25.128522872924805,
"learning_rate": 7.789473684210526e-06,
"loss": 1.6554,
"num_tokens": 613310.0,
"step": 550
},
{
"epoch": 0.6247165532879818,
"grad_norm": 20.930503845214844,
"learning_rate": 7.76608187134503e-06,
"loss": 1.4841,
"num_tokens": 614403.0,
"step": 551
},
{
"epoch": 0.6258503401360545,
"grad_norm": 21.933982849121094,
"learning_rate": 7.742690058479533e-06,
"loss": 1.4161,
"num_tokens": 615531.0,
"step": 552
},
{
"epoch": 0.626984126984127,
"grad_norm": 20.2902889251709,
"learning_rate": 7.719298245614036e-06,
"loss": 1.4999,
"num_tokens": 616780.0,
"step": 553
},
{
"epoch": 0.6281179138321995,
"grad_norm": 20.942724227905273,
"learning_rate": 7.695906432748538e-06,
"loss": 1.3684,
"num_tokens": 617962.0,
"step": 554
},
{
"epoch": 0.6292517006802721,
"grad_norm": 21.612571716308594,
"learning_rate": 7.67251461988304e-06,
"loss": 1.5484,
"num_tokens": 619151.0,
"step": 555
},
{
"epoch": 0.6303854875283447,
"grad_norm": 22.789594650268555,
"learning_rate": 7.649122807017545e-06,
"loss": 1.4786,
"num_tokens": 620263.0,
"step": 556
},
{
"epoch": 0.6315192743764172,
"grad_norm": 20.90570640563965,
"learning_rate": 7.625730994152048e-06,
"loss": 1.6095,
"num_tokens": 621484.0,
"step": 557
},
{
"epoch": 0.6326530612244898,
"grad_norm": 25.00489616394043,
"learning_rate": 7.60233918128655e-06,
"loss": 1.6409,
"num_tokens": 622514.0,
"step": 558
},
{
"epoch": 0.6337868480725624,
"grad_norm": 20.243322372436523,
"learning_rate": 7.578947368421054e-06,
"loss": 1.5188,
"num_tokens": 623693.0,
"step": 559
},
{
"epoch": 0.6349206349206349,
"grad_norm": 22.97846221923828,
"learning_rate": 7.555555555555556e-06,
"loss": 1.5657,
"num_tokens": 624782.0,
"step": 560
},
{
"epoch": 0.6360544217687075,
"grad_norm": 24.399961471557617,
"learning_rate": 7.5321637426900596e-06,
"loss": 1.5053,
"num_tokens": 625875.0,
"step": 561
},
{
"epoch": 0.63718820861678,
"grad_norm": 20.759233474731445,
"learning_rate": 7.508771929824562e-06,
"loss": 1.5293,
"num_tokens": 627019.0,
"step": 562
},
{
"epoch": 0.6383219954648526,
"grad_norm": 21.16717529296875,
"learning_rate": 7.485380116959065e-06,
"loss": 1.3894,
"num_tokens": 628139.0,
"step": 563
},
{
"epoch": 0.6394557823129252,
"grad_norm": 22.11092758178711,
"learning_rate": 7.461988304093568e-06,
"loss": 1.4995,
"num_tokens": 629274.0,
"step": 564
},
{
"epoch": 0.6405895691609977,
"grad_norm": 27.72738265991211,
"learning_rate": 7.438596491228071e-06,
"loss": 1.3619,
"num_tokens": 630243.0,
"step": 565
},
{
"epoch": 0.6417233560090703,
"grad_norm": 20.804826736450195,
"learning_rate": 7.415204678362574e-06,
"loss": 1.4518,
"num_tokens": 631375.0,
"step": 566
},
{
"epoch": 0.6428571428571429,
"grad_norm": 21.256473541259766,
"learning_rate": 7.391812865497077e-06,
"loss": 1.4972,
"num_tokens": 632486.0,
"step": 567
},
{
"epoch": 0.6439909297052154,
"grad_norm": 22.026762008666992,
"learning_rate": 7.368421052631579e-06,
"loss": 1.3551,
"num_tokens": 633618.0,
"step": 568
},
{
"epoch": 0.645124716553288,
"grad_norm": 21.86498260498047,
"learning_rate": 7.345029239766083e-06,
"loss": 1.5152,
"num_tokens": 634700.0,
"step": 569
},
{
"epoch": 0.6462585034013606,
"grad_norm": 22.414583206176758,
"learning_rate": 7.321637426900585e-06,
"loss": 1.3151,
"num_tokens": 635812.0,
"step": 570
},
{
"epoch": 0.6473922902494331,
"grad_norm": 21.637290954589844,
"learning_rate": 7.298245614035089e-06,
"loss": 1.4313,
"num_tokens": 636943.0,
"step": 571
},
{
"epoch": 0.6485260770975056,
"grad_norm": 22.154354095458984,
"learning_rate": 7.274853801169591e-06,
"loss": 1.3439,
"num_tokens": 638032.0,
"step": 572
},
{
"epoch": 0.6496598639455783,
"grad_norm": 24.491914749145508,
"learning_rate": 7.251461988304094e-06,
"loss": 1.5135,
"num_tokens": 639185.0,
"step": 573
},
{
"epoch": 0.6507936507936508,
"grad_norm": 23.253528594970703,
"learning_rate": 7.228070175438597e-06,
"loss": 1.4544,
"num_tokens": 640432.0,
"step": 574
},
{
"epoch": 0.6519274376417233,
"grad_norm": 22.823341369628906,
"learning_rate": 7.2046783625731e-06,
"loss": 1.326,
"num_tokens": 641575.0,
"step": 575
},
{
"epoch": 0.6530612244897959,
"grad_norm": 22.95077896118164,
"learning_rate": 7.181286549707603e-06,
"loss": 1.6407,
"num_tokens": 642791.0,
"step": 576
},
{
"epoch": 0.6541950113378685,
"grad_norm": 22.988304138183594,
"learning_rate": 7.157894736842106e-06,
"loss": 1.4079,
"num_tokens": 643851.0,
"step": 577
},
{
"epoch": 0.655328798185941,
"grad_norm": 22.63962745666504,
"learning_rate": 7.134502923976608e-06,
"loss": 1.3522,
"num_tokens": 644917.0,
"step": 578
},
{
"epoch": 0.6564625850340136,
"grad_norm": 26.570436477661133,
"learning_rate": 7.111111111111112e-06,
"loss": 1.5682,
"num_tokens": 645970.0,
"step": 579
},
{
"epoch": 0.6575963718820862,
"grad_norm": 22.55550765991211,
"learning_rate": 7.087719298245614e-06,
"loss": 1.4318,
"num_tokens": 647052.0,
"step": 580
},
{
"epoch": 0.6587301587301587,
"grad_norm": 22.61783218383789,
"learning_rate": 7.064327485380118e-06,
"loss": 1.4444,
"num_tokens": 648119.0,
"step": 581
},
{
"epoch": 0.6598639455782312,
"grad_norm": 22.553382873535156,
"learning_rate": 7.04093567251462e-06,
"loss": 1.359,
"num_tokens": 649162.0,
"step": 582
},
{
"epoch": 0.6609977324263039,
"grad_norm": 21.678308486938477,
"learning_rate": 7.017543859649123e-06,
"loss": 1.3491,
"num_tokens": 650283.0,
"step": 583
},
{
"epoch": 0.6621315192743764,
"grad_norm": 21.3242130279541,
"learning_rate": 6.994152046783626e-06,
"loss": 1.4176,
"num_tokens": 651440.0,
"step": 584
},
{
"epoch": 0.6632653061224489,
"grad_norm": 24.34210777282715,
"learning_rate": 6.970760233918129e-06,
"loss": 1.5172,
"num_tokens": 652549.0,
"step": 585
},
{
"epoch": 0.6643990929705216,
"grad_norm": 24.187849044799805,
"learning_rate": 6.947368421052632e-06,
"loss": 1.5697,
"num_tokens": 653590.0,
"step": 586
},
{
"epoch": 0.6655328798185941,
"grad_norm": 22.247058868408203,
"learning_rate": 6.923976608187135e-06,
"loss": 1.3541,
"num_tokens": 654714.0,
"step": 587
},
{
"epoch": 0.6666666666666666,
"grad_norm": 21.859542846679688,
"learning_rate": 6.9005847953216375e-06,
"loss": 1.492,
"num_tokens": 655964.0,
"step": 588
},
{
"epoch": 0.6678004535147393,
"grad_norm": 22.576213836669922,
"learning_rate": 6.877192982456141e-06,
"loss": 1.4299,
"num_tokens": 657065.0,
"step": 589
},
{
"epoch": 0.6689342403628118,
"grad_norm": 22.9122257232666,
"learning_rate": 6.8538011695906435e-06,
"loss": 1.4698,
"num_tokens": 658255.0,
"step": 590
},
{
"epoch": 0.6700680272108843,
"grad_norm": 23.04901123046875,
"learning_rate": 6.830409356725147e-06,
"loss": 1.5064,
"num_tokens": 659325.0,
"step": 591
},
{
"epoch": 0.671201814058957,
"grad_norm": 23.730091094970703,
"learning_rate": 6.8070175438596495e-06,
"loss": 1.4925,
"num_tokens": 660485.0,
"step": 592
},
{
"epoch": 0.6723356009070295,
"grad_norm": 22.81539535522461,
"learning_rate": 6.783625730994152e-06,
"loss": 1.3731,
"num_tokens": 661513.0,
"step": 593
},
{
"epoch": 0.673469387755102,
"grad_norm": 23.25772476196289,
"learning_rate": 6.7602339181286555e-06,
"loss": 1.4893,
"num_tokens": 662618.0,
"step": 594
},
{
"epoch": 0.6746031746031746,
"grad_norm": 23.883750915527344,
"learning_rate": 6.736842105263158e-06,
"loss": 1.2276,
"num_tokens": 663737.0,
"step": 595
},
{
"epoch": 0.6757369614512472,
"grad_norm": 23.94465446472168,
"learning_rate": 6.7134502923976615e-06,
"loss": 1.4642,
"num_tokens": 664857.0,
"step": 596
},
{
"epoch": 0.6768707482993197,
"grad_norm": 22.831388473510742,
"learning_rate": 6.690058479532164e-06,
"loss": 1.4457,
"num_tokens": 665954.0,
"step": 597
},
{
"epoch": 0.6780045351473923,
"grad_norm": 22.41533851623535,
"learning_rate": 6.666666666666667e-06,
"loss": 1.5384,
"num_tokens": 667045.0,
"step": 598
},
{
"epoch": 0.6791383219954649,
"grad_norm": 21.215373992919922,
"learning_rate": 6.64327485380117e-06,
"loss": 1.5598,
"num_tokens": 668272.0,
"step": 599
},
{
"epoch": 0.6802721088435374,
"grad_norm": 21.893632888793945,
"learning_rate": 6.619883040935673e-06,
"loss": 1.5232,
"num_tokens": 669382.0,
"step": 600
},
{
"epoch": 0.68140589569161,
"grad_norm": 22.526578903198242,
"learning_rate": 6.596491228070177e-06,
"loss": 1.5125,
"num_tokens": 670453.0,
"step": 601
},
{
"epoch": 0.6825396825396826,
"grad_norm": 22.798067092895508,
"learning_rate": 6.573099415204679e-06,
"loss": 1.4695,
"num_tokens": 671545.0,
"step": 602
},
{
"epoch": 0.6836734693877551,
"grad_norm": 21.24146842956543,
"learning_rate": 6.549707602339181e-06,
"loss": 1.46,
"num_tokens": 672697.0,
"step": 603
},
{
"epoch": 0.6848072562358276,
"grad_norm": 24.16074562072754,
"learning_rate": 6.526315789473685e-06,
"loss": 1.4916,
"num_tokens": 673798.0,
"step": 604
},
{
"epoch": 0.6859410430839002,
"grad_norm": 21.984033584594727,
"learning_rate": 6.502923976608187e-06,
"loss": 1.3227,
"num_tokens": 674822.0,
"step": 605
},
{
"epoch": 0.6870748299319728,
"grad_norm": 23.14649772644043,
"learning_rate": 6.4795321637426915e-06,
"loss": 1.5123,
"num_tokens": 675953.0,
"step": 606
},
{
"epoch": 0.6882086167800453,
"grad_norm": 21.985637664794922,
"learning_rate": 6.456140350877193e-06,
"loss": 1.5573,
"num_tokens": 677069.0,
"step": 607
},
{
"epoch": 0.6893424036281179,
"grad_norm": 22.13504409790039,
"learning_rate": 6.432748538011696e-06,
"loss": 1.5341,
"num_tokens": 678207.0,
"step": 608
},
{
"epoch": 0.6904761904761905,
"grad_norm": 22.876258850097656,
"learning_rate": 6.4093567251462e-06,
"loss": 1.5077,
"num_tokens": 679333.0,
"step": 609
},
{
"epoch": 0.691609977324263,
"grad_norm": 22.648876190185547,
"learning_rate": 6.385964912280702e-06,
"loss": 1.5165,
"num_tokens": 680472.0,
"step": 610
},
{
"epoch": 0.6927437641723356,
"grad_norm": 24.44489860534668,
"learning_rate": 6.362573099415206e-06,
"loss": 1.4595,
"num_tokens": 681550.0,
"step": 611
},
{
"epoch": 0.6938775510204082,
"grad_norm": 23.062849044799805,
"learning_rate": 6.339181286549709e-06,
"loss": 1.5479,
"num_tokens": 682665.0,
"step": 612
},
{
"epoch": 0.6950113378684807,
"grad_norm": 25.6104793548584,
"learning_rate": 6.31578947368421e-06,
"loss": 1.4372,
"num_tokens": 683797.0,
"step": 613
},
{
"epoch": 0.6961451247165533,
"grad_norm": 22.21469497680664,
"learning_rate": 6.292397660818715e-06,
"loss": 1.4886,
"num_tokens": 684941.0,
"step": 614
},
{
"epoch": 0.6972789115646258,
"grad_norm": 22.583446502685547,
"learning_rate": 6.269005847953217e-06,
"loss": 1.4647,
"num_tokens": 686074.0,
"step": 615
},
{
"epoch": 0.6984126984126984,
"grad_norm": 23.686786651611328,
"learning_rate": 6.245614035087721e-06,
"loss": 1.3958,
"num_tokens": 687271.0,
"step": 616
},
{
"epoch": 0.699546485260771,
"grad_norm": 23.05241584777832,
"learning_rate": 6.222222222222223e-06,
"loss": 1.4258,
"num_tokens": 688341.0,
"step": 617
},
{
"epoch": 0.7006802721088435,
"grad_norm": 23.340354919433594,
"learning_rate": 6.198830409356725e-06,
"loss": 1.5196,
"num_tokens": 689435.0,
"step": 618
},
{
"epoch": 0.7018140589569161,
"grad_norm": 22.070066452026367,
"learning_rate": 6.175438596491229e-06,
"loss": 1.3828,
"num_tokens": 690569.0,
"step": 619
},
{
"epoch": 0.7029478458049887,
"grad_norm": 23.374649047851562,
"learning_rate": 6.152046783625732e-06,
"loss": 1.4309,
"num_tokens": 691672.0,
"step": 620
},
{
"epoch": 0.7040816326530612,
"grad_norm": 23.68497085571289,
"learning_rate": 6.128654970760235e-06,
"loss": 1.4063,
"num_tokens": 692947.0,
"step": 621
},
{
"epoch": 0.7052154195011338,
"grad_norm": 22.70351791381836,
"learning_rate": 6.105263157894738e-06,
"loss": 1.4569,
"num_tokens": 694052.0,
"step": 622
},
{
"epoch": 0.7063492063492064,
"grad_norm": 21.42176055908203,
"learning_rate": 6.08187134502924e-06,
"loss": 1.4617,
"num_tokens": 695248.0,
"step": 623
},
{
"epoch": 0.7074829931972789,
"grad_norm": 22.26094627380371,
"learning_rate": 6.058479532163744e-06,
"loss": 1.516,
"num_tokens": 696356.0,
"step": 624
},
{
"epoch": 0.7086167800453514,
"grad_norm": 26.08213233947754,
"learning_rate": 6.035087719298246e-06,
"loss": 1.4949,
"num_tokens": 697517.0,
"step": 625
},
{
"epoch": 0.7097505668934241,
"grad_norm": 22.348121643066406,
"learning_rate": 6.01169590643275e-06,
"loss": 1.4339,
"num_tokens": 698582.0,
"step": 626
},
{
"epoch": 0.7108843537414966,
"grad_norm": 22.22833251953125,
"learning_rate": 5.988304093567252e-06,
"loss": 1.4808,
"num_tokens": 699745.0,
"step": 627
},
{
"epoch": 0.7120181405895691,
"grad_norm": 23.78014373779297,
"learning_rate": 5.964912280701755e-06,
"loss": 1.4515,
"num_tokens": 700778.0,
"step": 628
},
{
"epoch": 0.7131519274376418,
"grad_norm": 23.705251693725586,
"learning_rate": 5.941520467836258e-06,
"loss": 1.4206,
"num_tokens": 701913.0,
"step": 629
},
{
"epoch": 0.7142857142857143,
"grad_norm": 20.9232234954834,
"learning_rate": 5.918128654970761e-06,
"loss": 1.337,
"num_tokens": 703034.0,
"step": 630
},
{
"epoch": 0.7154195011337868,
"grad_norm": 20.941648483276367,
"learning_rate": 5.8947368421052634e-06,
"loss": 1.3962,
"num_tokens": 704183.0,
"step": 631
},
{
"epoch": 0.7165532879818595,
"grad_norm": 21.786916732788086,
"learning_rate": 5.871345029239767e-06,
"loss": 1.3946,
"num_tokens": 705285.0,
"step": 632
},
{
"epoch": 0.717687074829932,
"grad_norm": 24.243087768554688,
"learning_rate": 5.847953216374269e-06,
"loss": 1.4448,
"num_tokens": 706394.0,
"step": 633
},
{
"epoch": 0.7188208616780045,
"grad_norm": 24.048946380615234,
"learning_rate": 5.824561403508773e-06,
"loss": 1.5736,
"num_tokens": 707507.0,
"step": 634
},
{
"epoch": 0.719954648526077,
"grad_norm": 20.490083694458008,
"learning_rate": 5.801169590643275e-06,
"loss": 1.531,
"num_tokens": 708763.0,
"step": 635
},
{
"epoch": 0.7210884353741497,
"grad_norm": 23.876779556274414,
"learning_rate": 5.777777777777778e-06,
"loss": 1.4557,
"num_tokens": 709792.0,
"step": 636
},
{
"epoch": 0.7222222222222222,
"grad_norm": 24.66008186340332,
"learning_rate": 5.754385964912281e-06,
"loss": 1.517,
"num_tokens": 710874.0,
"step": 637
},
{
"epoch": 0.7233560090702947,
"grad_norm": 20.79994010925293,
"learning_rate": 5.730994152046784e-06,
"loss": 1.3886,
"num_tokens": 712053.0,
"step": 638
},
{
"epoch": 0.7244897959183674,
"grad_norm": 23.969280242919922,
"learning_rate": 5.707602339181287e-06,
"loss": 1.3954,
"num_tokens": 713114.0,
"step": 639
},
{
"epoch": 0.7256235827664399,
"grad_norm": 23.537185668945312,
"learning_rate": 5.68421052631579e-06,
"loss": 1.3893,
"num_tokens": 714201.0,
"step": 640
},
{
"epoch": 0.7267573696145124,
"grad_norm": 22.46196937561035,
"learning_rate": 5.6608187134502925e-06,
"loss": 1.5846,
"num_tokens": 715281.0,
"step": 641
},
{
"epoch": 0.7278911564625851,
"grad_norm": 20.890539169311523,
"learning_rate": 5.637426900584796e-06,
"loss": 1.4379,
"num_tokens": 716404.0,
"step": 642
},
{
"epoch": 0.7290249433106576,
"grad_norm": 21.008331298828125,
"learning_rate": 5.6140350877192985e-06,
"loss": 1.4799,
"num_tokens": 717488.0,
"step": 643
},
{
"epoch": 0.7301587301587301,
"grad_norm": 22.868465423583984,
"learning_rate": 5.590643274853802e-06,
"loss": 1.484,
"num_tokens": 718571.0,
"step": 644
},
{
"epoch": 0.7312925170068028,
"grad_norm": 23.07869529724121,
"learning_rate": 5.5672514619883045e-06,
"loss": 1.484,
"num_tokens": 719716.0,
"step": 645
},
{
"epoch": 0.7324263038548753,
"grad_norm": 24.570358276367188,
"learning_rate": 5.543859649122807e-06,
"loss": 1.5601,
"num_tokens": 720905.0,
"step": 646
},
{
"epoch": 0.7335600907029478,
"grad_norm": 26.3187255859375,
"learning_rate": 5.5204678362573105e-06,
"loss": 1.5891,
"num_tokens": 722076.0,
"step": 647
},
{
"epoch": 0.7346938775510204,
"grad_norm": 22.848508834838867,
"learning_rate": 5.497076023391813e-06,
"loss": 1.3089,
"num_tokens": 723183.0,
"step": 648
},
{
"epoch": 0.735827664399093,
"grad_norm": 22.79510498046875,
"learning_rate": 5.4736842105263165e-06,
"loss": 1.5494,
"num_tokens": 724329.0,
"step": 649
},
{
"epoch": 0.7369614512471655,
"grad_norm": 20.732099533081055,
"learning_rate": 5.450292397660819e-06,
"loss": 1.4599,
"num_tokens": 725447.0,
"step": 650
},
{
"epoch": 0.7380952380952381,
"grad_norm": 22.604984283447266,
"learning_rate": 5.426900584795322e-06,
"loss": 1.3155,
"num_tokens": 726477.0,
"step": 651
},
{
"epoch": 0.7392290249433107,
"grad_norm": 21.90055274963379,
"learning_rate": 5.403508771929825e-06,
"loss": 1.5524,
"num_tokens": 727606.0,
"step": 652
},
{
"epoch": 0.7403628117913832,
"grad_norm": 21.966693878173828,
"learning_rate": 5.380116959064328e-06,
"loss": 1.5046,
"num_tokens": 728718.0,
"step": 653
},
{
"epoch": 0.7414965986394558,
"grad_norm": 25.208890914916992,
"learning_rate": 5.356725146198831e-06,
"loss": 1.6362,
"num_tokens": 729745.0,
"step": 654
},
{
"epoch": 0.7426303854875284,
"grad_norm": 24.090883255004883,
"learning_rate": 5.333333333333334e-06,
"loss": 1.3422,
"num_tokens": 730924.0,
"step": 655
},
{
"epoch": 0.7437641723356009,
"grad_norm": 22.79339599609375,
"learning_rate": 5.309941520467836e-06,
"loss": 1.3421,
"num_tokens": 731980.0,
"step": 656
},
{
"epoch": 0.7448979591836735,
"grad_norm": 23.833890914916992,
"learning_rate": 5.28654970760234e-06,
"loss": 1.5008,
"num_tokens": 733073.0,
"step": 657
},
{
"epoch": 0.746031746031746,
"grad_norm": 19.78729820251465,
"learning_rate": 5.263157894736842e-06,
"loss": 1.3997,
"num_tokens": 734355.0,
"step": 658
},
{
"epoch": 0.7471655328798186,
"grad_norm": 24.19782829284668,
"learning_rate": 5.239766081871346e-06,
"loss": 1.4371,
"num_tokens": 735425.0,
"step": 659
},
{
"epoch": 0.7482993197278912,
"grad_norm": 23.320068359375,
"learning_rate": 5.216374269005848e-06,
"loss": 1.3869,
"num_tokens": 736522.0,
"step": 660
},
{
"epoch": 0.7494331065759637,
"grad_norm": 22.0587158203125,
"learning_rate": 5.192982456140351e-06,
"loss": 1.4641,
"num_tokens": 737633.0,
"step": 661
},
{
"epoch": 0.7505668934240363,
"grad_norm": 22.994808197021484,
"learning_rate": 5.169590643274854e-06,
"loss": 1.4685,
"num_tokens": 738753.0,
"step": 662
},
{
"epoch": 0.7517006802721088,
"grad_norm": 22.301300048828125,
"learning_rate": 5.146198830409357e-06,
"loss": 1.4651,
"num_tokens": 739872.0,
"step": 663
},
{
"epoch": 0.7528344671201814,
"grad_norm": 21.97808837890625,
"learning_rate": 5.12280701754386e-06,
"loss": 1.4382,
"num_tokens": 741008.0,
"step": 664
},
{
"epoch": 0.753968253968254,
"grad_norm": 21.360986709594727,
"learning_rate": 5.099415204678363e-06,
"loss": 1.5411,
"num_tokens": 742227.0,
"step": 665
},
{
"epoch": 0.7551020408163265,
"grad_norm": 22.520597457885742,
"learning_rate": 5.076023391812865e-06,
"loss": 1.4047,
"num_tokens": 743294.0,
"step": 666
},
{
"epoch": 0.7562358276643991,
"grad_norm": 22.39497184753418,
"learning_rate": 5.052631578947369e-06,
"loss": 1.3934,
"num_tokens": 744498.0,
"step": 667
},
{
"epoch": 0.7573696145124716,
"grad_norm": 21.658981323242188,
"learning_rate": 5.029239766081871e-06,
"loss": 1.4161,
"num_tokens": 745753.0,
"step": 668
},
{
"epoch": 0.7585034013605442,
"grad_norm": 22.147428512573242,
"learning_rate": 5.005847953216375e-06,
"loss": 1.365,
"num_tokens": 746899.0,
"step": 669
},
{
"epoch": 0.7596371882086168,
"grad_norm": 22.580663681030273,
"learning_rate": 4.982456140350877e-06,
"loss": 1.5026,
"num_tokens": 747982.0,
"step": 670
},
{
"epoch": 0.7607709750566893,
"grad_norm": 22.9129581451416,
"learning_rate": 4.959064327485381e-06,
"loss": 1.3533,
"num_tokens": 749043.0,
"step": 671
},
{
"epoch": 0.7619047619047619,
"grad_norm": 20.52309226989746,
"learning_rate": 4.935672514619883e-06,
"loss": 1.3866,
"num_tokens": 750230.0,
"step": 672
},
{
"epoch": 0.7630385487528345,
"grad_norm": 25.36422348022461,
"learning_rate": 4.912280701754386e-06,
"loss": 1.5019,
"num_tokens": 751311.0,
"step": 673
},
{
"epoch": 0.764172335600907,
"grad_norm": 20.621496200561523,
"learning_rate": 4.888888888888889e-06,
"loss": 1.4865,
"num_tokens": 752533.0,
"step": 674
},
{
"epoch": 0.7653061224489796,
"grad_norm": 25.740957260131836,
"learning_rate": 4.865497076023392e-06,
"loss": 1.363,
"num_tokens": 753621.0,
"step": 675
},
{
"epoch": 0.7664399092970522,
"grad_norm": 22.625244140625,
"learning_rate": 4.842105263157895e-06,
"loss": 1.4387,
"num_tokens": 754720.0,
"step": 676
},
{
"epoch": 0.7675736961451247,
"grad_norm": 23.55186653137207,
"learning_rate": 4.818713450292398e-06,
"loss": 1.4493,
"num_tokens": 755780.0,
"step": 677
},
{
"epoch": 0.7687074829931972,
"grad_norm": 22.29022789001465,
"learning_rate": 4.7953216374269005e-06,
"loss": 1.4572,
"num_tokens": 756845.0,
"step": 678
},
{
"epoch": 0.7698412698412699,
"grad_norm": 23.95270538330078,
"learning_rate": 4.771929824561404e-06,
"loss": 1.4394,
"num_tokens": 758014.0,
"step": 679
},
{
"epoch": 0.7709750566893424,
"grad_norm": 22.06515121459961,
"learning_rate": 4.7485380116959065e-06,
"loss": 1.5034,
"num_tokens": 759116.0,
"step": 680
},
{
"epoch": 0.7721088435374149,
"grad_norm": 21.582685470581055,
"learning_rate": 4.72514619883041e-06,
"loss": 1.4249,
"num_tokens": 760209.0,
"step": 681
},
{
"epoch": 0.7732426303854876,
"grad_norm": 23.25868797302246,
"learning_rate": 4.7017543859649125e-06,
"loss": 1.4951,
"num_tokens": 761296.0,
"step": 682
},
{
"epoch": 0.7743764172335601,
"grad_norm": 22.286924362182617,
"learning_rate": 4.678362573099415e-06,
"loss": 1.473,
"num_tokens": 762380.0,
"step": 683
},
{
"epoch": 0.7755102040816326,
"grad_norm": 22.370563507080078,
"learning_rate": 4.6549707602339184e-06,
"loss": 1.3724,
"num_tokens": 763416.0,
"step": 684
},
{
"epoch": 0.7766439909297053,
"grad_norm": 22.624408721923828,
"learning_rate": 4.631578947368421e-06,
"loss": 1.3322,
"num_tokens": 764519.0,
"step": 685
},
{
"epoch": 0.7777777777777778,
"grad_norm": 22.60655403137207,
"learning_rate": 4.6081871345029244e-06,
"loss": 1.4132,
"num_tokens": 765685.0,
"step": 686
},
{
"epoch": 0.7789115646258503,
"grad_norm": 22.81845474243164,
"learning_rate": 4.584795321637428e-06,
"loss": 1.4029,
"num_tokens": 766859.0,
"step": 687
},
{
"epoch": 0.780045351473923,
"grad_norm": 22.1308536529541,
"learning_rate": 4.56140350877193e-06,
"loss": 1.3881,
"num_tokens": 768070.0,
"step": 688
},
{
"epoch": 0.7811791383219955,
"grad_norm": 23.998682022094727,
"learning_rate": 4.538011695906433e-06,
"loss": 1.5834,
"num_tokens": 769163.0,
"step": 689
},
{
"epoch": 0.782312925170068,
"grad_norm": 23.69707489013672,
"learning_rate": 4.5146198830409364e-06,
"loss": 1.5529,
"num_tokens": 770331.0,
"step": 690
},
{
"epoch": 0.7834467120181405,
"grad_norm": 24.159103393554688,
"learning_rate": 4.491228070175439e-06,
"loss": 1.3963,
"num_tokens": 771348.0,
"step": 691
},
{
"epoch": 0.7845804988662132,
"grad_norm": 22.975770950317383,
"learning_rate": 4.467836257309942e-06,
"loss": 1.3646,
"num_tokens": 772423.0,
"step": 692
},
{
"epoch": 0.7857142857142857,
"grad_norm": 24.724035263061523,
"learning_rate": 4.444444444444444e-06,
"loss": 1.3742,
"num_tokens": 773491.0,
"step": 693
},
{
"epoch": 0.7868480725623582,
"grad_norm": 23.44858741760254,
"learning_rate": 4.4210526315789476e-06,
"loss": 1.5088,
"num_tokens": 774638.0,
"step": 694
},
{
"epoch": 0.7879818594104309,
"grad_norm": 23.948352813720703,
"learning_rate": 4.397660818713451e-06,
"loss": 1.7207,
"num_tokens": 775775.0,
"step": 695
},
{
"epoch": 0.7891156462585034,
"grad_norm": 21.847509384155273,
"learning_rate": 4.3742690058479536e-06,
"loss": 1.5141,
"num_tokens": 776879.0,
"step": 696
},
{
"epoch": 0.7902494331065759,
"grad_norm": 23.483224868774414,
"learning_rate": 4.350877192982457e-06,
"loss": 1.5361,
"num_tokens": 778025.0,
"step": 697
},
{
"epoch": 0.7913832199546486,
"grad_norm": 22.497262954711914,
"learning_rate": 4.3274853801169596e-06,
"loss": 1.5269,
"num_tokens": 779182.0,
"step": 698
},
{
"epoch": 0.7925170068027211,
"grad_norm": 24.064851760864258,
"learning_rate": 4.304093567251462e-06,
"loss": 1.4504,
"num_tokens": 780292.0,
"step": 699
},
{
"epoch": 0.7936507936507936,
"grad_norm": 25.360252380371094,
"learning_rate": 4.2807017543859656e-06,
"loss": 1.358,
"num_tokens": 781293.0,
"step": 700
},
{
"epoch": 0.7947845804988662,
"grad_norm": 22.048860549926758,
"learning_rate": 4.257309941520468e-06,
"loss": 1.4928,
"num_tokens": 782489.0,
"step": 701
},
{
"epoch": 0.7959183673469388,
"grad_norm": 25.04562759399414,
"learning_rate": 4.2339181286549715e-06,
"loss": 1.4883,
"num_tokens": 783575.0,
"step": 702
},
{
"epoch": 0.7970521541950113,
"grad_norm": 23.770830154418945,
"learning_rate": 4.210526315789474e-06,
"loss": 1.512,
"num_tokens": 784656.0,
"step": 703
},
{
"epoch": 0.7981859410430839,
"grad_norm": 22.704742431640625,
"learning_rate": 4.187134502923977e-06,
"loss": 1.3788,
"num_tokens": 785780.0,
"step": 704
},
{
"epoch": 0.7993197278911565,
"grad_norm": 21.932666778564453,
"learning_rate": 4.16374269005848e-06,
"loss": 1.381,
"num_tokens": 786993.0,
"step": 705
},
{
"epoch": 0.800453514739229,
"grad_norm": 22.353124618530273,
"learning_rate": 4.140350877192983e-06,
"loss": 1.3069,
"num_tokens": 788042.0,
"step": 706
},
{
"epoch": 0.8015873015873016,
"grad_norm": 23.022003173828125,
"learning_rate": 4.116959064327486e-06,
"loss": 1.5089,
"num_tokens": 789182.0,
"step": 707
},
{
"epoch": 0.8027210884353742,
"grad_norm": 22.555904388427734,
"learning_rate": 4.093567251461989e-06,
"loss": 1.4996,
"num_tokens": 790326.0,
"step": 708
},
{
"epoch": 0.8038548752834467,
"grad_norm": 22.719186782836914,
"learning_rate": 4.070175438596491e-06,
"loss": 1.5449,
"num_tokens": 791473.0,
"step": 709
},
{
"epoch": 0.8049886621315193,
"grad_norm": 21.924585342407227,
"learning_rate": 4.046783625730995e-06,
"loss": 1.3993,
"num_tokens": 792646.0,
"step": 710
},
{
"epoch": 0.8061224489795918,
"grad_norm": 20.699281692504883,
"learning_rate": 4.023391812865497e-06,
"loss": 1.4973,
"num_tokens": 793797.0,
"step": 711
},
{
"epoch": 0.8072562358276644,
"grad_norm": 22.318761825561523,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4515,
"num_tokens": 794861.0,
"step": 712
},
{
"epoch": 0.808390022675737,
"grad_norm": 21.21175765991211,
"learning_rate": 3.976608187134503e-06,
"loss": 1.3681,
"num_tokens": 796010.0,
"step": 713
},
{
"epoch": 0.8095238095238095,
"grad_norm": 22.589601516723633,
"learning_rate": 3.953216374269006e-06,
"loss": 1.4962,
"num_tokens": 797073.0,
"step": 714
},
{
"epoch": 0.8106575963718821,
"grad_norm": 21.697351455688477,
"learning_rate": 3.929824561403509e-06,
"loss": 1.4605,
"num_tokens": 798194.0,
"step": 715
},
{
"epoch": 0.8117913832199547,
"grad_norm": 21.772241592407227,
"learning_rate": 3.906432748538012e-06,
"loss": 1.4382,
"num_tokens": 799326.0,
"step": 716
},
{
"epoch": 0.8129251700680272,
"grad_norm": 21.598237991333008,
"learning_rate": 3.883040935672515e-06,
"loss": 1.4977,
"num_tokens": 800462.0,
"step": 717
},
{
"epoch": 0.8140589569160998,
"grad_norm": 21.325088500976562,
"learning_rate": 3.859649122807018e-06,
"loss": 1.5207,
"num_tokens": 801660.0,
"step": 718
},
{
"epoch": 0.8151927437641724,
"grad_norm": 24.253833770751953,
"learning_rate": 3.83625730994152e-06,
"loss": 1.4913,
"num_tokens": 802681.0,
"step": 719
},
{
"epoch": 0.8163265306122449,
"grad_norm": 22.219438552856445,
"learning_rate": 3.812865497076024e-06,
"loss": 1.4802,
"num_tokens": 803845.0,
"step": 720
},
{
"epoch": 0.8174603174603174,
"grad_norm": 22.79910659790039,
"learning_rate": 3.789473684210527e-06,
"loss": 1.4913,
"num_tokens": 804919.0,
"step": 721
},
{
"epoch": 0.81859410430839,
"grad_norm": 22.189367294311523,
"learning_rate": 3.7660818713450298e-06,
"loss": 1.4382,
"num_tokens": 806066.0,
"step": 722
},
{
"epoch": 0.8197278911564626,
"grad_norm": 21.064807891845703,
"learning_rate": 3.7426900584795324e-06,
"loss": 1.5483,
"num_tokens": 807298.0,
"step": 723
},
{
"epoch": 0.8208616780045351,
"grad_norm": 22.164886474609375,
"learning_rate": 3.7192982456140354e-06,
"loss": 1.4852,
"num_tokens": 808431.0,
"step": 724
},
{
"epoch": 0.8219954648526077,
"grad_norm": 22.567136764526367,
"learning_rate": 3.6959064327485384e-06,
"loss": 1.5195,
"num_tokens": 809525.0,
"step": 725
},
{
"epoch": 0.8231292517006803,
"grad_norm": 22.86061668395996,
"learning_rate": 3.6725146198830414e-06,
"loss": 1.4455,
"num_tokens": 810602.0,
"step": 726
},
{
"epoch": 0.8242630385487528,
"grad_norm": 24.17923927307129,
"learning_rate": 3.6491228070175443e-06,
"loss": 1.4668,
"num_tokens": 811652.0,
"step": 727
},
{
"epoch": 0.8253968253968254,
"grad_norm": 23.038942337036133,
"learning_rate": 3.625730994152047e-06,
"loss": 1.5045,
"num_tokens": 812716.0,
"step": 728
},
{
"epoch": 0.826530612244898,
"grad_norm": 21.556453704833984,
"learning_rate": 3.60233918128655e-06,
"loss": 1.3505,
"num_tokens": 813814.0,
"step": 729
},
{
"epoch": 0.8276643990929705,
"grad_norm": 21.08284568786621,
"learning_rate": 3.578947368421053e-06,
"loss": 1.3864,
"num_tokens": 814929.0,
"step": 730
},
{
"epoch": 0.828798185941043,
"grad_norm": 23.882568359375,
"learning_rate": 3.555555555555556e-06,
"loss": 1.3743,
"num_tokens": 815979.0,
"step": 731
},
{
"epoch": 0.8299319727891157,
"grad_norm": 24.24713897705078,
"learning_rate": 3.532163742690059e-06,
"loss": 1.509,
"num_tokens": 817290.0,
"step": 732
},
{
"epoch": 0.8310657596371882,
"grad_norm": 22.145536422729492,
"learning_rate": 3.5087719298245615e-06,
"loss": 1.501,
"num_tokens": 818392.0,
"step": 733
},
{
"epoch": 0.8321995464852607,
"grad_norm": 21.552515029907227,
"learning_rate": 3.4853801169590645e-06,
"loss": 1.3933,
"num_tokens": 819609.0,
"step": 734
},
{
"epoch": 0.8333333333333334,
"grad_norm": 25.56783676147461,
"learning_rate": 3.4619883040935675e-06,
"loss": 1.583,
"num_tokens": 820656.0,
"step": 735
},
{
"epoch": 0.8344671201814059,
"grad_norm": 22.9251766204834,
"learning_rate": 3.4385964912280705e-06,
"loss": 1.3717,
"num_tokens": 821762.0,
"step": 736
},
{
"epoch": 0.8356009070294784,
"grad_norm": 24.673242568969727,
"learning_rate": 3.4152046783625735e-06,
"loss": 1.5318,
"num_tokens": 822936.0,
"step": 737
},
{
"epoch": 0.8367346938775511,
"grad_norm": 22.047882080078125,
"learning_rate": 3.391812865497076e-06,
"loss": 1.4858,
"num_tokens": 824105.0,
"step": 738
},
{
"epoch": 0.8378684807256236,
"grad_norm": 26.981224060058594,
"learning_rate": 3.368421052631579e-06,
"loss": 1.6137,
"num_tokens": 825116.0,
"step": 739
},
{
"epoch": 0.8390022675736961,
"grad_norm": 24.74795913696289,
"learning_rate": 3.345029239766082e-06,
"loss": 1.4405,
"num_tokens": 826109.0,
"step": 740
},
{
"epoch": 0.8401360544217688,
"grad_norm": 23.003847122192383,
"learning_rate": 3.321637426900585e-06,
"loss": 1.4222,
"num_tokens": 827151.0,
"step": 741
},
{
"epoch": 0.8412698412698413,
"grad_norm": 25.1625919342041,
"learning_rate": 3.2982456140350885e-06,
"loss": 1.538,
"num_tokens": 828251.0,
"step": 742
},
{
"epoch": 0.8424036281179138,
"grad_norm": 23.558082580566406,
"learning_rate": 3.2748538011695906e-06,
"loss": 1.2967,
"num_tokens": 829406.0,
"step": 743
},
{
"epoch": 0.8435374149659864,
"grad_norm": 22.346006393432617,
"learning_rate": 3.2514619883040936e-06,
"loss": 1.3927,
"num_tokens": 830517.0,
"step": 744
},
{
"epoch": 0.844671201814059,
"grad_norm": 21.72648048400879,
"learning_rate": 3.2280701754385966e-06,
"loss": 1.5264,
"num_tokens": 831751.0,
"step": 745
},
{
"epoch": 0.8458049886621315,
"grad_norm": 23.65290641784668,
"learning_rate": 3.2046783625731e-06,
"loss": 1.5314,
"num_tokens": 832836.0,
"step": 746
},
{
"epoch": 0.8469387755102041,
"grad_norm": 22.38150978088379,
"learning_rate": 3.181286549707603e-06,
"loss": 1.6097,
"num_tokens": 834005.0,
"step": 747
},
{
"epoch": 0.8480725623582767,
"grad_norm": 23.84773826599121,
"learning_rate": 3.157894736842105e-06,
"loss": 1.422,
"num_tokens": 835035.0,
"step": 748
},
{
"epoch": 0.8492063492063492,
"grad_norm": 23.476564407348633,
"learning_rate": 3.1345029239766086e-06,
"loss": 1.4523,
"num_tokens": 836174.0,
"step": 749
},
{
"epoch": 0.8503401360544217,
"grad_norm": 24.975629806518555,
"learning_rate": 3.1111111111111116e-06,
"loss": 1.4704,
"num_tokens": 837345.0,
"step": 750
},
{
"epoch": 0.8514739229024944,
"grad_norm": 25.92818260192871,
"learning_rate": 3.0877192982456146e-06,
"loss": 1.354,
"num_tokens": 838470.0,
"step": 751
},
{
"epoch": 0.8526077097505669,
"grad_norm": 21.76276969909668,
"learning_rate": 3.0643274853801176e-06,
"loss": 1.2758,
"num_tokens": 839542.0,
"step": 752
},
{
"epoch": 0.8537414965986394,
"grad_norm": 23.923139572143555,
"learning_rate": 3.04093567251462e-06,
"loss": 1.5039,
"num_tokens": 840615.0,
"step": 753
},
{
"epoch": 0.854875283446712,
"grad_norm": 20.571720123291016,
"learning_rate": 3.017543859649123e-06,
"loss": 1.5064,
"num_tokens": 841800.0,
"step": 754
},
{
"epoch": 0.8560090702947846,
"grad_norm": 22.663013458251953,
"learning_rate": 2.994152046783626e-06,
"loss": 1.5114,
"num_tokens": 842943.0,
"step": 755
},
{
"epoch": 0.8571428571428571,
"grad_norm": 22.52815055847168,
"learning_rate": 2.970760233918129e-06,
"loss": 1.5805,
"num_tokens": 844094.0,
"step": 756
},
{
"epoch": 0.8582766439909297,
"grad_norm": 22.185199737548828,
"learning_rate": 2.9473684210526317e-06,
"loss": 1.4734,
"num_tokens": 845227.0,
"step": 757
},
{
"epoch": 0.8594104308390023,
"grad_norm": 21.498750686645508,
"learning_rate": 2.9239766081871347e-06,
"loss": 1.3279,
"num_tokens": 846379.0,
"step": 758
},
{
"epoch": 0.8605442176870748,
"grad_norm": 22.456575393676758,
"learning_rate": 2.9005847953216377e-06,
"loss": 1.3714,
"num_tokens": 847478.0,
"step": 759
},
{
"epoch": 0.8616780045351474,
"grad_norm": 25.112070083618164,
"learning_rate": 2.8771929824561407e-06,
"loss": 1.5006,
"num_tokens": 848561.0,
"step": 760
},
{
"epoch": 0.86281179138322,
"grad_norm": 21.80204963684082,
"learning_rate": 2.8538011695906437e-06,
"loss": 1.4335,
"num_tokens": 849697.0,
"step": 761
},
{
"epoch": 0.8639455782312925,
"grad_norm": 23.306922912597656,
"learning_rate": 2.8304093567251463e-06,
"loss": 1.4661,
"num_tokens": 850827.0,
"step": 762
},
{
"epoch": 0.8650793650793651,
"grad_norm": 22.614166259765625,
"learning_rate": 2.8070175438596493e-06,
"loss": 1.4819,
"num_tokens": 851967.0,
"step": 763
},
{
"epoch": 0.8662131519274376,
"grad_norm": 22.74590301513672,
"learning_rate": 2.7836257309941523e-06,
"loss": 1.4953,
"num_tokens": 853111.0,
"step": 764
},
{
"epoch": 0.8673469387755102,
"grad_norm": 22.92290687561035,
"learning_rate": 2.7602339181286553e-06,
"loss": 1.4969,
"num_tokens": 854225.0,
"step": 765
},
{
"epoch": 0.8684807256235828,
"grad_norm": 21.96657371520996,
"learning_rate": 2.7368421052631583e-06,
"loss": 1.5487,
"num_tokens": 855400.0,
"step": 766
},
{
"epoch": 0.8696145124716553,
"grad_norm": 22.646835327148438,
"learning_rate": 2.713450292397661e-06,
"loss": 1.391,
"num_tokens": 856571.0,
"step": 767
},
{
"epoch": 0.8707482993197279,
"grad_norm": 22.9639892578125,
"learning_rate": 2.690058479532164e-06,
"loss": 1.4171,
"num_tokens": 857782.0,
"step": 768
},
{
"epoch": 0.8718820861678005,
"grad_norm": 23.083837509155273,
"learning_rate": 2.666666666666667e-06,
"loss": 1.5668,
"num_tokens": 858959.0,
"step": 769
},
{
"epoch": 0.873015873015873,
"grad_norm": 22.602506637573242,
"learning_rate": 2.64327485380117e-06,
"loss": 1.4862,
"num_tokens": 860124.0,
"step": 770
},
{
"epoch": 0.8741496598639455,
"grad_norm": 23.74089241027832,
"learning_rate": 2.619883040935673e-06,
"loss": 1.4123,
"num_tokens": 861312.0,
"step": 771
},
{
"epoch": 0.8752834467120182,
"grad_norm": 22.073772430419922,
"learning_rate": 2.5964912280701754e-06,
"loss": 1.4083,
"num_tokens": 862379.0,
"step": 772
},
{
"epoch": 0.8764172335600907,
"grad_norm": 23.299543380737305,
"learning_rate": 2.5730994152046784e-06,
"loss": 1.3877,
"num_tokens": 863526.0,
"step": 773
},
{
"epoch": 0.8775510204081632,
"grad_norm": 23.912397384643555,
"learning_rate": 2.5497076023391814e-06,
"loss": 1.6234,
"num_tokens": 864663.0,
"step": 774
},
{
"epoch": 0.8786848072562359,
"grad_norm": 24.13743019104004,
"learning_rate": 2.5263157894736844e-06,
"loss": 1.6174,
"num_tokens": 865755.0,
"step": 775
},
{
"epoch": 0.8798185941043084,
"grad_norm": 23.58803939819336,
"learning_rate": 2.5029239766081874e-06,
"loss": 1.5354,
"num_tokens": 866867.0,
"step": 776
},
{
"epoch": 0.8809523809523809,
"grad_norm": 22.72237205505371,
"learning_rate": 2.4795321637426904e-06,
"loss": 1.4695,
"num_tokens": 867947.0,
"step": 777
},
{
"epoch": 0.8820861678004536,
"grad_norm": 23.858057022094727,
"learning_rate": 2.456140350877193e-06,
"loss": 1.4492,
"num_tokens": 869047.0,
"step": 778
},
{
"epoch": 0.8832199546485261,
"grad_norm": 21.552154541015625,
"learning_rate": 2.432748538011696e-06,
"loss": 1.4438,
"num_tokens": 870112.0,
"step": 779
},
{
"epoch": 0.8843537414965986,
"grad_norm": 24.152172088623047,
"learning_rate": 2.409356725146199e-06,
"loss": 1.4744,
"num_tokens": 871206.0,
"step": 780
},
{
"epoch": 0.8854875283446711,
"grad_norm": 22.032583236694336,
"learning_rate": 2.385964912280702e-06,
"loss": 1.3994,
"num_tokens": 872313.0,
"step": 781
},
{
"epoch": 0.8866213151927438,
"grad_norm": 22.46315574645996,
"learning_rate": 2.362573099415205e-06,
"loss": 1.4018,
"num_tokens": 873407.0,
"step": 782
},
{
"epoch": 0.8877551020408163,
"grad_norm": 22.419082641601562,
"learning_rate": 2.3391812865497075e-06,
"loss": 1.3361,
"num_tokens": 874507.0,
"step": 783
},
{
"epoch": 0.8888888888888888,
"grad_norm": 22.973846435546875,
"learning_rate": 2.3157894736842105e-06,
"loss": 1.5251,
"num_tokens": 875594.0,
"step": 784
},
{
"epoch": 0.8900226757369615,
"grad_norm": 21.73171043395996,
"learning_rate": 2.292397660818714e-06,
"loss": 1.4877,
"num_tokens": 876759.0,
"step": 785
},
{
"epoch": 0.891156462585034,
"grad_norm": 22.867074966430664,
"learning_rate": 2.2690058479532165e-06,
"loss": 1.4999,
"num_tokens": 877790.0,
"step": 786
},
{
"epoch": 0.8922902494331065,
"grad_norm": 23.208694458007812,
"learning_rate": 2.2456140350877195e-06,
"loss": 1.4502,
"num_tokens": 878974.0,
"step": 787
},
{
"epoch": 0.8934240362811792,
"grad_norm": 23.5717830657959,
"learning_rate": 2.222222222222222e-06,
"loss": 1.3898,
"num_tokens": 880052.0,
"step": 788
},
{
"epoch": 0.8945578231292517,
"grad_norm": 23.512678146362305,
"learning_rate": 2.1988304093567255e-06,
"loss": 1.4121,
"num_tokens": 881239.0,
"step": 789
},
{
"epoch": 0.8956916099773242,
"grad_norm": 23.14679527282715,
"learning_rate": 2.1754385964912285e-06,
"loss": 1.4851,
"num_tokens": 882281.0,
"step": 790
},
{
"epoch": 0.8968253968253969,
"grad_norm": 23.918848037719727,
"learning_rate": 2.152046783625731e-06,
"loss": 1.4963,
"num_tokens": 883346.0,
"step": 791
},
{
"epoch": 0.8979591836734694,
"grad_norm": 26.56317710876465,
"learning_rate": 2.128654970760234e-06,
"loss": 1.5296,
"num_tokens": 884352.0,
"step": 792
},
{
"epoch": 0.8990929705215419,
"grad_norm": 22.81806182861328,
"learning_rate": 2.105263157894737e-06,
"loss": 1.5657,
"num_tokens": 885502.0,
"step": 793
},
{
"epoch": 0.9002267573696145,
"grad_norm": 23.83610725402832,
"learning_rate": 2.08187134502924e-06,
"loss": 1.5682,
"num_tokens": 886626.0,
"step": 794
},
{
"epoch": 0.9013605442176871,
"grad_norm": 23.975093841552734,
"learning_rate": 2.058479532163743e-06,
"loss": 1.527,
"num_tokens": 887757.0,
"step": 795
},
{
"epoch": 0.9024943310657596,
"grad_norm": 21.868783950805664,
"learning_rate": 2.0350877192982456e-06,
"loss": 1.4765,
"num_tokens": 888939.0,
"step": 796
},
{
"epoch": 0.9036281179138322,
"grad_norm": 22.76094627380371,
"learning_rate": 2.0116959064327486e-06,
"loss": 1.3386,
"num_tokens": 890157.0,
"step": 797
},
{
"epoch": 0.9047619047619048,
"grad_norm": 23.086637496948242,
"learning_rate": 1.9883040935672516e-06,
"loss": 1.6715,
"num_tokens": 891398.0,
"step": 798
},
{
"epoch": 0.9058956916099773,
"grad_norm": 24.359127044677734,
"learning_rate": 1.9649122807017546e-06,
"loss": 1.3054,
"num_tokens": 892408.0,
"step": 799
},
{
"epoch": 0.9070294784580499,
"grad_norm": 25.446590423583984,
"learning_rate": 1.9415204678362576e-06,
"loss": 1.4722,
"num_tokens": 893592.0,
"step": 800
},
{
"epoch": 0.9081632653061225,
"grad_norm": 23.30593490600586,
"learning_rate": 1.91812865497076e-06,
"loss": 1.4158,
"num_tokens": 894697.0,
"step": 801
},
{
"epoch": 0.909297052154195,
"grad_norm": 21.06711769104004,
"learning_rate": 1.8947368421052634e-06,
"loss": 1.592,
"num_tokens": 895818.0,
"step": 802
},
{
"epoch": 0.9104308390022676,
"grad_norm": 23.522247314453125,
"learning_rate": 1.8713450292397662e-06,
"loss": 1.5568,
"num_tokens": 896885.0,
"step": 803
},
{
"epoch": 0.9115646258503401,
"grad_norm": 22.43680763244629,
"learning_rate": 1.8479532163742692e-06,
"loss": 1.3984,
"num_tokens": 898032.0,
"step": 804
},
{
"epoch": 0.9126984126984127,
"grad_norm": 22.576719284057617,
"learning_rate": 1.8245614035087722e-06,
"loss": 1.4604,
"num_tokens": 899163.0,
"step": 805
},
{
"epoch": 0.9138321995464853,
"grad_norm": 22.583637237548828,
"learning_rate": 1.801169590643275e-06,
"loss": 1.4348,
"num_tokens": 900304.0,
"step": 806
},
{
"epoch": 0.9149659863945578,
"grad_norm": 21.52495574951172,
"learning_rate": 1.777777777777778e-06,
"loss": 1.364,
"num_tokens": 901464.0,
"step": 807
},
{
"epoch": 0.9160997732426304,
"grad_norm": 22.536762237548828,
"learning_rate": 1.7543859649122807e-06,
"loss": 1.5089,
"num_tokens": 902652.0,
"step": 808
},
{
"epoch": 0.9172335600907029,
"grad_norm": 22.26384162902832,
"learning_rate": 1.7309941520467837e-06,
"loss": 1.4325,
"num_tokens": 903725.0,
"step": 809
},
{
"epoch": 0.9183673469387755,
"grad_norm": 20.824234008789062,
"learning_rate": 1.7076023391812867e-06,
"loss": 1.396,
"num_tokens": 904819.0,
"step": 810
},
{
"epoch": 0.9195011337868481,
"grad_norm": 21.5470027923584,
"learning_rate": 1.6842105263157895e-06,
"loss": 1.4437,
"num_tokens": 905994.0,
"step": 811
},
{
"epoch": 0.9206349206349206,
"grad_norm": 22.454166412353516,
"learning_rate": 1.6608187134502925e-06,
"loss": 1.4207,
"num_tokens": 907165.0,
"step": 812
},
{
"epoch": 0.9217687074829932,
"grad_norm": 22.63362693786621,
"learning_rate": 1.6374269005847953e-06,
"loss": 1.5246,
"num_tokens": 908315.0,
"step": 813
},
{
"epoch": 0.9229024943310657,
"grad_norm": 25.17198371887207,
"learning_rate": 1.6140350877192983e-06,
"loss": 1.527,
"num_tokens": 909504.0,
"step": 814
},
{
"epoch": 0.9240362811791383,
"grad_norm": 25.855079650878906,
"learning_rate": 1.5906432748538015e-06,
"loss": 1.6569,
"num_tokens": 910579.0,
"step": 815
},
{
"epoch": 0.9251700680272109,
"grad_norm": 22.92259979248047,
"learning_rate": 1.5672514619883043e-06,
"loss": 1.4674,
"num_tokens": 911639.0,
"step": 816
},
{
"epoch": 0.9263038548752834,
"grad_norm": 24.303924560546875,
"learning_rate": 1.5438596491228073e-06,
"loss": 1.5105,
"num_tokens": 912701.0,
"step": 817
},
{
"epoch": 0.927437641723356,
"grad_norm": 21.863985061645508,
"learning_rate": 1.52046783625731e-06,
"loss": 1.6027,
"num_tokens": 913766.0,
"step": 818
},
{
"epoch": 0.9285714285714286,
"grad_norm": 24.20486068725586,
"learning_rate": 1.497076023391813e-06,
"loss": 1.5517,
"num_tokens": 914798.0,
"step": 819
},
{
"epoch": 0.9297052154195011,
"grad_norm": 24.408193588256836,
"learning_rate": 1.4736842105263159e-06,
"loss": 1.4336,
"num_tokens": 915953.0,
"step": 820
},
{
"epoch": 0.9308390022675737,
"grad_norm": 22.91715431213379,
"learning_rate": 1.4502923976608189e-06,
"loss": 1.3284,
"num_tokens": 917153.0,
"step": 821
},
{
"epoch": 0.9319727891156463,
"grad_norm": 23.430063247680664,
"learning_rate": 1.4269005847953219e-06,
"loss": 1.5508,
"num_tokens": 918239.0,
"step": 822
},
{
"epoch": 0.9331065759637188,
"grad_norm": 24.82691764831543,
"learning_rate": 1.4035087719298246e-06,
"loss": 1.4022,
"num_tokens": 919263.0,
"step": 823
},
{
"epoch": 0.9342403628117913,
"grad_norm": 21.78477668762207,
"learning_rate": 1.3801169590643276e-06,
"loss": 1.4777,
"num_tokens": 920398.0,
"step": 824
},
{
"epoch": 0.935374149659864,
"grad_norm": 23.606664657592773,
"learning_rate": 1.3567251461988304e-06,
"loss": 1.479,
"num_tokens": 921477.0,
"step": 825
},
{
"epoch": 0.9365079365079365,
"grad_norm": 22.557498931884766,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.5736,
"num_tokens": 922580.0,
"step": 826
},
{
"epoch": 0.937641723356009,
"grad_norm": 22.220712661743164,
"learning_rate": 1.3099415204678364e-06,
"loss": 1.4134,
"num_tokens": 923731.0,
"step": 827
},
{
"epoch": 0.9387755102040817,
"grad_norm": 20.02456283569336,
"learning_rate": 1.2865497076023392e-06,
"loss": 1.4506,
"num_tokens": 924867.0,
"step": 828
},
{
"epoch": 0.9399092970521542,
"grad_norm": 23.332258224487305,
"learning_rate": 1.2631578947368422e-06,
"loss": 1.3667,
"num_tokens": 925946.0,
"step": 829
},
{
"epoch": 0.9410430839002267,
"grad_norm": 23.033876419067383,
"learning_rate": 1.2397660818713452e-06,
"loss": 1.4524,
"num_tokens": 927211.0,
"step": 830
},
{
"epoch": 0.9421768707482994,
"grad_norm": 21.960102081298828,
"learning_rate": 1.216374269005848e-06,
"loss": 1.4102,
"num_tokens": 928388.0,
"step": 831
},
{
"epoch": 0.9433106575963719,
"grad_norm": 22.837690353393555,
"learning_rate": 1.192982456140351e-06,
"loss": 1.4648,
"num_tokens": 929505.0,
"step": 832
},
{
"epoch": 0.9444444444444444,
"grad_norm": 26.00808334350586,
"learning_rate": 1.1695906432748538e-06,
"loss": 1.3195,
"num_tokens": 930491.0,
"step": 833
},
{
"epoch": 0.9455782312925171,
"grad_norm": 22.8687744140625,
"learning_rate": 1.146198830409357e-06,
"loss": 1.4152,
"num_tokens": 931679.0,
"step": 834
},
{
"epoch": 0.9467120181405896,
"grad_norm": 23.170955657958984,
"learning_rate": 1.1228070175438598e-06,
"loss": 1.3588,
"num_tokens": 932826.0,
"step": 835
},
{
"epoch": 0.9478458049886621,
"grad_norm": 21.579723358154297,
"learning_rate": 1.0994152046783627e-06,
"loss": 1.4296,
"num_tokens": 933959.0,
"step": 836
},
{
"epoch": 0.9489795918367347,
"grad_norm": 22.338029861450195,
"learning_rate": 1.0760233918128655e-06,
"loss": 1.5036,
"num_tokens": 935084.0,
"step": 837
},
{
"epoch": 0.9501133786848073,
"grad_norm": 24.644001007080078,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.5511,
"num_tokens": 936207.0,
"step": 838
},
{
"epoch": 0.9512471655328798,
"grad_norm": 20.820331573486328,
"learning_rate": 1.0292397660818715e-06,
"loss": 1.4014,
"num_tokens": 937398.0,
"step": 839
},
{
"epoch": 0.9523809523809523,
"grad_norm": 22.744985580444336,
"learning_rate": 1.0058479532163743e-06,
"loss": 1.5348,
"num_tokens": 938566.0,
"step": 840
},
{
"epoch": 0.953514739229025,
"grad_norm": 21.92036247253418,
"learning_rate": 9.824561403508773e-07,
"loss": 1.5325,
"num_tokens": 939691.0,
"step": 841
},
{
"epoch": 0.9546485260770975,
"grad_norm": 21.75764274597168,
"learning_rate": 9.5906432748538e-07,
"loss": 1.468,
"num_tokens": 940827.0,
"step": 842
},
{
"epoch": 0.95578231292517,
"grad_norm": 22.83157730102539,
"learning_rate": 9.356725146198831e-07,
"loss": 1.4777,
"num_tokens": 941874.0,
"step": 843
},
{
"epoch": 0.9569160997732427,
"grad_norm": 24.487289428710938,
"learning_rate": 9.122807017543861e-07,
"loss": 1.2818,
"num_tokens": 942976.0,
"step": 844
},
{
"epoch": 0.9580498866213152,
"grad_norm": 22.184720993041992,
"learning_rate": 8.88888888888889e-07,
"loss": 1.4944,
"num_tokens": 944151.0,
"step": 845
},
{
"epoch": 0.9591836734693877,
"grad_norm": 24.632143020629883,
"learning_rate": 8.654970760233919e-07,
"loss": 1.4877,
"num_tokens": 945188.0,
"step": 846
},
{
"epoch": 0.9603174603174603,
"grad_norm": 23.738849639892578,
"learning_rate": 8.421052631578948e-07,
"loss": 1.5023,
"num_tokens": 946295.0,
"step": 847
},
{
"epoch": 0.9614512471655329,
"grad_norm": 23.87755012512207,
"learning_rate": 8.187134502923977e-07,
"loss": 1.5317,
"num_tokens": 947340.0,
"step": 848
},
{
"epoch": 0.9625850340136054,
"grad_norm": 23.480918884277344,
"learning_rate": 7.953216374269008e-07,
"loss": 1.4858,
"num_tokens": 948481.0,
"step": 849
},
{
"epoch": 0.963718820861678,
"grad_norm": 24.38697052001953,
"learning_rate": 7.719298245614036e-07,
"loss": 1.3701,
"num_tokens": 949547.0,
"step": 850
},
{
"epoch": 0.9648526077097506,
"grad_norm": 23.775096893310547,
"learning_rate": 7.485380116959065e-07,
"loss": 1.4726,
"num_tokens": 950588.0,
"step": 851
},
{
"epoch": 0.9659863945578231,
"grad_norm": 22.794851303100586,
"learning_rate": 7.251461988304094e-07,
"loss": 1.5002,
"num_tokens": 951774.0,
"step": 852
},
{
"epoch": 0.9671201814058957,
"grad_norm": 21.191730499267578,
"learning_rate": 7.017543859649123e-07,
"loss": 1.4518,
"num_tokens": 952871.0,
"step": 853
},
{
"epoch": 0.9682539682539683,
"grad_norm": 21.24921417236328,
"learning_rate": 6.783625730994152e-07,
"loss": 1.3679,
"num_tokens": 954031.0,
"step": 854
},
{
"epoch": 0.9693877551020408,
"grad_norm": 23.635141372680664,
"learning_rate": 6.549707602339182e-07,
"loss": 1.3911,
"num_tokens": 955054.0,
"step": 855
},
{
"epoch": 0.9705215419501134,
"grad_norm": 23.66080665588379,
"learning_rate": 6.315789473684211e-07,
"loss": 1.3363,
"num_tokens": 956174.0,
"step": 856
},
{
"epoch": 0.971655328798186,
"grad_norm": 23.329038619995117,
"learning_rate": 6.08187134502924e-07,
"loss": 1.514,
"num_tokens": 957337.0,
"step": 857
},
{
"epoch": 0.9727891156462585,
"grad_norm": 22.854223251342773,
"learning_rate": 5.847953216374269e-07,
"loss": 1.4398,
"num_tokens": 958396.0,
"step": 858
},
{
"epoch": 0.9739229024943311,
"grad_norm": 22.29954719543457,
"learning_rate": 5.614035087719299e-07,
"loss": 1.5289,
"num_tokens": 959510.0,
"step": 859
},
{
"epoch": 0.9750566893424036,
"grad_norm": 23.53312873840332,
"learning_rate": 5.380116959064328e-07,
"loss": 1.4513,
"num_tokens": 960614.0,
"step": 860
},
{
"epoch": 0.9761904761904762,
"grad_norm": 21.8758602142334,
"learning_rate": 5.146198830409358e-07,
"loss": 1.422,
"num_tokens": 961707.0,
"step": 861
},
{
"epoch": 0.9773242630385488,
"grad_norm": 23.302278518676758,
"learning_rate": 4.912280701754387e-07,
"loss": 1.4902,
"num_tokens": 962890.0,
"step": 862
},
{
"epoch": 0.9784580498866213,
"grad_norm": 23.379724502563477,
"learning_rate": 4.6783625730994155e-07,
"loss": 1.4964,
"num_tokens": 964107.0,
"step": 863
},
{
"epoch": 0.9795918367346939,
"grad_norm": 23.007959365844727,
"learning_rate": 4.444444444444445e-07,
"loss": 1.4419,
"num_tokens": 965201.0,
"step": 864
},
{
"epoch": 0.9807256235827665,
"grad_norm": 24.184219360351562,
"learning_rate": 4.210526315789474e-07,
"loss": 1.3729,
"num_tokens": 966250.0,
"step": 865
},
{
"epoch": 0.981859410430839,
"grad_norm": 22.903079986572266,
"learning_rate": 3.976608187134504e-07,
"loss": 1.4376,
"num_tokens": 967412.0,
"step": 866
},
{
"epoch": 0.9829931972789115,
"grad_norm": 21.505783081054688,
"learning_rate": 3.7426900584795327e-07,
"loss": 1.4338,
"num_tokens": 968670.0,
"step": 867
},
{
"epoch": 0.9841269841269841,
"grad_norm": 24.10959243774414,
"learning_rate": 3.5087719298245616e-07,
"loss": 1.3635,
"num_tokens": 969688.0,
"step": 868
},
{
"epoch": 0.9852607709750567,
"grad_norm": 23.527223587036133,
"learning_rate": 3.274853801169591e-07,
"loss": 1.5074,
"num_tokens": 970730.0,
"step": 869
},
{
"epoch": 0.9863945578231292,
"grad_norm": 22.13941764831543,
"learning_rate": 3.04093567251462e-07,
"loss": 1.4279,
"num_tokens": 971851.0,
"step": 870
},
{
"epoch": 0.9875283446712018,
"grad_norm": 23.4210262298584,
"learning_rate": 2.8070175438596494e-07,
"loss": 1.4491,
"num_tokens": 972955.0,
"step": 871
},
{
"epoch": 0.9886621315192744,
"grad_norm": 22.403488159179688,
"learning_rate": 2.573099415204679e-07,
"loss": 1.4925,
"num_tokens": 974119.0,
"step": 872
},
{
"epoch": 0.9897959183673469,
"grad_norm": 22.045902252197266,
"learning_rate": 2.3391812865497077e-07,
"loss": 1.5381,
"num_tokens": 975268.0,
"step": 873
},
{
"epoch": 0.9909297052154195,
"grad_norm": 22.418025970458984,
"learning_rate": 2.105263157894737e-07,
"loss": 1.3585,
"num_tokens": 976366.0,
"step": 874
},
{
"epoch": 0.9920634920634921,
"grad_norm": 24.32988739013672,
"learning_rate": 1.8713450292397663e-07,
"loss": 1.3053,
"num_tokens": 977432.0,
"step": 875
},
{
"epoch": 0.9931972789115646,
"grad_norm": 22.723827362060547,
"learning_rate": 1.6374269005847955e-07,
"loss": 1.461,
"num_tokens": 978586.0,
"step": 876
},
{
"epoch": 0.9943310657596371,
"grad_norm": 22.09205436706543,
"learning_rate": 1.4035087719298247e-07,
"loss": 1.396,
"num_tokens": 979670.0,
"step": 877
},
{
"epoch": 0.9954648526077098,
"grad_norm": 24.673280715942383,
"learning_rate": 1.1695906432748539e-07,
"loss": 1.3698,
"num_tokens": 980667.0,
"step": 878
},
{
"epoch": 0.9965986394557823,
"grad_norm": 22.821020126342773,
"learning_rate": 9.356725146198832e-08,
"loss": 1.4921,
"num_tokens": 981825.0,
"step": 879
},
{
"epoch": 0.9977324263038548,
"grad_norm": 23.99849510192871,
"learning_rate": 7.017543859649123e-08,
"loss": 1.5141,
"num_tokens": 982883.0,
"step": 880
},
{
"epoch": 0.9988662131519275,
"grad_norm": 22.911663055419922,
"learning_rate": 4.678362573099416e-08,
"loss": 1.4862,
"num_tokens": 983976.0,
"step": 881
},
{
"epoch": 1.0,
"grad_norm": 23.910690307617188,
"learning_rate": 2.339181286549708e-08,
"loss": 1.367,
"num_tokens": 984517.0,
"step": 882
},
{
"epoch": 1.0,
"step": 882,
"total_flos": 5752804923670528.0,
"train_loss": 1.5323709486022827,
"train_runtime": 132.5784,
"train_samples_per_second": 53.169,
"train_steps_per_second": 6.653
}
],
"logging_steps": 1,
"max_steps": 882,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5752804923670528.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}