{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 882,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0011337868480725624,
      "grad_norm": 74.33029174804688,
      "learning_rate": 0.0,
      "loss": 2.7802,
      "num_tokens": 1145.0,
      "step": 1
    },
    {
      "epoch": 0.0022675736961451248,
      "grad_norm": 90.74027252197266,
      "learning_rate": 7.407407407407407e-07,
      "loss": 3.001,
      "num_tokens": 2223.0,
      "step": 2
    },
    {
      "epoch": 0.003401360544217687,
      "grad_norm": 84.00408172607422,
      "learning_rate": 1.4814814814814815e-06,
      "loss": 3.0741,
      "num_tokens": 3315.0,
      "step": 3
    },
    {
      "epoch": 0.0045351473922902496,
      "grad_norm": 74.34835052490234,
      "learning_rate": 2.222222222222222e-06,
      "loss": 2.9966,
      "num_tokens": 4429.0,
      "step": 4
    },
    {
      "epoch": 0.005668934240362812,
      "grad_norm": 74.42596435546875,
      "learning_rate": 2.962962962962963e-06,
      "loss": 2.8384,
      "num_tokens": 5594.0,
      "step": 5
    },
    {
      "epoch": 0.006802721088435374,
      "grad_norm": 86.56283569335938,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 2.9764,
      "num_tokens": 6751.0,
      "step": 6
    },
    {
      "epoch": 0.007936507936507936,
      "grad_norm": 83.2591323852539,
      "learning_rate": 4.444444444444444e-06,
      "loss": 3.067,
      "num_tokens": 7768.0,
      "step": 7
    },
    {
      "epoch": 0.009070294784580499,
      "grad_norm": 79.88240814208984,
      "learning_rate": 5.185185185185185e-06,
      "loss": 2.7835,
      "num_tokens": 8885.0,
      "step": 8
    },
    {
      "epoch": 0.01020408163265306,
      "grad_norm": 62.56035232543945,
      "learning_rate": 5.925925925925926e-06,
      "loss": 2.8879,
      "num_tokens": 10034.0,
      "step": 9
    },
    {
      "epoch": 0.011337868480725623,
      "grad_norm": 65.7270278930664,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.7762,
      "num_tokens": 11110.0,
      "step": 10
    },
    {
      "epoch": 0.012471655328798186,
      "grad_norm": 66.46719360351562,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 2.9042,
      "num_tokens": 12163.0,
      "step": 11
    },
    {
      "epoch": 0.013605442176870748,
      "grad_norm": 54.481658935546875,
      "learning_rate": 8.148148148148148e-06,
      "loss": 2.8072,
      "num_tokens": 13214.0,
      "step": 12
    },
    {
      "epoch": 0.01473922902494331,
      "grad_norm": 47.543304443359375,
      "learning_rate": 8.888888888888888e-06,
      "loss": 2.5354,
      "num_tokens": 14332.0,
      "step": 13
    },
    {
      "epoch": 0.015873015873015872,
      "grad_norm": 43.516944885253906,
      "learning_rate": 9.62962962962963e-06,
      "loss": 2.6121,
      "num_tokens": 15486.0,
      "step": 14
    },
    {
      "epoch": 0.017006802721088437,
      "grad_norm": 51.404109954833984,
      "learning_rate": 1.037037037037037e-05,
      "loss": 2.4756,
      "num_tokens": 16505.0,
      "step": 15
    },
    {
      "epoch": 0.018140589569160998,
      "grad_norm": 49.24734878540039,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 2.537,
      "num_tokens": 17529.0,
      "step": 16
    },
    {
      "epoch": 0.01927437641723356,
      "grad_norm": 44.85908508300781,
      "learning_rate": 1.1851851851851852e-05,
      "loss": 2.5237,
      "num_tokens": 18625.0,
      "step": 17
    },
    {
      "epoch": 0.02040816326530612,
      "grad_norm": 42.729736328125,
      "learning_rate": 1.2592592592592593e-05,
      "loss": 2.3821,
      "num_tokens": 19757.0,
      "step": 18
    },
    {
      "epoch": 0.021541950113378686,
      "grad_norm": 42.92080307006836,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.3942,
      "num_tokens": 20818.0,
      "step": 19
    },
    {
      "epoch": 0.022675736961451247,
      "grad_norm": 38.561309814453125,
      "learning_rate": 1.4074074074074075e-05,
      "loss": 2.4325,
      "num_tokens": 21939.0,
      "step": 20
    },
    {
      "epoch": 0.023809523809523808,
      "grad_norm": 36.45574188232422,
      "learning_rate": 1.4814814814814815e-05,
      "loss": 2.1893,
      "num_tokens": 23014.0,
      "step": 21
    },
    {
      "epoch": 0.024943310657596373,
      "grad_norm": 31.59442901611328,
      "learning_rate": 1.555555555555556e-05,
      "loss": 2.2218,
      "num_tokens": 24169.0,
      "step": 22
    },
    {
      "epoch": 0.026077097505668934,
      "grad_norm": 32.13104248046875,
      "learning_rate": 1.6296296296296297e-05,
      "loss": 2.2421,
      "num_tokens": 25249.0,
      "step": 23
    },
    {
      "epoch": 0.027210884353741496,
      "grad_norm": 30.995559692382812,
      "learning_rate": 1.7037037037037038e-05,
      "loss": 2.0935,
      "num_tokens": 26313.0,
      "step": 24
    },
    {
      "epoch": 0.02834467120181406,
      "grad_norm": 33.38172912597656,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 2.1663,
      "num_tokens": 27351.0,
      "step": 25
    },
    {
      "epoch": 0.02947845804988662,
      "grad_norm": 30.938949584960938,
      "learning_rate": 1.851851851851852e-05,
      "loss": 1.9916,
      "num_tokens": 28367.0,
      "step": 26
    },
    {
      "epoch": 0.030612244897959183,
      "grad_norm": 31.419017791748047,
      "learning_rate": 1.925925925925926e-05,
      "loss": 1.9991,
      "num_tokens": 29437.0,
      "step": 27
    },
    {
      "epoch": 0.031746031746031744,
      "grad_norm": 28.316110610961914,
      "learning_rate": 2e-05,
      "loss": 1.9932,
      "num_tokens": 30576.0,
      "step": 28
    },
    {
      "epoch": 0.032879818594104306,
      "grad_norm": 28.506546020507812,
      "learning_rate": 1.9976608187134504e-05,
      "loss": 1.9384,
      "num_tokens": 31772.0,
      "step": 29
    },
    {
      "epoch": 0.034013605442176874,
      "grad_norm": 30.74509048461914,
      "learning_rate": 1.9953216374269007e-05,
      "loss": 1.897,
      "num_tokens": 32868.0,
      "step": 30
    },
    {
      "epoch": 0.035147392290249435,
      "grad_norm": 29.190940856933594,
      "learning_rate": 1.992982456140351e-05,
      "loss": 2.0675,
      "num_tokens": 34025.0,
      "step": 31
    },
    {
      "epoch": 0.036281179138321996,
      "grad_norm": 27.97025489807129,
      "learning_rate": 1.9906432748538015e-05,
      "loss": 1.9301,
      "num_tokens": 35175.0,
      "step": 32
    },
    {
      "epoch": 0.03741496598639456,
      "grad_norm": 26.162158966064453,
      "learning_rate": 1.9883040935672515e-05,
      "loss": 1.8323,
      "num_tokens": 36242.0,
      "step": 33
    },
    {
      "epoch": 0.03854875283446712,
      "grad_norm": 23.444116592407227,
      "learning_rate": 1.9859649122807017e-05,
      "loss": 1.9189,
      "num_tokens": 37352.0,
      "step": 34
    },
    {
      "epoch": 0.03968253968253968,
      "grad_norm": 24.441686630249023,
      "learning_rate": 1.9836257309941523e-05,
      "loss": 1.8367,
      "num_tokens": 38425.0,
      "step": 35
    },
    {
      "epoch": 0.04081632653061224,
      "grad_norm": 23.87506675720215,
      "learning_rate": 1.9812865497076026e-05,
      "loss": 1.7329,
      "num_tokens": 39617.0,
      "step": 36
    },
    {
      "epoch": 0.04195011337868481,
      "grad_norm": 24.666210174560547,
      "learning_rate": 1.9789473684210528e-05,
      "loss": 1.8583,
      "num_tokens": 40651.0,
      "step": 37
    },
    {
      "epoch": 0.04308390022675737,
      "grad_norm": 21.076745986938477,
      "learning_rate": 1.976608187134503e-05,
      "loss": 1.7904,
      "num_tokens": 41788.0,
      "step": 38
    },
    {
      "epoch": 0.04421768707482993,
      "grad_norm": 18.543474197387695,
      "learning_rate": 1.9742690058479533e-05,
      "loss": 1.807,
      "num_tokens": 42935.0,
      "step": 39
    },
    {
      "epoch": 0.045351473922902494,
      "grad_norm": 19.128664016723633,
      "learning_rate": 1.9719298245614036e-05,
      "loss": 1.6529,
      "num_tokens": 44074.0,
      "step": 40
    },
    {
      "epoch": 0.046485260770975055,
      "grad_norm": 19.04387855529785,
      "learning_rate": 1.969590643274854e-05,
      "loss": 1.9339,
      "num_tokens": 45240.0,
      "step": 41
    },
    {
      "epoch": 0.047619047619047616,
      "grad_norm": 18.83653450012207,
      "learning_rate": 1.9672514619883044e-05,
      "loss": 1.7952,
      "num_tokens": 46381.0,
      "step": 42
    },
    {
      "epoch": 0.048752834467120185,
      "grad_norm": 20.276226043701172,
      "learning_rate": 1.9649122807017544e-05,
      "loss": 1.7274,
      "num_tokens": 47452.0,
      "step": 43
    },
    {
      "epoch": 0.049886621315192746,
      "grad_norm": 18.112844467163086,
      "learning_rate": 1.962573099415205e-05,
      "loss": 1.7377,
      "num_tokens": 48609.0,
      "step": 44
    },
    {
      "epoch": 0.05102040816326531,
      "grad_norm": 18.350265502929688,
      "learning_rate": 1.9602339181286552e-05,
      "loss": 1.4819,
      "num_tokens": 49744.0,
      "step": 45
    },
    {
      "epoch": 0.05215419501133787,
      "grad_norm": 19.062835693359375,
      "learning_rate": 1.9578947368421055e-05,
      "loss": 1.8832,
      "num_tokens": 50850.0,
      "step": 46
    },
    {
      "epoch": 0.05328798185941043,
      "grad_norm": 21.639110565185547,
      "learning_rate": 1.9555555555555557e-05,
      "loss": 1.7446,
      "num_tokens": 51963.0,
      "step": 47
    },
    {
      "epoch": 0.05442176870748299,
      "grad_norm": 18.648723602294922,
      "learning_rate": 1.953216374269006e-05,
      "loss": 1.6512,
      "num_tokens": 53070.0,
      "step": 48
    },
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 21.16545867919922,
      "learning_rate": 1.9508771929824562e-05,
      "loss": 1.7065,
      "num_tokens": 54154.0,
      "step": 49
    },
    {
      "epoch": 0.05668934240362812,
      "grad_norm": 18.10296630859375,
      "learning_rate": 1.9485380116959065e-05,
      "loss": 1.5725,
      "num_tokens": 55278.0,
      "step": 50
    },
    {
      "epoch": 0.05782312925170068,
      "grad_norm": 18.74391746520996,
      "learning_rate": 1.9461988304093568e-05,
      "loss": 1.8522,
      "num_tokens": 56381.0,
      "step": 51
    },
    {
      "epoch": 0.05895691609977324,
      "grad_norm": 20.406055450439453,
      "learning_rate": 1.9438596491228074e-05,
      "loss": 1.6082,
      "num_tokens": 57413.0,
      "step": 52
    },
    {
      "epoch": 0.060090702947845805,
      "grad_norm": 19.050594329833984,
      "learning_rate": 1.9415204678362573e-05,
      "loss": 1.7722,
      "num_tokens": 58555.0,
      "step": 53
    },
    {
      "epoch": 0.061224489795918366,
      "grad_norm": 19.197467803955078,
      "learning_rate": 1.939181286549708e-05,
      "loss": 1.6229,
      "num_tokens": 59581.0,
      "step": 54
    },
    {
      "epoch": 0.06235827664399093,
      "grad_norm": 20.112733840942383,
      "learning_rate": 1.936842105263158e-05,
      "loss": 1.71,
      "num_tokens": 60776.0,
      "step": 55
    },
    {
      "epoch": 0.06349206349206349,
      "grad_norm": 19.704727172851562,
      "learning_rate": 1.9345029239766084e-05,
      "loss": 1.6764,
      "num_tokens": 61957.0,
      "step": 56
    },
    {
      "epoch": 0.06462585034013606,
      "grad_norm": 19.901477813720703,
      "learning_rate": 1.9321637426900586e-05,
      "loss": 1.6584,
      "num_tokens": 63043.0,
      "step": 57
    },
    {
      "epoch": 0.06575963718820861,
      "grad_norm": 19.40325927734375,
      "learning_rate": 1.929824561403509e-05,
      "loss": 1.7776,
      "num_tokens": 64129.0,
      "step": 58
    },
    {
      "epoch": 0.06689342403628118,
      "grad_norm": 19.251150131225586,
      "learning_rate": 1.927485380116959e-05,
      "loss": 1.5785,
      "num_tokens": 65256.0,
      "step": 59
    },
    {
      "epoch": 0.06802721088435375,
      "grad_norm": 18.591737747192383,
      "learning_rate": 1.9251461988304094e-05,
      "loss": 1.6411,
      "num_tokens": 66376.0,
      "step": 60
    },
    {
      "epoch": 0.0691609977324263,
      "grad_norm": 18.51482582092285,
      "learning_rate": 1.9228070175438597e-05,
      "loss": 1.6408,
      "num_tokens": 67572.0,
      "step": 61
    },
    {
      "epoch": 0.07029478458049887,
      "grad_norm": 20.782194137573242,
      "learning_rate": 1.9204678362573103e-05,
      "loss": 1.5661,
      "num_tokens": 68652.0,
      "step": 62
    },
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 20.952245712280273,
      "learning_rate": 1.9181286549707602e-05,
      "loss": 1.6445,
      "num_tokens": 69693.0,
      "step": 63
    },
    {
      "epoch": 0.07256235827664399,
      "grad_norm": 19.55459213256836,
      "learning_rate": 1.9157894736842108e-05,
      "loss": 1.6929,
      "num_tokens": 70855.0,
      "step": 64
    },
    {
      "epoch": 0.07369614512471655,
      "grad_norm": 19.258163452148438,
      "learning_rate": 1.913450292397661e-05,
      "loss": 1.5356,
      "num_tokens": 71899.0,
      "step": 65
    },
    {
      "epoch": 0.07482993197278912,
      "grad_norm": 19.81553840637207,
      "learning_rate": 1.9111111111111113e-05,
      "loss": 1.5472,
      "num_tokens": 73092.0,
      "step": 66
    },
    {
      "epoch": 0.07596371882086168,
      "grad_norm": 18.654260635375977,
      "learning_rate": 1.9087719298245616e-05,
      "loss": 1.6884,
      "num_tokens": 74232.0,
      "step": 67
    },
    {
      "epoch": 0.07709750566893424,
      "grad_norm": 20.07318878173828,
      "learning_rate": 1.9064327485380118e-05,
      "loss": 1.7197,
      "num_tokens": 75347.0,
      "step": 68
    },
    {
      "epoch": 0.0782312925170068,
      "grad_norm": 20.34659767150879,
      "learning_rate": 1.904093567251462e-05,
      "loss": 1.736,
      "num_tokens": 76455.0,
      "step": 69
    },
    {
      "epoch": 0.07936507936507936,
      "grad_norm": 18.60862159729004,
      "learning_rate": 1.9017543859649123e-05,
      "loss": 1.7258,
      "num_tokens": 77648.0,
      "step": 70
    },
    {
      "epoch": 0.08049886621315193,
      "grad_norm": 19.377056121826172,
      "learning_rate": 1.8994152046783626e-05,
      "loss": 1.7971,
      "num_tokens": 78688.0,
      "step": 71
    },
    {
      "epoch": 0.08163265306122448,
      "grad_norm": 18.83735466003418,
      "learning_rate": 1.8970760233918132e-05,
      "loss": 1.4687,
      "num_tokens": 79760.0,
      "step": 72
    },
    {
      "epoch": 0.08276643990929705,
      "grad_norm": 19.88797950744629,
      "learning_rate": 1.894736842105263e-05,
      "loss": 1.6935,
      "num_tokens": 80889.0,
      "step": 73
    },
    {
      "epoch": 0.08390022675736962,
      "grad_norm": 20.648218154907227,
      "learning_rate": 1.8923976608187137e-05,
      "loss": 1.5982,
      "num_tokens": 81898.0,
      "step": 74
    },
    {
      "epoch": 0.08503401360544217,
      "grad_norm": 20.697324752807617,
      "learning_rate": 1.890058479532164e-05,
      "loss": 1.7866,
      "num_tokens": 83019.0,
      "step": 75
    },
    {
      "epoch": 0.08616780045351474,
      "grad_norm": 21.66934585571289,
      "learning_rate": 1.8877192982456142e-05,
      "loss": 1.7473,
      "num_tokens": 84034.0,
      "step": 76
    },
    {
      "epoch": 0.0873015873015873,
      "grad_norm": 18.273752212524414,
      "learning_rate": 1.8853801169590645e-05,
      "loss": 1.6144,
      "num_tokens": 85221.0,
      "step": 77
    },
    {
      "epoch": 0.08843537414965986,
      "grad_norm": 21.30136489868164,
      "learning_rate": 1.8830409356725147e-05,
      "loss": 1.6525,
      "num_tokens": 86291.0,
      "step": 78
    },
    {
      "epoch": 0.08956916099773243,
      "grad_norm": 21.55545425415039,
      "learning_rate": 1.880701754385965e-05,
      "loss": 1.7933,
      "num_tokens": 87273.0,
      "step": 79
    },
    {
      "epoch": 0.09070294784580499,
      "grad_norm": 20.109617233276367,
      "learning_rate": 1.8783625730994152e-05,
      "loss": 1.6552,
      "num_tokens": 88353.0,
      "step": 80
    },
    {
      "epoch": 0.09183673469387756,
      "grad_norm": 20.001720428466797,
      "learning_rate": 1.8760233918128655e-05,
      "loss": 1.6845,
      "num_tokens": 89389.0,
      "step": 81
    },
    {
      "epoch": 0.09297052154195011,
      "grad_norm": 20.805374145507812,
      "learning_rate": 1.873684210526316e-05,
      "loss": 1.597,
      "num_tokens": 90434.0,
      "step": 82
    },
    {
      "epoch": 0.09410430839002268,
      "grad_norm": 18.642532348632812,
      "learning_rate": 1.871345029239766e-05,
      "loss": 1.5591,
      "num_tokens": 91566.0,
      "step": 83
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 20.010791778564453,
      "learning_rate": 1.8690058479532166e-05,
      "loss": 1.6093,
      "num_tokens": 92699.0,
      "step": 84
    },
    {
      "epoch": 0.0963718820861678,
      "grad_norm": 21.23207664489746,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.7362,
      "num_tokens": 93770.0,
      "step": 85
    },
    {
      "epoch": 0.09750566893424037,
      "grad_norm": 22.148468017578125,
      "learning_rate": 1.864327485380117e-05,
      "loss": 1.6206,
      "num_tokens": 94927.0,
      "step": 86
    },
    {
      "epoch": 0.09863945578231292,
      "grad_norm": 19.68358039855957,
      "learning_rate": 1.8619883040935674e-05,
      "loss": 1.7617,
      "num_tokens": 96064.0,
      "step": 87
    },
    {
      "epoch": 0.09977324263038549,
      "grad_norm": 20.17274284362793,
      "learning_rate": 1.8596491228070176e-05,
      "loss": 1.6107,
      "num_tokens": 97169.0,
      "step": 88
    },
    {
      "epoch": 0.10090702947845805,
      "grad_norm": 19.35859489440918,
      "learning_rate": 1.857309941520468e-05,
      "loss": 1.5928,
      "num_tokens": 98366.0,
      "step": 89
    },
    {
      "epoch": 0.10204081632653061,
      "grad_norm": 21.101551055908203,
      "learning_rate": 1.854970760233918e-05,
      "loss": 1.6669,
      "num_tokens": 99457.0,
      "step": 90
    },
    {
      "epoch": 0.10317460317460317,
      "grad_norm": 21.70389175415039,
      "learning_rate": 1.8526315789473684e-05,
      "loss": 1.7066,
      "num_tokens": 100599.0,
      "step": 91
    },
    {
      "epoch": 0.10430839002267574,
      "grad_norm": 19.57090950012207,
      "learning_rate": 1.850292397660819e-05,
      "loss": 1.4318,
      "num_tokens": 101813.0,
      "step": 92
    },
    {
      "epoch": 0.1054421768707483,
      "grad_norm": 20.19015884399414,
      "learning_rate": 1.847953216374269e-05,
      "loss": 1.5283,
      "num_tokens": 102920.0,
      "step": 93
    },
    {
      "epoch": 0.10657596371882086,
      "grad_norm": 21.393760681152344,
      "learning_rate": 1.8456140350877195e-05,
      "loss": 1.574,
      "num_tokens": 104083.0,
      "step": 94
    },
    {
      "epoch": 0.10770975056689343,
      "grad_norm": 19.35205078125,
      "learning_rate": 1.8432748538011698e-05,
      "loss": 1.4985,
      "num_tokens": 105252.0,
      "step": 95
    },
    {
      "epoch": 0.10884353741496598,
      "grad_norm": 22.61254119873047,
      "learning_rate": 1.84093567251462e-05,
      "loss": 1.5807,
      "num_tokens": 106354.0,
      "step": 96
    },
    {
      "epoch": 0.10997732426303855,
      "grad_norm": 21.425067901611328,
      "learning_rate": 1.8385964912280703e-05,
      "loss": 1.6194,
      "num_tokens": 107532.0,
      "step": 97
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 20.971893310546875,
      "learning_rate": 1.8362573099415205e-05,
      "loss": 1.6884,
      "num_tokens": 108667.0,
      "step": 98
    },
    {
      "epoch": 0.11224489795918367,
      "grad_norm": 22.08867073059082,
      "learning_rate": 1.833918128654971e-05,
      "loss": 1.6521,
      "num_tokens": 109714.0,
      "step": 99
    },
    {
      "epoch": 0.11337868480725624,
      "grad_norm": 19.163150787353516,
      "learning_rate": 1.831578947368421e-05,
      "loss": 1.7947,
      "num_tokens": 110894.0,
      "step": 100
    },
    {
      "epoch": 0.1145124716553288,
      "grad_norm": 20.42814826965332,
      "learning_rate": 1.8292397660818713e-05,
      "loss": 1.6677,
      "num_tokens": 111990.0,
      "step": 101
    },
    {
      "epoch": 0.11564625850340136,
      "grad_norm": 20.525667190551758,
      "learning_rate": 1.826900584795322e-05,
      "loss": 1.5593,
      "num_tokens": 113085.0,
      "step": 102
    },
    {
      "epoch": 0.11678004535147392,
      "grad_norm": 21.172090530395508,
      "learning_rate": 1.824561403508772e-05,
      "loss": 1.6892,
      "num_tokens": 114209.0,
      "step": 103
    },
    {
      "epoch": 0.11791383219954649,
      "grad_norm": 19.004167556762695,
      "learning_rate": 1.8222222222222224e-05,
      "loss": 1.6172,
      "num_tokens": 115268.0,
      "step": 104
    },
    {
      "epoch": 0.11904761904761904,
      "grad_norm": 21.883514404296875,
      "learning_rate": 1.8198830409356727e-05,
      "loss": 1.5572,
      "num_tokens": 116382.0,
      "step": 105
    },
    {
      "epoch": 0.12018140589569161,
      "grad_norm": 20.1040096282959,
      "learning_rate": 1.817543859649123e-05,
      "loss": 1.6517,
      "num_tokens": 117504.0,
      "step": 106
    },
    {
      "epoch": 0.12131519274376418,
      "grad_norm": 19.666257858276367,
      "learning_rate": 1.8152046783625732e-05,
      "loss": 1.6365,
      "num_tokens": 118698.0,
      "step": 107
    },
    {
      "epoch": 0.12244897959183673,
      "grad_norm": 21.657516479492188,
      "learning_rate": 1.8128654970760235e-05,
      "loss": 1.7495,
      "num_tokens": 119791.0,
      "step": 108
    },
    {
      "epoch": 0.1235827664399093,
      "grad_norm": 20.718935012817383,
      "learning_rate": 1.810526315789474e-05,
      "loss": 1.7333,
      "num_tokens": 120996.0,
      "step": 109
    },
    {
      "epoch": 0.12471655328798185,
      "grad_norm": 20.30824851989746,
      "learning_rate": 1.808187134502924e-05,
      "loss": 1.5361,
      "num_tokens": 122066.0,
      "step": 110
    },
    {
      "epoch": 0.12585034013605442,
      "grad_norm": 21.735916137695312,
      "learning_rate": 1.8058479532163746e-05,
      "loss": 1.5749,
      "num_tokens": 123250.0,
      "step": 111
    },
    {
      "epoch": 0.12698412698412698,
      "grad_norm": 22.563383102416992,
      "learning_rate": 1.8035087719298248e-05,
      "loss": 1.562,
      "num_tokens": 124337.0,
      "step": 112
    },
    {
      "epoch": 0.12811791383219956,
      "grad_norm": 22.8643856048584,
      "learning_rate": 1.8011695906432747e-05,
      "loss": 1.5494,
      "num_tokens": 125423.0,
      "step": 113
    },
    {
      "epoch": 0.1292517006802721,
      "grad_norm": 22.234722137451172,
      "learning_rate": 1.7988304093567253e-05,
      "loss": 1.7195,
      "num_tokens": 126544.0,
      "step": 114
    },
    {
      "epoch": 0.13038548752834467,
      "grad_norm": 21.315336227416992,
      "learning_rate": 1.7964912280701756e-05,
      "loss": 1.7304,
      "num_tokens": 127718.0,
      "step": 115
    },
    {
      "epoch": 0.13151927437641722,
      "grad_norm": 22.4656925201416,
      "learning_rate": 1.794152046783626e-05,
      "loss": 1.6433,
      "num_tokens": 128897.0,
      "step": 116
    },
    {
      "epoch": 0.1326530612244898,
      "grad_norm": 22.329336166381836,
      "learning_rate": 1.791812865497076e-05,
      "loss": 1.6484,
      "num_tokens": 130034.0,
      "step": 117
    },
    {
      "epoch": 0.13378684807256236,
      "grad_norm": 20.504932403564453,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 1.5951,
      "num_tokens": 131196.0,
      "step": 118
    },
    {
      "epoch": 0.1349206349206349,
      "grad_norm": 20.69579315185547,
      "learning_rate": 1.787134502923977e-05,
      "loss": 1.589,
      "num_tokens": 132362.0,
      "step": 119
    },
    {
      "epoch": 0.1360544217687075,
      "grad_norm": 21.02338218688965,
      "learning_rate": 1.784795321637427e-05,
      "loss": 1.6319,
      "num_tokens": 133454.0,
      "step": 120
    },
    {
      "epoch": 0.13718820861678005,
      "grad_norm": 21.11552619934082,
      "learning_rate": 1.7824561403508775e-05,
      "loss": 1.6886,
      "num_tokens": 134552.0,
      "step": 121
    },
    {
      "epoch": 0.1383219954648526,
      "grad_norm": 19.719133377075195,
      "learning_rate": 1.7801169590643277e-05,
      "loss": 1.6521,
      "num_tokens": 135670.0,
      "step": 122
    },
    {
      "epoch": 0.13945578231292516,
      "grad_norm": 25.066865921020508,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 1.743,
      "num_tokens": 136682.0,
      "step": 123
    },
    {
      "epoch": 0.14058956916099774,
      "grad_norm": 21.543785095214844,
      "learning_rate": 1.7754385964912283e-05,
      "loss": 1.6218,
      "num_tokens": 137759.0,
      "step": 124
    },
    {
      "epoch": 0.1417233560090703,
      "grad_norm": 21.717317581176758,
      "learning_rate": 1.7730994152046785e-05,
      "loss": 1.6657,
      "num_tokens": 138859.0,
      "step": 125
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 21.426856994628906,
      "learning_rate": 1.7707602339181288e-05,
      "loss": 1.4688,
      "num_tokens": 140019.0,
      "step": 126
    },
    {
      "epoch": 0.14399092970521543,
      "grad_norm": 23.49262237548828,
      "learning_rate": 1.768421052631579e-05,
      "loss": 1.5918,
      "num_tokens": 141070.0,
      "step": 127
    },
    {
      "epoch": 0.14512471655328799,
      "grad_norm": 23.134132385253906,
      "learning_rate": 1.7660818713450293e-05,
      "loss": 1.5323,
      "num_tokens": 142190.0,
      "step": 128
    },
    {
      "epoch": 0.14625850340136054,
      "grad_norm": 21.138839721679688,
      "learning_rate": 1.76374269005848e-05,
      "loss": 1.4775,
      "num_tokens": 143354.0,
      "step": 129
    },
    {
      "epoch": 0.1473922902494331,
      "grad_norm": 25.536008834838867,
      "learning_rate": 1.7614035087719298e-05,
      "loss": 1.5466,
      "num_tokens": 144426.0,
      "step": 130
    },
    {
      "epoch": 0.14852607709750568,
      "grad_norm": 23.146469116210938,
      "learning_rate": 1.7590643274853804e-05,
      "loss": 1.5849,
      "num_tokens": 145548.0,
      "step": 131
    },
    {
      "epoch": 0.14965986394557823,
      "grad_norm": 25.74847412109375,
      "learning_rate": 1.7567251461988307e-05,
      "loss": 1.5719,
      "num_tokens": 146672.0,
      "step": 132
    },
    {
      "epoch": 0.15079365079365079,
      "grad_norm": 27.166261672973633,
      "learning_rate": 1.754385964912281e-05,
      "loss": 1.5535,
      "num_tokens": 147779.0,
      "step": 133
    },
    {
      "epoch": 0.15192743764172337,
      "grad_norm": 36.15412139892578,
      "learning_rate": 1.752046783625731e-05,
      "loss": 1.5972,
      "num_tokens": 148689.0,
      "step": 134
    },
    {
      "epoch": 0.15306122448979592,
      "grad_norm": 27.721406936645508,
      "learning_rate": 1.7497076023391814e-05,
      "loss": 1.7406,
      "num_tokens": 149900.0,
      "step": 135
    },
    {
      "epoch": 0.15419501133786848,
      "grad_norm": 28.549758911132812,
      "learning_rate": 1.7473684210526317e-05,
      "loss": 1.5852,
      "num_tokens": 151079.0,
      "step": 136
    },
    {
      "epoch": 0.15532879818594103,
      "grad_norm": 29.91474151611328,
      "learning_rate": 1.745029239766082e-05,
      "loss": 1.6233,
      "num_tokens": 152168.0,
      "step": 137
    },
    {
      "epoch": 0.1564625850340136,
      "grad_norm": 29.763633728027344,
      "learning_rate": 1.7426900584795322e-05,
      "loss": 1.4513,
      "num_tokens": 153262.0,
      "step": 138
    },
    {
      "epoch": 0.15759637188208617,
      "grad_norm": 30.80873680114746,
      "learning_rate": 1.7403508771929828e-05,
      "loss": 1.4502,
      "num_tokens": 154398.0,
      "step": 139
    },
    {
      "epoch": 0.15873015873015872,
      "grad_norm": 32.4648323059082,
      "learning_rate": 1.7380116959064327e-05,
      "loss": 1.515,
      "num_tokens": 155438.0,
      "step": 140
    },
    {
      "epoch": 0.1598639455782313,
      "grad_norm": 30.387754440307617,
      "learning_rate": 1.7356725146198833e-05,
      "loss": 1.5156,
      "num_tokens": 156535.0,
      "step": 141
    },
    {
      "epoch": 0.16099773242630386,
      "grad_norm": 26.93579864501953,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 1.5558,
      "num_tokens": 157676.0,
      "step": 142
    },
    {
      "epoch": 0.1621315192743764,
      "grad_norm": 23.415470123291016,
      "learning_rate": 1.7309941520467838e-05,
      "loss": 1.5061,
      "num_tokens": 158849.0,
      "step": 143
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 25.518234252929688,
      "learning_rate": 1.728654970760234e-05,
      "loss": 1.7354,
      "num_tokens": 159943.0,
      "step": 144
    },
    {
      "epoch": 0.16439909297052155,
      "grad_norm": 20.67085838317871,
      "learning_rate": 1.7263157894736843e-05,
      "loss": 1.7097,
      "num_tokens": 161152.0,
      "step": 145
    },
    {
      "epoch": 0.1655328798185941,
      "grad_norm": 20.089345932006836,
      "learning_rate": 1.7239766081871346e-05,
      "loss": 1.5232,
      "num_tokens": 162350.0,
      "step": 146
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 22.614852905273438,
      "learning_rate": 1.721637426900585e-05,
      "loss": 1.3706,
      "num_tokens": 163374.0,
      "step": 147
    },
    {
      "epoch": 0.16780045351473924,
      "grad_norm": 21.45526695251465,
      "learning_rate": 1.719298245614035e-05,
      "loss": 1.5264,
      "num_tokens": 164537.0,
      "step": 148
    },
    {
      "epoch": 0.1689342403628118,
      "grad_norm": 23.220937728881836,
      "learning_rate": 1.7169590643274857e-05,
      "loss": 1.4362,
      "num_tokens": 165585.0,
      "step": 149
    },
    {
      "epoch": 0.17006802721088435,
      "grad_norm": 24.269975662231445,
      "learning_rate": 1.7146198830409356e-05,
      "loss": 1.5262,
      "num_tokens": 166600.0,
      "step": 150
    },
    {
      "epoch": 0.1712018140589569,
      "grad_norm": 23.387495040893555,
      "learning_rate": 1.7122807017543862e-05,
      "loss": 1.4802,
      "num_tokens": 167686.0,
      "step": 151
    },
    {
      "epoch": 0.17233560090702948,
      "grad_norm": 23.54978370666504,
      "learning_rate": 1.7099415204678365e-05,
      "loss": 1.5707,
      "num_tokens": 168793.0,
      "step": 152
    },
    {
      "epoch": 0.17346938775510204,
      "grad_norm": 25.489028930664062,
      "learning_rate": 1.7076023391812867e-05,
      "loss": 1.5879,
      "num_tokens": 169843.0,
      "step": 153
    },
    {
      "epoch": 0.1746031746031746,
      "grad_norm": 23.096094131469727,
      "learning_rate": 1.705263157894737e-05,
      "loss": 1.4339,
      "num_tokens": 170851.0,
      "step": 154
    },
    {
      "epoch": 0.17573696145124718,
      "grad_norm": 22.319326400756836,
      "learning_rate": 1.7029239766081872e-05,
      "loss": 1.6513,
      "num_tokens": 171960.0,
      "step": 155
    },
    {
      "epoch": 0.17687074829931973,
      "grad_norm": 20.970462799072266,
      "learning_rate": 1.7005847953216375e-05,
      "loss": 1.5231,
      "num_tokens": 173095.0,
      "step": 156
    },
    {
      "epoch": 0.17800453514739228,
      "grad_norm": 22.09793472290039,
      "learning_rate": 1.6982456140350878e-05,
      "loss": 1.5093,
      "num_tokens": 174221.0,
      "step": 157
    },
    {
      "epoch": 0.17913832199546487,
      "grad_norm": 21.099365234375,
      "learning_rate": 1.695906432748538e-05,
      "loss": 1.5163,
      "num_tokens": 175405.0,
      "step": 158
    },
    {
      "epoch": 0.18027210884353742,
      "grad_norm": 24.84507179260254,
      "learning_rate": 1.6935672514619886e-05,
      "loss": 1.4498,
      "num_tokens": 176445.0,
      "step": 159
    },
    {
      "epoch": 0.18140589569160998,
      "grad_norm": 20.663394927978516,
      "learning_rate": 1.6912280701754385e-05,
      "loss": 1.4815,
      "num_tokens": 177648.0,
      "step": 160
    },
    {
      "epoch": 0.18253968253968253,
      "grad_norm": 20.654090881347656,
      "learning_rate": 1.688888888888889e-05,
      "loss": 1.534,
      "num_tokens": 178831.0,
      "step": 161
    },
    {
      "epoch": 0.1836734693877551,
      "grad_norm": 21.591787338256836,
      "learning_rate": 1.6865497076023394e-05,
      "loss": 1.4529,
      "num_tokens": 179986.0,
      "step": 162
    },
    {
      "epoch": 0.18480725623582767,
      "grad_norm": 21.668750762939453,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 1.5613,
      "num_tokens": 181201.0,
      "step": 163
    },
    {
      "epoch": 0.18594104308390022,
      "grad_norm": 23.887989044189453,
      "learning_rate": 1.68187134502924e-05,
      "loss": 1.654,
      "num_tokens": 182310.0,
      "step": 164
    },
    {
      "epoch": 0.1870748299319728,
      "grad_norm": 21.20265007019043,
      "learning_rate": 1.67953216374269e-05,
      "loss": 1.561,
      "num_tokens": 183432.0,
      "step": 165
    },
    {
      "epoch": 0.18820861678004536,
      "grad_norm": 22.128807067871094,
      "learning_rate": 1.6771929824561408e-05,
      "loss": 1.6709,
      "num_tokens": 184597.0,
      "step": 166
    },
    {
      "epoch": 0.1893424036281179,
      "grad_norm": 24.24042320251465,
      "learning_rate": 1.6748538011695907e-05,
      "loss": 1.5693,
      "num_tokens": 185668.0,
      "step": 167
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 21.286191940307617,
      "learning_rate": 1.672514619883041e-05,
      "loss": 1.4641,
      "num_tokens": 186778.0,
      "step": 168
    },
    {
      "epoch": 0.19160997732426305,
      "grad_norm": 21.220508575439453,
      "learning_rate": 1.6701754385964915e-05,
      "loss": 1.6059,
      "num_tokens": 187963.0,
      "step": 169
    },
    {
      "epoch": 0.1927437641723356,
      "grad_norm": 20.513689041137695,
      "learning_rate": 1.6678362573099414e-05,
      "loss": 1.496,
      "num_tokens": 189061.0,
      "step": 170
    },
    {
      "epoch": 0.19387755102040816,
      "grad_norm": 22.597923278808594,
      "learning_rate": 1.665497076023392e-05,
      "loss": 1.5342,
      "num_tokens": 190184.0,
      "step": 171
    },
    {
      "epoch": 0.19501133786848074,
      "grad_norm": 22.039106369018555,
      "learning_rate": 1.6631578947368423e-05,
      "loss": 1.4233,
      "num_tokens": 191228.0,
      "step": 172
    },
    {
      "epoch": 0.1961451247165533,
      "grad_norm": 21.834428787231445,
      "learning_rate": 1.6608187134502926e-05,
      "loss": 1.461,
      "num_tokens": 192388.0,
      "step": 173
    },
    {
      "epoch": 0.19727891156462585,
      "grad_norm": 20.996660232543945,
      "learning_rate": 1.6584795321637428e-05,
      "loss": 1.5707,
      "num_tokens": 193528.0,
      "step": 174
    },
    {
      "epoch": 0.1984126984126984,
      "grad_norm": 21.746488571166992,
      "learning_rate": 1.656140350877193e-05,
      "loss": 1.4698,
      "num_tokens": 194550.0,
      "step": 175
    },
    {
      "epoch": 0.19954648526077098,
      "grad_norm": 22.856365203857422,
      "learning_rate": 1.6538011695906437e-05,
      "loss": 1.4731,
      "num_tokens": 195677.0,
      "step": 176
    },
    {
      "epoch": 0.20068027210884354,
      "grad_norm": 21.368350982666016,
      "learning_rate": 1.6514619883040936e-05,
      "loss": 1.481,
      "num_tokens": 196912.0,
      "step": 177
    },
    {
      "epoch": 0.2018140589569161,
      "grad_norm": 20.245771408081055,
      "learning_rate": 1.649122807017544e-05,
      "loss": 1.4977,
      "num_tokens": 198043.0,
      "step": 178
    },
    {
      "epoch": 0.20294784580498867,
      "grad_norm": 25.99681282043457,
      "learning_rate": 1.6467836257309944e-05,
      "loss": 1.7099,
      "num_tokens": 199157.0,
      "step": 179
    },
    {
      "epoch": 0.20408163265306123,
      "grad_norm": 22.558382034301758,
      "learning_rate": 1.6444444444444444e-05,
      "loss": 1.5767,
      "num_tokens": 200255.0,
      "step": 180
    },
    {
      "epoch": 0.20521541950113378,
      "grad_norm": 22.796449661254883,
      "learning_rate": 1.642105263157895e-05,
      "loss": 1.4589,
      "num_tokens": 201297.0,
      "step": 181
    },
    {
      "epoch": 0.20634920634920634,
      "grad_norm": 20.285388946533203,
      "learning_rate": 1.6397660818713452e-05,
      "loss": 1.4589,
      "num_tokens": 202492.0,
      "step": 182
    },
    {
      "epoch": 0.20748299319727892,
      "grad_norm": 20.92437744140625,
      "learning_rate": 1.6374269005847955e-05,
      "loss": 1.5125,
      "num_tokens": 203650.0,
      "step": 183
    },
    {
      "epoch": 0.20861678004535147,
      "grad_norm": 22.35742950439453,
      "learning_rate": 1.6350877192982457e-05,
      "loss": 1.442,
      "num_tokens": 204723.0,
      "step": 184
    },
    {
      "epoch": 0.20975056689342403,
      "grad_norm": 24.2554931640625,
      "learning_rate": 1.632748538011696e-05,
      "loss": 1.4875,
      "num_tokens": 205792.0,
      "step": 185
    },
    {
      "epoch": 0.2108843537414966,
      "grad_norm": 23.882658004760742,
      "learning_rate": 1.6304093567251466e-05,
      "loss": 1.6708,
      "num_tokens": 206878.0,
      "step": 186
    },
    {
      "epoch": 0.21201814058956917,
      "grad_norm": 22.755496978759766,
      "learning_rate": 1.6280701754385965e-05,
      "loss": 1.5278,
      "num_tokens": 208071.0,
      "step": 187
    },
    {
      "epoch": 0.21315192743764172,
      "grad_norm": 22.447935104370117,
      "learning_rate": 1.625730994152047e-05,
      "loss": 1.4872,
      "num_tokens": 209227.0,
      "step": 188
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 22.08466339111328,
      "learning_rate": 1.6233918128654974e-05,
      "loss": 1.5934,
      "num_tokens": 210372.0,
      "step": 189
    },
    {
      "epoch": 0.21541950113378686,
      "grad_norm": 21.285282135009766,
      "learning_rate": 1.6210526315789473e-05,
      "loss": 1.4961,
      "num_tokens": 211604.0,
      "step": 190
    },
    {
      "epoch": 0.2165532879818594,
      "grad_norm": 21.292037963867188,
      "learning_rate": 1.618713450292398e-05,
      "loss": 1.503,
      "num_tokens": 212855.0,
      "step": 191
    },
    {
      "epoch": 0.21768707482993196,
      "grad_norm": 21.15336036682129,
      "learning_rate": 1.616374269005848e-05,
      "loss": 1.4699,
      "num_tokens": 213956.0,
      "step": 192
    },
    {
      "epoch": 0.21882086167800455,
      "grad_norm": 22.280893325805664,
      "learning_rate": 1.6140350877192984e-05,
      "loss": 1.6332,
      "num_tokens": 215120.0,
      "step": 193
    },
    {
      "epoch": 0.2199546485260771,
      "grad_norm": 22.338592529296875,
      "learning_rate": 1.6116959064327486e-05,
      "loss": 1.4444,
      "num_tokens": 216206.0,
      "step": 194
    },
    {
      "epoch": 0.22108843537414966,
      "grad_norm": 20.818641662597656,
      "learning_rate": 1.609356725146199e-05,
      "loss": 1.4245,
      "num_tokens": 217303.0,
      "step": 195
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 23.549814224243164,
      "learning_rate": 1.6070175438596495e-05,
      "loss": 1.5753,
      "num_tokens": 218327.0,
      "step": 196
    },
    {
      "epoch": 0.2233560090702948,
      "grad_norm": 22.222686767578125,
      "learning_rate": 1.6046783625730994e-05,
      "loss": 1.3668,
      "num_tokens": 219485.0,
      "step": 197
    },
    {
      "epoch": 0.22448979591836735,
      "grad_norm": 19.54814910888672,
      "learning_rate": 1.60233918128655e-05,
      "loss": 1.6546,
      "num_tokens": 220708.0,
      "step": 198
    },
    {
      "epoch": 0.2256235827664399,
      "grad_norm": 20.543609619140625,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.5096,
      "num_tokens": 221781.0,
      "step": 199
    },
    {
      "epoch": 0.22675736961451248,
      "grad_norm": 21.107006072998047,
      "learning_rate": 1.5976608187134505e-05,
      "loss": 1.5088,
      "num_tokens": 222829.0,
      "step": 200
    },
    {
      "epoch": 0.22789115646258504,
      "grad_norm": 23.751380920410156,
      "learning_rate": 1.5953216374269008e-05,
      "loss": 1.4793,
      "num_tokens": 224071.0,
      "step": 201
    },
    {
      "epoch": 0.2290249433106576,
      "grad_norm": 22.424638748168945,
      "learning_rate": 1.592982456140351e-05,
      "loss": 1.6096,
      "num_tokens": 225179.0,
      "step": 202
    },
    {
      "epoch": 0.23015873015873015,
      "grad_norm": 22.971948623657227,
      "learning_rate": 1.5906432748538013e-05,
      "loss": 1.4364,
      "num_tokens": 226250.0,
      "step": 203
    },
    {
      "epoch": 0.23129251700680273,
      "grad_norm": 21.397802352905273,
      "learning_rate": 1.5883040935672516e-05,
      "loss": 1.5396,
      "num_tokens": 227380.0,
      "step": 204
    },
    {
      "epoch": 0.23242630385487528,
      "grad_norm": 46.37774658203125,
      "learning_rate": 1.5859649122807018e-05,
      "loss": 1.43,
      "num_tokens": 228565.0,
      "step": 205
    },
    {
      "epoch": 0.23356009070294784,
      "grad_norm": 24.68637466430664,
      "learning_rate": 1.583625730994152e-05,
      "loss": 1.683,
      "num_tokens": 229639.0,
      "step": 206
    },
    {
      "epoch": 0.23469387755102042,
      "grad_norm": 21.283252716064453,
      "learning_rate": 1.5812865497076023e-05,
      "loss": 1.5629,
      "num_tokens": 230753.0,
      "step": 207
    },
    {
      "epoch": 0.23582766439909297,
      "grad_norm": 22.06300926208496,
      "learning_rate": 1.578947368421053e-05,
      "loss": 1.5683,
      "num_tokens": 231866.0,
      "step": 208
    },
    {
      "epoch": 0.23696145124716553,
      "grad_norm": 21.9282283782959,
      "learning_rate": 1.5766081871345032e-05,
      "loss": 1.5273,
      "num_tokens": 233001.0,
      "step": 209
    },
    {
      "epoch": 0.23809523809523808,
      "grad_norm": 21.07894515991211,
      "learning_rate": 1.5742690058479534e-05,
      "loss": 1.5438,
      "num_tokens": 234113.0,
      "step": 210
    },
    {
      "epoch": 0.23922902494331066,
      "grad_norm": 19.786283493041992,
      "learning_rate": 1.5719298245614037e-05,
      "loss": 1.3995,
      "num_tokens": 235294.0,
      "step": 211
    },
    {
      "epoch": 0.24036281179138322,
      "grad_norm": 23.547483444213867,
      "learning_rate": 1.569590643274854e-05,
      "loss": 1.4131,
      "num_tokens": 236392.0,
      "step": 212
    },
    {
      "epoch": 0.24149659863945577,
      "grad_norm": 20.67225456237793,
      "learning_rate": 1.5672514619883042e-05,
      "loss": 1.4733,
      "num_tokens": 237522.0,
      "step": 213
    },
    {
      "epoch": 0.24263038548752835,
      "grad_norm": 22.142709732055664,
      "learning_rate": 1.5649122807017545e-05,
      "loss": 1.5663,
      "num_tokens": 238635.0,
      "step": 214
    },
    {
      "epoch": 0.2437641723356009,
      "grad_norm": 23.121814727783203,
      "learning_rate": 1.5625730994152047e-05,
      "loss": 1.3922,
      "num_tokens": 239690.0,
      "step": 215
    },
    {
      "epoch": 0.24489795918367346,
      "grad_norm": 25.34332847595215,
      "learning_rate": 1.560233918128655e-05,
      "loss": 1.529,
      "num_tokens": 240712.0,
      "step": 216
    },
    {
      "epoch": 0.24603174603174602,
      "grad_norm": 20.340042114257812,
      "learning_rate": 1.5578947368421052e-05,
      "loss": 1.4748,
      "num_tokens": 241899.0,
      "step": 217
    },
    {
      "epoch": 0.2471655328798186,
      "grad_norm": 21.851383209228516,
      "learning_rate": 1.555555555555556e-05,
      "loss": 1.537,
      "num_tokens": 242975.0,
      "step": 218
    },
    {
      "epoch": 0.24829931972789115,
      "grad_norm": 22.453187942504883,
      "learning_rate": 1.553216374269006e-05,
      "loss": 1.5451,
      "num_tokens": 244129.0,
      "step": 219
    },
    {
      "epoch": 0.2494331065759637,
      "grad_norm": 22.221010208129883,
      "learning_rate": 1.5508771929824563e-05,
      "loss": 1.5423,
      "num_tokens": 245290.0,
      "step": 220
    },
    {
      "epoch": 0.25056689342403626,
      "grad_norm": 22.524154663085938,
      "learning_rate": 1.5485380116959066e-05,
      "loss": 1.6421,
      "num_tokens": 246386.0,
      "step": 221
    },
    {
      "epoch": 0.25170068027210885,
      "grad_norm": 21.442996978759766,
      "learning_rate": 1.546198830409357e-05,
      "loss": 1.4855,
      "num_tokens": 247440.0,
      "step": 222
    },
    {
      "epoch": 0.2528344671201814,
      "grad_norm": 24.744312286376953,
      "learning_rate": 1.543859649122807e-05,
      "loss": 1.3662,
      "num_tokens": 248514.0,
      "step": 223
    },
    {
      "epoch": 0.25396825396825395,
      "grad_norm": 23.523515701293945,
      "learning_rate": 1.5415204678362574e-05,
      "loss": 1.4826,
      "num_tokens": 249642.0,
      "step": 224
    },
    {
      "epoch": 0.25510204081632654,
      "grad_norm": 23.982515335083008,
      "learning_rate": 1.5391812865497076e-05,
      "loss": 1.3806,
      "num_tokens": 250712.0,
      "step": 225
    },
    {
      "epoch": 0.2562358276643991,
      "grad_norm": 22.8784236907959,
      "learning_rate": 1.536842105263158e-05,
      "loss": 1.4332,
      "num_tokens": 251897.0,
      "step": 226
    },
    {
      "epoch": 0.25736961451247165,
      "grad_norm": 22.019994735717773,
      "learning_rate": 1.534502923976608e-05,
      "loss": 1.418,
      "num_tokens": 253031.0,
      "step": 227
    },
    {
      "epoch": 0.2585034013605442,
      "grad_norm": 22.86067008972168,
      "learning_rate": 1.5321637426900587e-05,
      "loss": 1.6491,
      "num_tokens": 254172.0,
      "step": 228
    },
    {
      "epoch": 0.25963718820861675,
      "grad_norm": 22.04460334777832,
      "learning_rate": 1.529824561403509e-05,
      "loss": 1.373,
      "num_tokens": 255350.0,
      "step": 229
    },
    {
      "epoch": 0.26077097505668934,
      "grad_norm": 21.948341369628906,
      "learning_rate": 1.5274853801169593e-05,
      "loss": 1.441,
      "num_tokens": 256554.0,
      "step": 230
    },
    {
      "epoch": 0.2619047619047619,
      "grad_norm": 22.99373435974121,
      "learning_rate": 1.5251461988304095e-05,
      "loss": 1.5293,
      "num_tokens": 257729.0,
      "step": 231
    },
    {
      "epoch": 0.26303854875283444,
      "grad_norm": 25.595821380615234,
      "learning_rate": 1.5228070175438598e-05,
      "loss": 1.4453,
      "num_tokens": 258806.0,
      "step": 232
    },
    {
      "epoch": 0.264172335600907,
      "grad_norm": 22.947647094726562,
      "learning_rate": 1.52046783625731e-05,
      "loss": 1.5168,
      "num_tokens": 259874.0,
      "step": 233
    },
    {
      "epoch": 0.2653061224489796,
      "grad_norm": 25.86823081970215,
      "learning_rate": 1.5181286549707603e-05,
      "loss": 1.5426,
      "num_tokens": 260959.0,
      "step": 234
    },
    {
      "epoch": 0.26643990929705214,
      "grad_norm": 20.40091896057129,
      "learning_rate": 1.5157894736842107e-05,
      "loss": 1.7117,
      "num_tokens": 262189.0,
      "step": 235
    },
    {
      "epoch": 0.2675736961451247,
      "grad_norm": 21.5074462890625,
      "learning_rate": 1.5134502923976608e-05,
      "loss": 1.4967,
      "num_tokens": 263381.0,
      "step": 236
    },
    {
      "epoch": 0.2687074829931973,
      "grad_norm": 20.92792320251465,
      "learning_rate": 1.5111111111111112e-05,
      "loss": 1.5214,
      "num_tokens": 264514.0,
      "step": 237
    },
    {
      "epoch": 0.2698412698412698,
      "grad_norm": 22.8386287689209,
      "learning_rate": 1.5087719298245615e-05,
      "loss": 1.5155,
      "num_tokens": 265629.0,
      "step": 238
    },
    {
      "epoch": 0.2709750566893424,
      "grad_norm": 21.064926147460938,
      "learning_rate": 1.5064327485380119e-05,
      "loss": 1.44,
      "num_tokens": 266750.0,
      "step": 239
    },
    {
      "epoch": 0.272108843537415,
      "grad_norm": 22.04184913635254,
      "learning_rate": 1.504093567251462e-05,
      "loss": 1.5298,
      "num_tokens": 267781.0,
      "step": 240
    },
    {
      "epoch": 0.2732426303854875,
      "grad_norm": 24.02085304260254,
      "learning_rate": 1.5017543859649124e-05,
      "loss": 1.425,
      "num_tokens": 268921.0,
      "step": 241
    },
    {
      "epoch": 0.2743764172335601,
      "grad_norm": 22.363149642944336,
      "learning_rate": 1.4994152046783627e-05,
      "loss": 1.4996,
      "num_tokens": 269985.0,
      "step": 242
    },
    {
      "epoch": 0.2755102040816326,
      "grad_norm": 23.14397430419922,
      "learning_rate": 1.497076023391813e-05,
      "loss": 1.5728,
      "num_tokens": 271085.0,
      "step": 243
    },
    {
      "epoch": 0.2766439909297052,
      "grad_norm": 21.870580673217773,
      "learning_rate": 1.4947368421052632e-05,
      "loss": 1.5506,
      "num_tokens": 272207.0,
      "step": 244
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 23.548410415649414,
      "learning_rate": 1.4923976608187136e-05,
      "loss": 1.4119,
      "num_tokens": 273195.0,
      "step": 245
    },
    {
      "epoch": 0.2789115646258503,
      "grad_norm": 22.918581008911133,
      "learning_rate": 1.4900584795321637e-05,
      "loss": 1.593,
      "num_tokens": 274395.0,
      "step": 246
    },
    {
      "epoch": 0.2800453514739229,
      "grad_norm": 24.52320671081543,
      "learning_rate": 1.4877192982456141e-05,
      "loss": 1.592,
      "num_tokens": 275413.0,
      "step": 247
    },
    {
      "epoch": 0.2811791383219955,
      "grad_norm": 25.395458221435547,
      "learning_rate": 1.4853801169590644e-05,
      "loss": 1.3833,
      "num_tokens": 276425.0,
      "step": 248
    },
    {
      "epoch": 0.282312925170068,
      "grad_norm": 20.522045135498047,
      "learning_rate": 1.4830409356725148e-05,
      "loss": 1.4719,
      "num_tokens": 277674.0,
      "step": 249
    },
    {
      "epoch": 0.2834467120181406,
      "grad_norm": 22.4309024810791,
      "learning_rate": 1.4807017543859649e-05,
      "loss": 1.3931,
      "num_tokens": 278761.0,
      "step": 250
    },
    {
      "epoch": 0.28458049886621317,
      "grad_norm": 23.054508209228516,
      "learning_rate": 1.4783625730994153e-05,
      "loss": 1.6357,
      "num_tokens": 279900.0,
      "step": 251
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 22.669353485107422,
      "learning_rate": 1.4760233918128658e-05,
      "loss": 1.5954,
      "num_tokens": 281152.0,
      "step": 252
    },
    {
      "epoch": 0.2868480725623583,
      "grad_norm": 24.546659469604492,
      "learning_rate": 1.4736842105263159e-05,
      "loss": 1.5536,
      "num_tokens": 282236.0,
      "step": 253
    },
    {
      "epoch": 0.28798185941043086,
      "grad_norm": 22.03496551513672,
      "learning_rate": 1.4713450292397661e-05,
      "loss": 1.4711,
      "num_tokens": 283294.0,
      "step": 254
    },
    {
      "epoch": 0.2891156462585034,
      "grad_norm": 23.8269100189209,
      "learning_rate": 1.4690058479532165e-05,
      "loss": 1.5215,
      "num_tokens": 284345.0,
      "step": 255
    },
    {
      "epoch": 0.29024943310657597,
      "grad_norm": 21.356388092041016,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 1.6076,
      "num_tokens": 285419.0,
      "step": 256
    },
    {
      "epoch": 0.29138321995464855,
      "grad_norm": 20.89931869506836,
      "learning_rate": 1.464327485380117e-05,
      "loss": 1.4129,
      "num_tokens": 286535.0,
      "step": 257
    },
    {
      "epoch": 0.2925170068027211,
      "grad_norm": 21.1955623626709,
      "learning_rate": 1.4619883040935675e-05,
      "loss": 1.469,
      "num_tokens": 287631.0,
      "step": 258
    },
    {
      "epoch": 0.29365079365079366,
      "grad_norm": 19.76694107055664,
      "learning_rate": 1.4596491228070177e-05,
      "loss": 1.6142,
      "num_tokens": 288754.0,
      "step": 259
    },
    {
      "epoch": 0.2947845804988662,
      "grad_norm": 21.243698120117188,
      "learning_rate": 1.4573099415204678e-05,
      "loss": 1.5581,
      "num_tokens": 289958.0,
      "step": 260
    },
    {
      "epoch": 0.29591836734693877,
      "grad_norm": 22.617021560668945,
      "learning_rate": 1.4549707602339183e-05,
      "loss": 1.4423,
      "num_tokens": 291137.0,
      "step": 261
    },
    {
      "epoch": 0.29705215419501135,
      "grad_norm": 20.3275146484375,
      "learning_rate": 1.4526315789473687e-05,
      "loss": 1.5287,
      "num_tokens": 292312.0,
      "step": 262
    },
    {
      "epoch": 0.2981859410430839,
      "grad_norm": 26.21241569519043,
      "learning_rate": 1.4502923976608188e-05,
      "loss": 1.4952,
      "num_tokens": 293355.0,
      "step": 263
    },
    {
      "epoch": 0.29931972789115646,
      "grad_norm": 23.586990356445312,
      "learning_rate": 1.447953216374269e-05,
      "loss": 1.4412,
      "num_tokens": 294480.0,
      "step": 264
    },
    {
      "epoch": 0.30045351473922904,
      "grad_norm": 23.049999237060547,
      "learning_rate": 1.4456140350877195e-05,
      "loss": 1.5597,
      "num_tokens": 295543.0,
      "step": 265
    },
    {
      "epoch": 0.30158730158730157,
      "grad_norm": 22.225637435913086,
      "learning_rate": 1.4432748538011695e-05,
      "loss": 1.5182,
      "num_tokens": 296682.0,
      "step": 266
    },
    {
      "epoch": 0.30272108843537415,
      "grad_norm": 22.19669532775879,
      "learning_rate": 1.44093567251462e-05,
      "loss": 1.5603,
      "num_tokens": 297762.0,
      "step": 267
    },
    {
      "epoch": 0.30385487528344673,
      "grad_norm": 20.865692138671875,
      "learning_rate": 1.4385964912280704e-05,
      "loss": 1.4716,
      "num_tokens": 298916.0,
      "step": 268
    },
    {
      "epoch": 0.30498866213151926,
      "grad_norm": 24.449691772460938,
      "learning_rate": 1.4362573099415207e-05,
      "loss": 1.3971,
      "num_tokens": 299936.0,
      "step": 269
    },
    {
      "epoch": 0.30612244897959184,
      "grad_norm": 21.59602165222168,
      "learning_rate": 1.4339181286549707e-05,
      "loss": 1.5539,
      "num_tokens": 301074.0,
      "step": 270
    },
    {
      "epoch": 0.3072562358276644,
      "grad_norm": 23.660263061523438,
      "learning_rate": 1.4315789473684212e-05,
      "loss": 1.4892,
      "num_tokens": 302125.0,
      "step": 271
    },
    {
      "epoch": 0.30839002267573695,
      "grad_norm": 22.853479385375977,
      "learning_rate": 1.4292397660818716e-05,
      "loss": 1.4667,
      "num_tokens": 303263.0,
      "step": 272
    },
    {
      "epoch": 0.30952380952380953,
      "grad_norm": 23.96630859375,
      "learning_rate": 1.4269005847953217e-05,
      "loss": 1.4877,
      "num_tokens": 304463.0,
      "step": 273
    },
    {
      "epoch": 0.31065759637188206,
      "grad_norm": 21.596799850463867,
      "learning_rate": 1.4245614035087721e-05,
      "loss": 1.4767,
      "num_tokens": 305660.0,
      "step": 274
    },
    {
      "epoch": 0.31179138321995464,
      "grad_norm": 23.900022506713867,
      "learning_rate": 1.4222222222222224e-05,
      "loss": 1.4882,
      "num_tokens": 306684.0,
      "step": 275
    },
    {
      "epoch": 0.3129251700680272,
      "grad_norm": 21.063495635986328,
      "learning_rate": 1.4198830409356725e-05,
      "loss": 1.4495,
      "num_tokens": 307813.0,
      "step": 276
    },
    {
      "epoch": 0.31405895691609975,
      "grad_norm": 23.05027198791504,
      "learning_rate": 1.4175438596491229e-05,
      "loss": 1.4677,
      "num_tokens": 308876.0,
      "step": 277
    },
    {
      "epoch": 0.31519274376417233,
      "grad_norm": 20.81597900390625,
      "learning_rate": 1.4152046783625733e-05,
      "loss": 1.4085,
      "num_tokens": 310012.0,
      "step": 278
    },
    {
      "epoch": 0.3163265306122449,
      "grad_norm": 23.965967178344727,
      "learning_rate": 1.4128654970760236e-05,
      "loss": 1.4615,
      "num_tokens": 311070.0,
      "step": 279
    },
    {
      "epoch": 0.31746031746031744,
      "grad_norm": 22.640148162841797,
      "learning_rate": 1.4105263157894738e-05,
      "loss": 1.3822,
      "num_tokens": 312154.0,
      "step": 280
    },
    {
      "epoch": 0.31859410430839,
      "grad_norm": 22.867446899414062,
      "learning_rate": 1.408187134502924e-05,
      "loss": 1.6723,
      "num_tokens": 313253.0,
      "step": 281
    },
    {
      "epoch": 0.3197278911564626,
      "grad_norm": 22.407011032104492,
      "learning_rate": 1.4058479532163745e-05,
      "loss": 1.4687,
      "num_tokens": 314325.0,
      "step": 282
    },
    {
      "epoch": 0.32086167800453513,
      "grad_norm": 21.54814338684082,
      "learning_rate": 1.4035087719298246e-05,
      "loss": 1.5139,
      "num_tokens": 315399.0,
      "step": 283
    },
    {
      "epoch": 0.3219954648526077,
      "grad_norm": 21.578617095947266,
      "learning_rate": 1.401169590643275e-05,
      "loss": 1.3751,
      "num_tokens": 316543.0,
      "step": 284
    },
    {
      "epoch": 0.3231292517006803,
      "grad_norm": 21.266693115234375,
      "learning_rate": 1.3988304093567253e-05,
      "loss": 1.4847,
      "num_tokens": 317658.0,
      "step": 285
    },
    {
      "epoch": 0.3242630385487528,
      "grad_norm": 23.686180114746094,
      "learning_rate": 1.3964912280701755e-05,
      "loss": 1.491,
      "num_tokens": 318773.0,
      "step": 286
    },
    {
      "epoch": 0.3253968253968254,
      "grad_norm": 22.65009117126465,
      "learning_rate": 1.3941520467836258e-05,
      "loss": 1.5428,
      "num_tokens": 319995.0,
      "step": 287
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 20.74267578125,
      "learning_rate": 1.3918128654970762e-05,
      "loss": 1.3828,
      "num_tokens": 321078.0,
      "step": 288
    },
    {
      "epoch": 0.3276643990929705,
      "grad_norm": 22.6479434967041,
      "learning_rate": 1.3894736842105265e-05,
      "loss": 1.4634,
      "num_tokens": 322140.0,
      "step": 289
    },
    {
      "epoch": 0.3287981859410431,
      "grad_norm": 23.637678146362305,
      "learning_rate": 1.3871345029239767e-05,
      "loss": 1.4539,
      "num_tokens": 323326.0,
      "step": 290
    },
    {
      "epoch": 0.3299319727891156,
      "grad_norm": 20.191329956054688,
      "learning_rate": 1.384795321637427e-05,
      "loss": 1.3562,
      "num_tokens": 324391.0,
      "step": 291
    },
    {
      "epoch": 0.3310657596371882,
      "grad_norm": 24.18254852294922,
      "learning_rate": 1.3824561403508774e-05,
      "loss": 1.407,
      "num_tokens": 325423.0,
      "step": 292
    },
    {
      "epoch": 0.3321995464852608,
      "grad_norm": 22.894956588745117,
      "learning_rate": 1.3801169590643275e-05,
      "loss": 1.4151,
      "num_tokens": 326471.0,
      "step": 293
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 25.60346221923828,
      "learning_rate": 1.377777777777778e-05,
      "loss": 1.5366,
      "num_tokens": 327570.0,
      "step": 294
    },
    {
      "epoch": 0.3344671201814059,
      "grad_norm": 23.408321380615234,
      "learning_rate": 1.3754385964912282e-05,
      "loss": 1.5041,
      "num_tokens": 328672.0,
      "step": 295
    },
    {
      "epoch": 0.3356009070294785,
      "grad_norm": 23.32335662841797,
      "learning_rate": 1.3730994152046784e-05,
      "loss": 1.5033,
      "num_tokens": 329696.0,
      "step": 296
    },
    {
      "epoch": 0.336734693877551,
      "grad_norm": 20.227418899536133,
      "learning_rate": 1.3707602339181287e-05,
      "loss": 1.3922,
      "num_tokens": 330810.0,
      "step": 297
    },
    {
      "epoch": 0.3378684807256236,
      "grad_norm": 20.30182647705078,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 1.4642,
      "num_tokens": 331998.0,
      "step": 298
    },
    {
      "epoch": 0.33900226757369617,
      "grad_norm": 22.34268569946289,
      "learning_rate": 1.3660818713450294e-05,
      "loss": 1.3948,
      "num_tokens": 333011.0,
      "step": 299
    },
    {
      "epoch": 0.3401360544217687,
      "grad_norm": 21.91162872314453,
      "learning_rate": 1.3637426900584796e-05,
      "loss": 1.4822,
      "num_tokens": 334104.0,
      "step": 300
    },
    {
      "epoch": 0.3412698412698413,
      "grad_norm": 22.55190658569336,
      "learning_rate": 1.3614035087719299e-05,
      "loss": 1.6097,
      "num_tokens": 335283.0,
      "step": 301
    },
    {
      "epoch": 0.3424036281179138,
      "grad_norm": 21.861995697021484,
      "learning_rate": 1.3590643274853803e-05,
      "loss": 1.588,
      "num_tokens": 336363.0,
      "step": 302
    },
    {
      "epoch": 0.3435374149659864,
      "grad_norm": 21.788677215576172,
      "learning_rate": 1.3567251461988304e-05,
      "loss": 1.582,
      "num_tokens": 337576.0,
      "step": 303
    },
    {
      "epoch": 0.34467120181405897,
      "grad_norm": 20.740257263183594,
      "learning_rate": 1.3543859649122808e-05,
      "loss": 1.3907,
      "num_tokens": 338641.0,
      "step": 304
    },
    {
      "epoch": 0.3458049886621315,
      "grad_norm": 21.97249412536621,
      "learning_rate": 1.3520467836257311e-05,
      "loss": 1.5256,
      "num_tokens": 339795.0,
      "step": 305
    },
    {
      "epoch": 0.3469387755102041,
      "grad_norm": 21.80385971069336,
      "learning_rate": 1.3497076023391814e-05,
      "loss": 1.4808,
      "num_tokens": 340889.0,
      "step": 306
    },
    {
      "epoch": 0.34807256235827666,
      "grad_norm": 22.424528121948242,
      "learning_rate": 1.3473684210526316e-05,
      "loss": 1.575,
      "num_tokens": 342007.0,
      "step": 307
    },
    {
      "epoch": 0.3492063492063492,
      "grad_norm": 22.06599998474121,
      "learning_rate": 1.345029239766082e-05,
      "loss": 1.307,
      "num_tokens": 343151.0,
      "step": 308
    },
    {
      "epoch": 0.35034013605442177,
      "grad_norm": 24.721797943115234,
      "learning_rate": 1.3426900584795323e-05,
      "loss": 1.5736,
      "num_tokens": 344208.0,
      "step": 309
    },
    {
      "epoch": 0.35147392290249435,
      "grad_norm": 22.60153579711914,
      "learning_rate": 1.3403508771929826e-05,
      "loss": 1.5604,
      "num_tokens": 345374.0,
      "step": 310
    },
    {
      "epoch": 0.3526077097505669,
      "grad_norm": 21.573318481445312,
      "learning_rate": 1.3380116959064328e-05,
      "loss": 1.524,
      "num_tokens": 346608.0,
      "step": 311
    },
    {
      "epoch": 0.35374149659863946,
      "grad_norm": 25.30203628540039,
      "learning_rate": 1.3356725146198832e-05,
      "loss": 1.6416,
      "num_tokens": 347714.0,
      "step": 312
    },
    {
      "epoch": 0.35487528344671204,
      "grad_norm": 22.905746459960938,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.5275,
      "num_tokens": 348930.0,
      "step": 313
    },
    {
      "epoch": 0.35600907029478457,
      "grad_norm": 23.923063278198242,
      "learning_rate": 1.3309941520467838e-05,
      "loss": 1.5784,
      "num_tokens": 349999.0,
      "step": 314
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 22.065505981445312,
      "learning_rate": 1.328654970760234e-05,
      "loss": 1.5701,
      "num_tokens": 351123.0,
      "step": 315
|
}, |
|
{ |
|
"epoch": 0.35827664399092973, |
|
"grad_norm": 22.438655853271484, |
|
"learning_rate": 1.3263157894736843e-05, |
|
"loss": 1.5035, |
|
"num_tokens": 352245.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.35941043083900226, |
|
"grad_norm": 21.77728271484375, |
|
"learning_rate": 1.3239766081871345e-05, |
|
"loss": 1.4594, |
|
"num_tokens": 353380.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.36054421768707484, |
|
"grad_norm": 26.78565788269043, |
|
"learning_rate": 1.321637426900585e-05, |
|
"loss": 1.6554, |
|
"num_tokens": 354375.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.36167800453514737, |
|
"grad_norm": 22.74375343322754, |
|
"learning_rate": 1.3192982456140354e-05, |
|
"loss": 1.4583, |
|
"num_tokens": 355502.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.36281179138321995, |
|
"grad_norm": 22.99059295654297, |
|
"learning_rate": 1.3169590643274855e-05, |
|
"loss": 1.4056, |
|
"num_tokens": 356615.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.36394557823129253, |
|
"grad_norm": 19.973115921020508, |
|
"learning_rate": 1.3146198830409357e-05, |
|
"loss": 1.4359, |
|
"num_tokens": 357688.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.36507936507936506, |
|
"grad_norm": 20.598600387573242, |
|
"learning_rate": 1.3122807017543862e-05, |
|
"loss": 1.3685, |
|
"num_tokens": 358832.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.36621315192743764, |
|
"grad_norm": 23.178176879882812, |
|
"learning_rate": 1.3099415204678362e-05, |
|
"loss": 1.4443, |
|
"num_tokens": 359904.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.3673469387755102, |
|
"grad_norm": 23.088056564331055, |
|
"learning_rate": 1.3076023391812867e-05, |
|
"loss": 1.3505, |
|
"num_tokens": 361014.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.36848072562358275, |
|
"grad_norm": 22.677352905273438, |
|
"learning_rate": 1.305263157894737e-05, |
|
"loss": 1.432, |
|
"num_tokens": 362163.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.36961451247165533, |
|
"grad_norm": 22.128244400024414, |
|
"learning_rate": 1.3029239766081872e-05, |
|
"loss": 1.4475, |
|
"num_tokens": 363222.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.3707482993197279, |
|
"grad_norm": 20.658218383789062, |
|
"learning_rate": 1.3005847953216374e-05, |
|
"loss": 1.4307, |
|
"num_tokens": 364332.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.37188208616780044, |
|
"grad_norm": 24.494672775268555, |
|
"learning_rate": 1.2982456140350879e-05, |
|
"loss": 1.5444, |
|
"num_tokens": 365433.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.373015873015873, |
|
"grad_norm": 24.10431671142578, |
|
"learning_rate": 1.2959064327485383e-05, |
|
"loss": 1.4431, |
|
"num_tokens": 366487.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.3741496598639456, |
|
"grad_norm": 20.076684951782227, |
|
"learning_rate": 1.2935672514619884e-05, |
|
"loss": 1.4626, |
|
"num_tokens": 367618.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.37528344671201813, |
|
"grad_norm": 22.610557556152344, |
|
"learning_rate": 1.2912280701754386e-05, |
|
"loss": 1.4713, |
|
"num_tokens": 368738.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.3764172335600907, |
|
"grad_norm": 25.21763801574707, |
|
"learning_rate": 1.288888888888889e-05, |
|
"loss": 1.5092, |
|
"num_tokens": 369720.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.37755102040816324, |
|
"grad_norm": 20.842025756835938, |
|
"learning_rate": 1.2865497076023392e-05, |
|
"loss": 1.4863, |
|
"num_tokens": 370885.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.3786848072562358, |
|
"grad_norm": 21.969791412353516, |
|
"learning_rate": 1.2842105263157896e-05, |
|
"loss": 1.3838, |
|
"num_tokens": 371958.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.3798185941043084, |
|
"grad_norm": 23.09484100341797, |
|
"learning_rate": 1.28187134502924e-05, |
|
"loss": 1.6291, |
|
"num_tokens": 373085.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": 20.638565063476562, |
|
"learning_rate": 1.2795321637426901e-05, |
|
"loss": 1.4701, |
|
"num_tokens": 374210.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.3820861678004535, |
|
"grad_norm": 21.330707550048828, |
|
"learning_rate": 1.2771929824561404e-05, |
|
"loss": 1.4856, |
|
"num_tokens": 375310.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.3832199546485261, |
|
"grad_norm": 22.242534637451172, |
|
"learning_rate": 1.2748538011695908e-05, |
|
"loss": 1.4001, |
|
"num_tokens": 376372.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.3843537414965986, |
|
"grad_norm": 22.377954483032227, |
|
"learning_rate": 1.2725146198830412e-05, |
|
"loss": 1.6322, |
|
"num_tokens": 377527.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.3854875283446712, |
|
"grad_norm": 22.19904327392578, |
|
"learning_rate": 1.2701754385964913e-05, |
|
"loss": 1.3738, |
|
"num_tokens": 378604.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3866213151927438, |
|
"grad_norm": 21.403820037841797, |
|
"learning_rate": 1.2678362573099417e-05, |
|
"loss": 1.3584, |
|
"num_tokens": 379678.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.3877551020408163, |
|
"grad_norm": 21.75101089477539, |
|
"learning_rate": 1.265497076023392e-05, |
|
"loss": 1.4217, |
|
"num_tokens": 380729.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 23.82843589782715, |
|
"learning_rate": 1.263157894736842e-05, |
|
"loss": 1.447, |
|
"num_tokens": 381876.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.3900226757369615, |
|
"grad_norm": 23.757530212402344, |
|
"learning_rate": 1.2608187134502925e-05, |
|
"loss": 1.4956, |
|
"num_tokens": 382962.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.391156462585034, |
|
"grad_norm": 23.748046875, |
|
"learning_rate": 1.258479532163743e-05, |
|
"loss": 1.5339, |
|
"num_tokens": 384096.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3922902494331066, |
|
"grad_norm": 21.567293167114258, |
|
"learning_rate": 1.256140350877193e-05, |
|
"loss": 1.3744, |
|
"num_tokens": 385175.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.3934240362811791, |
|
"grad_norm": 22.490840911865234, |
|
"learning_rate": 1.2538011695906434e-05, |
|
"loss": 1.4251, |
|
"num_tokens": 386358.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.3945578231292517, |
|
"grad_norm": 21.747709274291992, |
|
"learning_rate": 1.2514619883040937e-05, |
|
"loss": 1.4746, |
|
"num_tokens": 387482.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3956916099773243, |
|
"grad_norm": 19.943635940551758, |
|
"learning_rate": 1.2491228070175441e-05, |
|
"loss": 1.4278, |
|
"num_tokens": 388662.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3968253968253968, |
|
"grad_norm": 22.539836883544922, |
|
"learning_rate": 1.2467836257309942e-05, |
|
"loss": 1.6124, |
|
"num_tokens": 389731.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3979591836734694, |
|
"grad_norm": 21.41621208190918, |
|
"learning_rate": 1.2444444444444446e-05, |
|
"loss": 1.4527, |
|
"num_tokens": 390921.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.39909297052154197, |
|
"grad_norm": 21.026447296142578, |
|
"learning_rate": 1.2421052631578949e-05, |
|
"loss": 1.5012, |
|
"num_tokens": 392035.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.4002267573696145, |
|
"grad_norm": 25.59100914001465, |
|
"learning_rate": 1.239766081871345e-05, |
|
"loss": 1.5081, |
|
"num_tokens": 393093.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.4013605442176871, |
|
"grad_norm": 22.007505416870117, |
|
"learning_rate": 1.2374269005847954e-05, |
|
"loss": 1.5663, |
|
"num_tokens": 394239.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.40249433106575966, |
|
"grad_norm": 23.461524963378906, |
|
"learning_rate": 1.2350877192982458e-05, |
|
"loss": 1.4302, |
|
"num_tokens": 395291.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.4036281179138322, |
|
"grad_norm": 20.203760147094727, |
|
"learning_rate": 1.232748538011696e-05, |
|
"loss": 1.3541, |
|
"num_tokens": 396485.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.40476190476190477, |
|
"grad_norm": 19.759798049926758, |
|
"learning_rate": 1.2304093567251463e-05, |
|
"loss": 1.4233, |
|
"num_tokens": 397644.0, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.40589569160997735, |
|
"grad_norm": 22.704025268554688, |
|
"learning_rate": 1.2280701754385966e-05, |
|
"loss": 1.3426, |
|
"num_tokens": 398789.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.4070294784580499, |
|
"grad_norm": 21.620922088623047, |
|
"learning_rate": 1.225730994152047e-05, |
|
"loss": 1.5188, |
|
"num_tokens": 399878.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.40816326530612246, |
|
"grad_norm": 19.23693084716797, |
|
"learning_rate": 1.2233918128654971e-05, |
|
"loss": 1.3182, |
|
"num_tokens": 400987.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.409297052154195, |
|
"grad_norm": 21.466594696044922, |
|
"learning_rate": 1.2210526315789475e-05, |
|
"loss": 1.5335, |
|
"num_tokens": 402056.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.41043083900226757, |
|
"grad_norm": 21.63371467590332, |
|
"learning_rate": 1.2187134502923978e-05, |
|
"loss": 1.4244, |
|
"num_tokens": 403197.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.41156462585034015, |
|
"grad_norm": 20.83876609802246, |
|
"learning_rate": 1.216374269005848e-05, |
|
"loss": 1.6196, |
|
"num_tokens": 404342.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.4126984126984127, |
|
"grad_norm": 21.863889694213867, |
|
"learning_rate": 1.2140350877192983e-05, |
|
"loss": 1.5207, |
|
"num_tokens": 405485.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.41383219954648526, |
|
"grad_norm": 20.582901000976562, |
|
"learning_rate": 1.2116959064327487e-05, |
|
"loss": 1.4857, |
|
"num_tokens": 406682.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.41496598639455784, |
|
"grad_norm": 22.900217056274414, |
|
"learning_rate": 1.2093567251461988e-05, |
|
"loss": 1.453, |
|
"num_tokens": 407731.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.41609977324263037, |
|
"grad_norm": 24.468358993530273, |
|
"learning_rate": 1.2070175438596493e-05, |
|
"loss": 1.5036, |
|
"num_tokens": 408773.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.41723356009070295, |
|
"grad_norm": 22.844940185546875, |
|
"learning_rate": 1.2046783625730995e-05, |
|
"loss": 1.3458, |
|
"num_tokens": 409839.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.41836734693877553, |
|
"grad_norm": 23.0582332611084, |
|
"learning_rate": 1.20233918128655e-05, |
|
"loss": 1.4995, |
|
"num_tokens": 410856.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.41950113378684806, |
|
"grad_norm": 20.938154220581055, |
|
"learning_rate": 1.2e-05, |
|
"loss": 1.4394, |
|
"num_tokens": 411957.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.42063492063492064, |
|
"grad_norm": 21.037168502807617, |
|
"learning_rate": 1.1976608187134505e-05, |
|
"loss": 1.3752, |
|
"num_tokens": 413070.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.4217687074829932, |
|
"grad_norm": 23.35750961303711, |
|
"learning_rate": 1.1953216374269007e-05, |
|
"loss": 1.4844, |
|
"num_tokens": 414204.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.42290249433106575, |
|
"grad_norm": 20.994701385498047, |
|
"learning_rate": 1.192982456140351e-05, |
|
"loss": 1.4312, |
|
"num_tokens": 415467.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.42403628117913833, |
|
"grad_norm": 21.293333053588867, |
|
"learning_rate": 1.1906432748538012e-05, |
|
"loss": 1.4493, |
|
"num_tokens": 416638.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.42517006802721086, |
|
"grad_norm": 21.202180862426758, |
|
"learning_rate": 1.1883040935672517e-05, |
|
"loss": 1.4468, |
|
"num_tokens": 417771.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.42630385487528344, |
|
"grad_norm": 22.729934692382812, |
|
"learning_rate": 1.1859649122807017e-05, |
|
"loss": 1.5065, |
|
"num_tokens": 418899.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.427437641723356, |
|
"grad_norm": 20.654706954956055, |
|
"learning_rate": 1.1836257309941522e-05, |
|
"loss": 1.453, |
|
"num_tokens": 420087.0, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 22.81949806213379, |
|
"learning_rate": 1.1812865497076024e-05, |
|
"loss": 1.4696, |
|
"num_tokens": 421175.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.42970521541950113, |
|
"grad_norm": 22.954544067382812, |
|
"learning_rate": 1.1789473684210527e-05, |
|
"loss": 1.4957, |
|
"num_tokens": 422301.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.4308390022675737, |
|
"grad_norm": 20.20991325378418, |
|
"learning_rate": 1.176608187134503e-05, |
|
"loss": 1.3547, |
|
"num_tokens": 423479.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.43197278911564624, |
|
"grad_norm": 21.98236846923828, |
|
"learning_rate": 1.1742690058479534e-05, |
|
"loss": 1.5336, |
|
"num_tokens": 424599.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.4331065759637188, |
|
"grad_norm": 22.076784133911133, |
|
"learning_rate": 1.1719298245614036e-05, |
|
"loss": 1.4168, |
|
"num_tokens": 425638.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.4342403628117914, |
|
"grad_norm": 20.801210403442383, |
|
"learning_rate": 1.1695906432748539e-05, |
|
"loss": 1.5112, |
|
"num_tokens": 426831.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.43537414965986393, |
|
"grad_norm": 21.711530685424805, |
|
"learning_rate": 1.1672514619883041e-05, |
|
"loss": 1.5072, |
|
"num_tokens": 428008.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.4365079365079365, |
|
"grad_norm": 20.649227142333984, |
|
"learning_rate": 1.1649122807017546e-05, |
|
"loss": 1.3347, |
|
"num_tokens": 429150.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.4376417233560091, |
|
"grad_norm": 24.207242965698242, |
|
"learning_rate": 1.1625730994152047e-05, |
|
"loss": 1.4249, |
|
"num_tokens": 430401.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.4387755102040816, |
|
"grad_norm": 22.359968185424805, |
|
"learning_rate": 1.160233918128655e-05, |
|
"loss": 1.4618, |
|
"num_tokens": 431572.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.4399092970521542, |
|
"grad_norm": 23.305208206176758, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 1.6135, |
|
"num_tokens": 432748.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.4410430839002268, |
|
"grad_norm": 23.788869857788086, |
|
"learning_rate": 1.1555555555555556e-05, |
|
"loss": 1.4025, |
|
"num_tokens": 433673.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.4421768707482993, |
|
"grad_norm": 23.072362899780273, |
|
"learning_rate": 1.1532163742690059e-05, |
|
"loss": 1.4262, |
|
"num_tokens": 434803.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4433106575963719, |
|
"grad_norm": 24.782913208007812, |
|
"learning_rate": 1.1508771929824563e-05, |
|
"loss": 1.5596, |
|
"num_tokens": 435895.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 22.748624801635742, |
|
"learning_rate": 1.1485380116959065e-05, |
|
"loss": 1.5855, |
|
"num_tokens": 436983.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.445578231292517, |
|
"grad_norm": 24.416845321655273, |
|
"learning_rate": 1.1461988304093568e-05, |
|
"loss": 1.5917, |
|
"num_tokens": 438073.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.4467120181405896, |
|
"grad_norm": 21.77821922302246, |
|
"learning_rate": 1.143859649122807e-05, |
|
"loss": 1.3617, |
|
"num_tokens": 439139.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.4478458049886621, |
|
"grad_norm": 22.38814926147461, |
|
"learning_rate": 1.1415204678362575e-05, |
|
"loss": 1.4778, |
|
"num_tokens": 440259.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4489795918367347, |
|
"grad_norm": 21.56502342224121, |
|
"learning_rate": 1.1391812865497076e-05, |
|
"loss": 1.5352, |
|
"num_tokens": 441416.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4501133786848073, |
|
"grad_norm": 20.565168380737305, |
|
"learning_rate": 1.136842105263158e-05, |
|
"loss": 1.5067, |
|
"num_tokens": 442549.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.4512471655328798, |
|
"grad_norm": 19.611265182495117, |
|
"learning_rate": 1.1345029239766083e-05, |
|
"loss": 1.4137, |
|
"num_tokens": 443711.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.4523809523809524, |
|
"grad_norm": 20.92315101623535, |
|
"learning_rate": 1.1321637426900585e-05, |
|
"loss": 1.4697, |
|
"num_tokens": 444840.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.45351473922902497, |
|
"grad_norm": 19.84340476989746, |
|
"learning_rate": 1.1298245614035088e-05, |
|
"loss": 1.5124, |
|
"num_tokens": 445920.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4546485260770975, |
|
"grad_norm": 20.4117431640625, |
|
"learning_rate": 1.1274853801169592e-05, |
|
"loss": 1.5706, |
|
"num_tokens": 447140.0, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.4557823129251701, |
|
"grad_norm": 20.32288360595703, |
|
"learning_rate": 1.1251461988304096e-05, |
|
"loss": 1.3753, |
|
"num_tokens": 448247.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.45691609977324266, |
|
"grad_norm": 20.550880432128906, |
|
"learning_rate": 1.1228070175438597e-05, |
|
"loss": 1.3749, |
|
"num_tokens": 449442.0, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.4580498866213152, |
|
"grad_norm": 22.589542388916016, |
|
"learning_rate": 1.12046783625731e-05, |
|
"loss": 1.4055, |
|
"num_tokens": 450561.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.45918367346938777, |
|
"grad_norm": 20.559473037719727, |
|
"learning_rate": 1.1181286549707604e-05, |
|
"loss": 1.5184, |
|
"num_tokens": 451707.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.4603174603174603, |
|
"grad_norm": 19.94198226928711, |
|
"learning_rate": 1.1157894736842105e-05, |
|
"loss": 1.4678, |
|
"num_tokens": 452849.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.4614512471655329, |
|
"grad_norm": 21.226221084594727, |
|
"learning_rate": 1.1134502923976609e-05, |
|
"loss": 1.358, |
|
"num_tokens": 453908.0, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.46258503401360546, |
|
"grad_norm": 20.742549896240234, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 1.3297, |
|
"num_tokens": 455072.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.463718820861678, |
|
"grad_norm": 20.73780059814453, |
|
"learning_rate": 1.1087719298245614e-05, |
|
"loss": 1.5569, |
|
"num_tokens": 456203.0, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.46485260770975056, |
|
"grad_norm": 21.602121353149414, |
|
"learning_rate": 1.1064327485380117e-05, |
|
"loss": 1.512, |
|
"num_tokens": 457306.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.46598639455782315, |
|
"grad_norm": 22.79952049255371, |
|
"learning_rate": 1.1040935672514621e-05, |
|
"loss": 1.5161, |
|
"num_tokens": 458435.0, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.4671201814058957, |
|
"grad_norm": 22.42172622680664, |
|
"learning_rate": 1.1017543859649125e-05, |
|
"loss": 1.2366, |
|
"num_tokens": 459465.0, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.46825396825396826, |
|
"grad_norm": 22.881622314453125, |
|
"learning_rate": 1.0994152046783626e-05, |
|
"loss": 1.5695, |
|
"num_tokens": 460630.0, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.46938775510204084, |
|
"grad_norm": 21.594409942626953, |
|
"learning_rate": 1.0970760233918129e-05, |
|
"loss": 1.446, |
|
"num_tokens": 461789.0, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.47052154195011336, |
|
"grad_norm": 23.080078125, |
|
"learning_rate": 1.0947368421052633e-05, |
|
"loss": 1.4408, |
|
"num_tokens": 462919.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.47165532879818595, |
|
"grad_norm": 22.40519142150879, |
|
"learning_rate": 1.0923976608187134e-05, |
|
"loss": 1.3175, |
|
"num_tokens": 463976.0, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.47278911564625853, |
|
"grad_norm": 23.810379028320312, |
|
"learning_rate": 1.0900584795321638e-05, |
|
"loss": 1.5058, |
|
"num_tokens": 465105.0, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.47392290249433106, |
|
"grad_norm": 21.764375686645508, |
|
"learning_rate": 1.0877192982456142e-05, |
|
"loss": 1.4464, |
|
"num_tokens": 466236.0, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.47505668934240364, |
|
"grad_norm": 21.064970016479492, |
|
"learning_rate": 1.0853801169590643e-05, |
|
"loss": 1.4898, |
|
"num_tokens": 467384.0, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 21.655441284179688, |
|
"learning_rate": 1.0830409356725146e-05, |
|
"loss": 1.3508, |
|
"num_tokens": 468501.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.47732426303854875, |
|
"grad_norm": 23.890254974365234, |
|
"learning_rate": 1.080701754385965e-05, |
|
"loss": 1.597, |
|
"num_tokens": 469514.0, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.47845804988662133, |
|
"grad_norm": 22.981708526611328, |
|
"learning_rate": 1.0783625730994154e-05, |
|
"loss": 1.5223, |
|
"num_tokens": 470570.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.47959183673469385, |
|
"grad_norm": 23.430938720703125, |
|
"learning_rate": 1.0760233918128655e-05, |
|
"loss": 1.4387, |
|
"num_tokens": 471783.0, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.48072562358276644, |
|
"grad_norm": 22.37723159790039, |
|
"learning_rate": 1.073684210526316e-05, |
|
"loss": 1.4399, |
|
"num_tokens": 472934.0, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.481859410430839, |
|
"grad_norm": 22.111480712890625, |
|
"learning_rate": 1.0713450292397662e-05, |
|
"loss": 1.5395, |
|
"num_tokens": 474128.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.48299319727891155, |
|
"grad_norm": 22.287321090698242, |
|
"learning_rate": 1.0690058479532163e-05, |
|
"loss": 1.419, |
|
"num_tokens": 475240.0, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.48412698412698413, |
|
"grad_norm": 21.262020111083984, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 1.5711, |
|
"num_tokens": 476354.0, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.4852607709750567, |
|
"grad_norm": 22.599716186523438, |
|
"learning_rate": 1.0643274853801172e-05, |
|
"loss": 1.5905, |
|
"num_tokens": 477408.0, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.48639455782312924, |
|
"grad_norm": 23.220651626586914, |
|
"learning_rate": 1.0619883040935672e-05, |
|
"loss": 1.5256, |
|
"num_tokens": 478458.0, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.4875283446712018, |
|
"grad_norm": 22.43808937072754, |
|
"learning_rate": 1.0596491228070177e-05, |
|
"loss": 1.4177, |
|
"num_tokens": 479577.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4886621315192744, |
|
"grad_norm": 23.958772659301758, |
|
"learning_rate": 1.057309941520468e-05, |
|
"loss": 1.4556, |
|
"num_tokens": 480596.0, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"grad_norm": 23.579402923583984, |
|
"learning_rate": 1.0549707602339184e-05, |
|
"loss": 1.54, |
|
"num_tokens": 481636.0, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.4909297052154195, |
|
"grad_norm": 23.185216903686523, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 1.5177, |
|
"num_tokens": 482749.0, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.49206349206349204, |
|
"grad_norm": 21.95652961730957, |
|
"learning_rate": 1.0502923976608189e-05, |
|
"loss": 1.4486, |
|
"num_tokens": 483894.0, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.4931972789115646, |
|
"grad_norm": 21.265581130981445, |
|
"learning_rate": 1.0479532163742691e-05, |
|
"loss": 1.5921, |
|
"num_tokens": 485099.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.4943310657596372, |
|
"grad_norm": 21.50389289855957, |
|
"learning_rate": 1.0456140350877194e-05, |
|
"loss": 1.5166, |
|
"num_tokens": 486238.0, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.4954648526077097, |
|
"grad_norm": 22.350412368774414, |
|
"learning_rate": 1.0432748538011696e-05, |
|
"loss": 1.4719, |
|
"num_tokens": 487264.0, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.4965986394557823, |
|
"grad_norm": 21.480653762817383, |
|
"learning_rate": 1.04093567251462e-05, |
|
"loss": 1.4648, |
|
"num_tokens": 488457.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.4977324263038549, |
|
"grad_norm": 23.51890754699707, |
|
"learning_rate": 1.0385964912280702e-05, |
|
"loss": 1.5191, |
|
"num_tokens": 489594.0, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.4988662131519274, |
|
"grad_norm": 21.588485717773438, |
|
"learning_rate": 1.0362573099415206e-05, |
|
"loss": 1.5039, |
|
"num_tokens": 490762.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 21.730770111083984, |
|
"learning_rate": 1.0339181286549708e-05, |
|
"loss": 1.4574, |
|
"num_tokens": 491869.0, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.5011337868480725, |
|
"grad_norm": 21.45746612548828, |
|
"learning_rate": 1.0315789473684213e-05, |
|
"loss": 1.5759, |
|
"num_tokens": 492996.0, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.5022675736961452, |
|
"grad_norm": 25.636009216308594, |
|
"learning_rate": 1.0292397660818714e-05, |
|
"loss": 1.5797, |
|
"num_tokens": 494026.0, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.5034013605442177, |
|
"grad_norm": 20.464859008789062, |
|
"learning_rate": 1.0269005847953218e-05, |
|
"loss": 1.469, |
|
"num_tokens": 495187.0, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.5045351473922902, |
|
"grad_norm": 21.59822654724121, |
|
"learning_rate": 1.024561403508772e-05, |
|
"loss": 1.4327, |
|
"num_tokens": 496377.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5056689342403629, |
|
"grad_norm": 22.208587646484375, |
|
"learning_rate": 1.0222222222222223e-05, |
|
"loss": 1.5198, |
|
"num_tokens": 497515.0, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.5068027210884354, |
|
"grad_norm": 20.97501564025879, |
|
"learning_rate": 1.0198830409356726e-05, |
|
"loss": 1.3439, |
|
"num_tokens": 498679.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.5079365079365079, |
|
"grad_norm": 22.33463478088379, |
|
"learning_rate": 1.017543859649123e-05, |
|
"loss": 1.4406, |
|
"num_tokens": 499695.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.5090702947845805, |
|
"grad_norm": 23.04416275024414, |
|
"learning_rate": 1.015204678362573e-05, |
|
"loss": 1.397, |
|
"num_tokens": 500855.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.5102040816326531, |
|
"grad_norm": 21.69202995300293, |
|
"learning_rate": 1.0128654970760235e-05, |
|
"loss": 1.5077, |
|
"num_tokens": 501968.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5113378684807256, |
|
"grad_norm": 24.2927303314209, |
|
"learning_rate": 1.0105263157894738e-05, |
|
"loss": 1.4229, |
|
"num_tokens": 502939.0, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.5124716553287982, |
|
"grad_norm": 22.449220657348633, |
|
"learning_rate": 1.0081871345029242e-05, |
|
"loss": 1.3924, |
|
"num_tokens": 503993.0, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.5136054421768708, |
|
"grad_norm": 23.610368728637695, |
|
"learning_rate": 1.0058479532163743e-05, |
|
"loss": 1.4355, |
|
"num_tokens": 505157.0, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.5147392290249433, |
|
"grad_norm": 21.155065536499023, |
|
"learning_rate": 1.0035087719298247e-05, |
|
"loss": 1.4111, |
|
"num_tokens": 506313.0, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.5158730158730159, |
|
"grad_norm": 23.355241775512695, |
|
"learning_rate": 1.001169590643275e-05, |
|
"loss": 1.4867, |
|
"num_tokens": 507464.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5170068027210885, |
|
"grad_norm": 22.317737579345703, |
|
"learning_rate": 9.988304093567252e-06, |
|
"loss": 1.5623, |
|
"num_tokens": 508542.0, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.518140589569161, |
|
"grad_norm": 20.3128662109375, |
|
"learning_rate": 9.964912280701755e-06, |
|
"loss": 1.4996, |
|
"num_tokens": 509696.0, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.5192743764172335, |
|
"grad_norm": 21.98442268371582, |
|
"learning_rate": 9.941520467836257e-06, |
|
"loss": 1.5096, |
|
"num_tokens": 510844.0, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.5204081632653061, |
|
"grad_norm": 22.291845321655273, |
|
"learning_rate": 9.918128654970762e-06, |
|
"loss": 1.5076, |
|
"num_tokens": 511905.0, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.5215419501133787, |
|
"grad_norm": 21.11138153076172, |
|
"learning_rate": 9.894736842105264e-06, |
|
"loss": 1.4024, |
|
"num_tokens": 513025.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5226757369614512, |
|
"grad_norm": 21.474123001098633, |
|
"learning_rate": 9.871345029239767e-06, |
|
"loss": 1.3739, |
|
"num_tokens": 514096.0, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.5238095238095238, |
|
"grad_norm": 22.77983283996582, |
|
"learning_rate": 9.84795321637427e-06, |
|
"loss": 1.537, |
|
"num_tokens": 515114.0, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5249433106575964, |
|
"grad_norm": 22.172109603881836, |
|
"learning_rate": 9.824561403508772e-06, |
|
"loss": 1.4446, |
|
"num_tokens": 516218.0, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.5260770975056689, |
|
"grad_norm": 21.138439178466797, |
|
"learning_rate": 9.801169590643276e-06, |
|
"loss": 1.4733, |
|
"num_tokens": 517329.0, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5272108843537415, |
|
"grad_norm": 22.17593765258789, |
|
"learning_rate": 9.777777777777779e-06, |
|
"loss": 1.5316, |
|
"num_tokens": 518435.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.528344671201814, |
|
"grad_norm": 22.072914123535156, |
|
"learning_rate": 9.754385964912281e-06, |
|
"loss": 1.3249, |
|
"num_tokens": 519509.0, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5294784580498866, |
|
"grad_norm": 22.987707138061523, |
|
"learning_rate": 9.730994152046784e-06, |
|
"loss": 1.5979, |
|
"num_tokens": 520620.0, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.5306122448979592, |
|
"grad_norm": 20.49030876159668, |
|
"learning_rate": 9.707602339181286e-06, |
|
"loss": 1.3421, |
|
"num_tokens": 521806.0, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.5317460317460317, |
|
"grad_norm": 22.049985885620117, |
|
"learning_rate": 9.68421052631579e-06, |
|
"loss": 1.5731, |
|
"num_tokens": 522945.0, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.5328798185941043, |
|
"grad_norm": 21.49173927307129, |
|
"learning_rate": 9.660818713450293e-06, |
|
"loss": 1.4883, |
|
"num_tokens": 524157.0, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5340136054421769, |
|
"grad_norm": 22.7564697265625, |
|
"learning_rate": 9.637426900584796e-06, |
|
"loss": 1.4914, |
|
"num_tokens": 525245.0, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.5351473922902494, |
|
"grad_norm": 21.756540298461914, |
|
"learning_rate": 9.614035087719298e-06, |
|
"loss": 1.4791, |
|
"num_tokens": 526398.0, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.536281179138322, |
|
"grad_norm": 20.813621520996094, |
|
"learning_rate": 9.590643274853801e-06, |
|
"loss": 1.5602, |
|
"num_tokens": 527591.0, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.5374149659863946, |
|
"grad_norm": 20.991981506347656, |
|
"learning_rate": 9.567251461988305e-06, |
|
"loss": 1.4139, |
|
"num_tokens": 528722.0, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.5385487528344671, |
|
"grad_norm": 21.36113166809082, |
|
"learning_rate": 9.543859649122808e-06, |
|
"loss": 1.3992, |
|
"num_tokens": 529926.0, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5396825396825397, |
|
"grad_norm": 23.542648315429688, |
|
"learning_rate": 9.52046783625731e-06, |
|
"loss": 1.462, |
|
"num_tokens": 531002.0, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.5408163265306123, |
|
"grad_norm": 23.813064575195312, |
|
"learning_rate": 9.497076023391813e-06, |
|
"loss": 1.5113, |
|
"num_tokens": 532063.0, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.5419501133786848, |
|
"grad_norm": 23.0479736328125, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 1.3654, |
|
"num_tokens": 533122.0, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.5430839002267573, |
|
"grad_norm": 23.56155014038086, |
|
"learning_rate": 9.45029239766082e-06, |
|
"loss": 1.4703, |
|
"num_tokens": 534354.0, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.54421768707483, |
|
"grad_norm": 21.863500595092773, |
|
"learning_rate": 9.426900584795322e-06, |
|
"loss": 1.3612, |
|
"num_tokens": 535447.0, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5453514739229025, |
|
"grad_norm": 20.762577056884766, |
|
"learning_rate": 9.403508771929825e-06, |
|
"loss": 1.5177, |
|
"num_tokens": 536573.0, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.546485260770975, |
|
"grad_norm": 21.422998428344727, |
|
"learning_rate": 9.380116959064327e-06, |
|
"loss": 1.435, |
|
"num_tokens": 537707.0, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.5476190476190477, |
|
"grad_norm": 21.787960052490234, |
|
"learning_rate": 9.35672514619883e-06, |
|
"loss": 1.5013, |
|
"num_tokens": 538818.0, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.5487528344671202, |
|
"grad_norm": 23.668500900268555, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 1.5828, |
|
"num_tokens": 539908.0, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.5498866213151927, |
|
"grad_norm": 22.385894775390625, |
|
"learning_rate": 9.309941520467837e-06, |
|
"loss": 1.4457, |
|
"num_tokens": 541045.0, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.5510204081632653, |
|
"grad_norm": 20.410974502563477, |
|
"learning_rate": 9.28654970760234e-06, |
|
"loss": 1.4135, |
|
"num_tokens": 542124.0, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.5521541950113379, |
|
"grad_norm": 22.673994064331055, |
|
"learning_rate": 9.263157894736842e-06, |
|
"loss": 1.4219, |
|
"num_tokens": 543220.0, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.5532879818594104, |
|
"grad_norm": 19.593944549560547, |
|
"learning_rate": 9.239766081871345e-06, |
|
"loss": 1.4269, |
|
"num_tokens": 544354.0, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5544217687074829, |
|
"grad_norm": 23.19577407836914, |
|
"learning_rate": 9.216374269005849e-06, |
|
"loss": 1.5597, |
|
"num_tokens": 545379.0, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 20.372406005859375, |
|
"learning_rate": 9.192982456140351e-06, |
|
"loss": 1.3499, |
|
"num_tokens": 546460.0, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5566893424036281, |
|
"grad_norm": 18.678747177124023, |
|
"learning_rate": 9.169590643274856e-06, |
|
"loss": 1.3365, |
|
"num_tokens": 547604.0, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.5578231292517006, |
|
"grad_norm": 23.01070785522461, |
|
"learning_rate": 9.146198830409357e-06, |
|
"loss": 1.4461, |
|
"num_tokens": 548661.0, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.5589569160997733, |
|
"grad_norm": 23.752355575561523, |
|
"learning_rate": 9.12280701754386e-06, |
|
"loss": 1.5772, |
|
"num_tokens": 549763.0, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.5600907029478458, |
|
"grad_norm": 22.22798728942871, |
|
"learning_rate": 9.099415204678363e-06, |
|
"loss": 1.3951, |
|
"num_tokens": 550930.0, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.5612244897959183, |
|
"grad_norm": 21.435941696166992, |
|
"learning_rate": 9.076023391812866e-06, |
|
"loss": 1.5268, |
|
"num_tokens": 552031.0, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.562358276643991, |
|
"grad_norm": 22.89337730407715, |
|
"learning_rate": 9.05263157894737e-06, |
|
"loss": 1.4932, |
|
"num_tokens": 553188.0, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.5634920634920635, |
|
"grad_norm": 20.6674747467041, |
|
"learning_rate": 9.029239766081873e-06, |
|
"loss": 1.3984, |
|
"num_tokens": 554282.0, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.564625850340136, |
|
"grad_norm": 20.50768280029297, |
|
"learning_rate": 9.005847953216374e-06, |
|
"loss": 1.4644, |
|
"num_tokens": 555422.0, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.5657596371882087, |
|
"grad_norm": 20.704288482666016, |
|
"learning_rate": 8.982456140350878e-06, |
|
"loss": 1.6067, |
|
"num_tokens": 556575.0, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.5668934240362812, |
|
"grad_norm": 22.014612197875977, |
|
"learning_rate": 8.95906432748538e-06, |
|
"loss": 1.5485, |
|
"num_tokens": 557680.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5680272108843537, |
|
"grad_norm": 23.033823013305664, |
|
"learning_rate": 8.935672514619885e-06, |
|
"loss": 1.4399, |
|
"num_tokens": 558791.0, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.5691609977324263, |
|
"grad_norm": 22.817472457885742, |
|
"learning_rate": 8.912280701754387e-06, |
|
"loss": 1.239, |
|
"num_tokens": 559835.0, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5702947845804989, |
|
"grad_norm": 21.822277069091797, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 1.4371, |
|
"num_tokens": 560975.0, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 22.173824310302734, |
|
"learning_rate": 8.865497076023393e-06, |
|
"loss": 1.3944, |
|
"num_tokens": 562132.0, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.572562358276644, |
|
"grad_norm": 22.091264724731445, |
|
"learning_rate": 8.842105263157895e-06, |
|
"loss": 1.3903, |
|
"num_tokens": 563198.0, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5736961451247166, |
|
"grad_norm": 20.748226165771484, |
|
"learning_rate": 8.8187134502924e-06, |
|
"loss": 1.3665, |
|
"num_tokens": 564323.0, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5748299319727891, |
|
"grad_norm": 25.668231964111328, |
|
"learning_rate": 8.795321637426902e-06, |
|
"loss": 1.6003, |
|
"num_tokens": 565410.0, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.5759637188208617, |
|
"grad_norm": 21.842506408691406, |
|
"learning_rate": 8.771929824561405e-06, |
|
"loss": 1.4388, |
|
"num_tokens": 566583.0, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5770975056689343, |
|
"grad_norm": 21.225013732910156, |
|
"learning_rate": 8.748538011695907e-06, |
|
"loss": 1.3724, |
|
"num_tokens": 567705.0, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.5782312925170068, |
|
"grad_norm": 22.67068862915039, |
|
"learning_rate": 8.72514619883041e-06, |
|
"loss": 1.3551, |
|
"num_tokens": 568800.0, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5793650793650794, |
|
"grad_norm": 21.34926986694336, |
|
"learning_rate": 8.701754385964914e-06, |
|
"loss": 1.3908, |
|
"num_tokens": 569913.0, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.5804988662131519, |
|
"grad_norm": 22.283945083618164, |
|
"learning_rate": 8.678362573099417e-06, |
|
"loss": 1.4563, |
|
"num_tokens": 571017.0, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.5816326530612245, |
|
"grad_norm": 20.993438720703125, |
|
"learning_rate": 8.654970760233919e-06, |
|
"loss": 1.3512, |
|
"num_tokens": 572126.0, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.5827664399092971, |
|
"grad_norm": 22.38656234741211, |
|
"learning_rate": 8.631578947368422e-06, |
|
"loss": 1.5402, |
|
"num_tokens": 573264.0, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5839002267573696, |
|
"grad_norm": 23.537073135375977, |
|
"learning_rate": 8.608187134502924e-06, |
|
"loss": 1.3946, |
|
"num_tokens": 574289.0, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5850340136054422, |
|
"grad_norm": 22.79361915588379, |
|
"learning_rate": 8.584795321637429e-06, |
|
"loss": 1.5559, |
|
"num_tokens": 575350.0, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5861678004535147, |
|
"grad_norm": 21.405969619750977, |
|
"learning_rate": 8.561403508771931e-06, |
|
"loss": 1.4803, |
|
"num_tokens": 576453.0, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.5873015873015873, |
|
"grad_norm": 23.284671783447266, |
|
"learning_rate": 8.538011695906434e-06, |
|
"loss": 1.5008, |
|
"num_tokens": 577497.0, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5884353741496599, |
|
"grad_norm": 21.37405776977539, |
|
"learning_rate": 8.514619883040936e-06, |
|
"loss": 1.4739, |
|
"num_tokens": 578563.0, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.5895691609977324, |
|
"grad_norm": 24.321304321289062, |
|
"learning_rate": 8.491228070175439e-06, |
|
"loss": 1.5819, |
|
"num_tokens": 579721.0, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.590702947845805, |
|
"grad_norm": 21.786588668823242, |
|
"learning_rate": 8.467836257309943e-06, |
|
"loss": 1.4846, |
|
"num_tokens": 580835.0, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.5918367346938775, |
|
"grad_norm": 22.301950454711914, |
|
"learning_rate": 8.444444444444446e-06, |
|
"loss": 1.4472, |
|
"num_tokens": 582031.0, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.5929705215419501, |
|
"grad_norm": 22.732772827148438, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 1.4803, |
|
"num_tokens": 583095.0, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.5941043083900227, |
|
"grad_norm": 21.661388397216797, |
|
"learning_rate": 8.39766081871345e-06, |
|
"loss": 1.3937, |
|
"num_tokens": 584233.0, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 20.343976974487305, |
|
"learning_rate": 8.374269005847953e-06, |
|
"loss": 1.353, |
|
"num_tokens": 585302.0, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5963718820861678, |
|
"grad_norm": 22.84857940673828, |
|
"learning_rate": 8.350877192982458e-06, |
|
"loss": 1.4354, |
|
"num_tokens": 586371.0, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.5975056689342404, |
|
"grad_norm": 22.94053077697754, |
|
"learning_rate": 8.32748538011696e-06, |
|
"loss": 1.5107, |
|
"num_tokens": 587512.0, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.5986394557823129, |
|
"grad_norm": 22.8999080657959, |
|
"learning_rate": 8.304093567251463e-06, |
|
"loss": 1.6091, |
|
"num_tokens": 588596.0, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5997732426303855, |
|
"grad_norm": 23.222049713134766, |
|
"learning_rate": 8.280701754385965e-06, |
|
"loss": 1.4075, |
|
"num_tokens": 589699.0, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.6009070294784581, |
|
"grad_norm": 20.573713302612305, |
|
"learning_rate": 8.257309941520468e-06, |
|
"loss": 1.5905, |
|
"num_tokens": 590891.0, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6020408163265306, |
|
"grad_norm": 21.398988723754883, |
|
"learning_rate": 8.233918128654972e-06, |
|
"loss": 1.4535, |
|
"num_tokens": 592028.0, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.6031746031746031, |
|
"grad_norm": 22.95913314819336, |
|
"learning_rate": 8.210526315789475e-06, |
|
"loss": 1.371, |
|
"num_tokens": 593213.0, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.6043083900226758, |
|
"grad_norm": 21.470468521118164, |
|
"learning_rate": 8.187134502923977e-06, |
|
"loss": 1.6771, |
|
"num_tokens": 594442.0, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.6054421768707483, |
|
"grad_norm": 22.546823501586914, |
|
"learning_rate": 8.16374269005848e-06, |
|
"loss": 1.5234, |
|
"num_tokens": 595601.0, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.6065759637188208, |
|
"grad_norm": 25.772891998291016, |
|
"learning_rate": 8.140350877192983e-06, |
|
"loss": 1.4929, |
|
"num_tokens": 596802.0, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.6077097505668935, |
|
"grad_norm": 22.497135162353516, |
|
"learning_rate": 8.116959064327487e-06, |
|
"loss": 1.4492, |
|
"num_tokens": 597920.0, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.608843537414966, |
|
"grad_norm": 24.795047760009766, |
|
"learning_rate": 8.09356725146199e-06, |
|
"loss": 1.3766, |
|
"num_tokens": 598998.0, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.6099773242630385, |
|
"grad_norm": 20.771310806274414, |
|
"learning_rate": 8.070175438596492e-06, |
|
"loss": 1.6282, |
|
"num_tokens": 600212.0, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 24.573957443237305, |
|
"learning_rate": 8.046783625730994e-06, |
|
"loss": 1.3345, |
|
"num_tokens": 601248.0, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.6122448979591837, |
|
"grad_norm": 20.914003372192383, |
|
"learning_rate": 8.023391812865497e-06, |
|
"loss": 1.4553, |
|
"num_tokens": 602367.0, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6133786848072562, |
|
"grad_norm": 22.030210494995117, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.4759, |
|
"num_tokens": 603524.0, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.6145124716553289, |
|
"grad_norm": 22.657468795776367, |
|
"learning_rate": 7.976608187134504e-06, |
|
"loss": 1.4916, |
|
"num_tokens": 604618.0, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.6156462585034014, |
|
"grad_norm": 23.671598434448242, |
|
"learning_rate": 7.953216374269006e-06, |
|
"loss": 1.3016, |
|
"num_tokens": 605667.0, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.6167800453514739, |
|
"grad_norm": 22.971860885620117, |
|
"learning_rate": 7.929824561403509e-06, |
|
"loss": 1.5056, |
|
"num_tokens": 606768.0, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.6179138321995464, |
|
"grad_norm": 22.38102912902832, |
|
"learning_rate": 7.906432748538012e-06, |
|
"loss": 1.5457, |
|
"num_tokens": 607836.0, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.6190476190476191, |
|
"grad_norm": 21.6705379486084, |
|
"learning_rate": 7.883040935672516e-06, |
|
"loss": 1.4587, |
|
"num_tokens": 608915.0, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.6201814058956916, |
|
"grad_norm": 22.290454864501953, |
|
"learning_rate": 7.859649122807018e-06, |
|
"loss": 1.5118, |
|
"num_tokens": 609979.0, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.6213151927437641, |
|
"grad_norm": 22.28474998474121, |
|
"learning_rate": 7.836257309941521e-06, |
|
"loss": 1.5042, |
|
"num_tokens": 611160.0, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.6224489795918368, |
|
"grad_norm": 23.13262176513672, |
|
"learning_rate": 7.812865497076024e-06, |
|
"loss": 1.4265, |
|
"num_tokens": 612235.0, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.6235827664399093, |
|
"grad_norm": 25.128522872924805, |
|
"learning_rate": 7.789473684210526e-06, |
|
"loss": 1.6554, |
|
"num_tokens": 613310.0, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6247165532879818, |
|
"grad_norm": 20.930503845214844, |
|
"learning_rate": 7.76608187134503e-06, |
|
"loss": 1.4841, |
|
"num_tokens": 614403.0, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.6258503401360545, |
|
"grad_norm": 21.933982849121094, |
|
"learning_rate": 7.742690058479533e-06, |
|
"loss": 1.4161, |
|
"num_tokens": 615531.0, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.626984126984127, |
|
"grad_norm": 20.2902889251709, |
|
"learning_rate": 7.719298245614036e-06, |
|
"loss": 1.4999, |
|
"num_tokens": 616780.0, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.6281179138321995, |
|
"grad_norm": 20.942724227905273, |
|
"learning_rate": 7.695906432748538e-06, |
|
"loss": 1.3684, |
|
"num_tokens": 617962.0, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.6292517006802721, |
|
"grad_norm": 21.612571716308594, |
|
"learning_rate": 7.67251461988304e-06, |
|
"loss": 1.5484, |
|
"num_tokens": 619151.0, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6303854875283447, |
|
"grad_norm": 22.789594650268555, |
|
"learning_rate": 7.649122807017545e-06, |
|
"loss": 1.4786, |
|
"num_tokens": 620263.0, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.6315192743764172, |
|
"grad_norm": 20.90570640563965, |
|
"learning_rate": 7.625730994152048e-06, |
|
"loss": 1.6095, |
|
"num_tokens": 621484.0, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.6326530612244898, |
|
"grad_norm": 25.00489616394043, |
|
"learning_rate": 7.60233918128655e-06, |
|
"loss": 1.6409, |
|
"num_tokens": 622514.0, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.6337868480725624, |
|
"grad_norm": 20.243322372436523, |
|
"learning_rate": 7.578947368421054e-06, |
|
"loss": 1.5188, |
|
"num_tokens": 623693.0, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.6349206349206349, |
|
"grad_norm": 22.97846221923828, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 1.5657, |
|
"num_tokens": 624782.0, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6360544217687075, |
|
"grad_norm": 24.399961471557617, |
|
"learning_rate": 7.5321637426900596e-06, |
|
"loss": 1.5053, |
|
"num_tokens": 625875.0, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.63718820861678, |
|
"grad_norm": 20.759233474731445, |
|
"learning_rate": 7.508771929824562e-06, |
|
"loss": 1.5293, |
|
"num_tokens": 627019.0, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.6383219954648526, |
|
"grad_norm": 21.16717529296875, |
|
"learning_rate": 7.485380116959065e-06, |
|
"loss": 1.3894, |
|
"num_tokens": 628139.0, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.6394557823129252, |
|
"grad_norm": 22.11092758178711, |
|
"learning_rate": 7.461988304093568e-06, |
|
"loss": 1.4995, |
|
"num_tokens": 629274.0, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.6405895691609977, |
|
"grad_norm": 27.72738265991211, |
|
"learning_rate": 7.438596491228071e-06, |
|
"loss": 1.3619, |
|
"num_tokens": 630243.0, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6417233560090703, |
|
"grad_norm": 20.804826736450195, |
|
"learning_rate": 7.415204678362574e-06, |
|
"loss": 1.4518, |
|
"num_tokens": 631375.0, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 21.256473541259766, |
|
"learning_rate": 7.391812865497077e-06, |
|
"loss": 1.4972, |
|
"num_tokens": 632486.0, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.6439909297052154, |
|
"grad_norm": 22.026762008666992, |
|
"learning_rate": 7.368421052631579e-06, |
|
"loss": 1.3551, |
|
"num_tokens": 633618.0, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.645124716553288, |
|
"grad_norm": 21.86498260498047, |
|
"learning_rate": 7.345029239766083e-06, |
|
"loss": 1.5152, |
|
"num_tokens": 634700.0, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.6462585034013606, |
|
"grad_norm": 22.414583206176758, |
|
"learning_rate": 7.321637426900585e-06, |
|
"loss": 1.3151, |
|
"num_tokens": 635812.0, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6473922902494331, |
|
"grad_norm": 21.637290954589844, |
|
"learning_rate": 7.298245614035089e-06, |
|
"loss": 1.4313, |
|
"num_tokens": 636943.0, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.6485260770975056, |
|
"grad_norm": 22.154354095458984, |
|
"learning_rate": 7.274853801169591e-06, |
|
"loss": 1.3439, |
|
"num_tokens": 638032.0, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.6496598639455783, |
|
"grad_norm": 24.491914749145508, |
|
"learning_rate": 7.251461988304094e-06, |
|
"loss": 1.5135, |
|
"num_tokens": 639185.0, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.6507936507936508, |
|
"grad_norm": 23.253528594970703, |
|
"learning_rate": 7.228070175438597e-06, |
|
"loss": 1.4544, |
|
"num_tokens": 640432.0, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.6519274376417233, |
|
"grad_norm": 22.823341369628906, |
|
"learning_rate": 7.2046783625731e-06, |
|
"loss": 1.326, |
|
"num_tokens": 641575.0, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"grad_norm": 22.95077896118164, |
|
"learning_rate": 7.181286549707603e-06, |
|
"loss": 1.6407, |
|
"num_tokens": 642791.0, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.6541950113378685, |
|
"grad_norm": 22.988304138183594, |
|
"learning_rate": 7.157894736842106e-06, |
|
"loss": 1.4079, |
|
"num_tokens": 643851.0, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.655328798185941, |
|
"grad_norm": 22.63962745666504, |
|
"learning_rate": 7.134502923976608e-06, |
|
"loss": 1.3522, |
|
"num_tokens": 644917.0, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.6564625850340136, |
|
"grad_norm": 26.570436477661133, |
|
"learning_rate": 7.111111111111112e-06, |
|
"loss": 1.5682, |
|
"num_tokens": 645970.0, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.6575963718820862, |
|
"grad_norm": 22.55550765991211, |
|
"learning_rate": 7.087719298245614e-06, |
|
"loss": 1.4318, |
|
"num_tokens": 647052.0, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6587301587301587, |
|
"grad_norm": 22.61783218383789, |
|
"learning_rate": 7.064327485380118e-06, |
|
"loss": 1.4444, |
|
"num_tokens": 648119.0, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.6598639455782312, |
|
"grad_norm": 22.553382873535156, |
|
"learning_rate": 7.04093567251462e-06, |
|
"loss": 1.359, |
|
"num_tokens": 649162.0, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.6609977324263039, |
|
"grad_norm": 21.678308486938477, |
|
"learning_rate": 7.017543859649123e-06, |
|
"loss": 1.3491, |
|
"num_tokens": 650283.0, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.6621315192743764, |
|
"grad_norm": 21.3242130279541, |
|
"learning_rate": 6.994152046783626e-06, |
|
"loss": 1.4176, |
|
"num_tokens": 651440.0, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.6632653061224489, |
|
"grad_norm": 24.34210777282715, |
|
"learning_rate": 6.970760233918129e-06, |
|
"loss": 1.5172, |
|
"num_tokens": 652549.0, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6643990929705216, |
|
"grad_norm": 24.187849044799805, |
|
"learning_rate": 6.947368421052632e-06, |
|
"loss": 1.5697, |
|
"num_tokens": 653590.0, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.6655328798185941, |
|
"grad_norm": 22.247058868408203, |
|
"learning_rate": 6.923976608187135e-06, |
|
"loss": 1.3541, |
|
"num_tokens": 654714.0, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 21.859542846679688, |
|
"learning_rate": 6.9005847953216375e-06, |
|
"loss": 1.492, |
|
"num_tokens": 655964.0, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.6678004535147393, |
|
"grad_norm": 22.576213836669922, |
|
"learning_rate": 6.877192982456141e-06, |
|
"loss": 1.4299, |
|
"num_tokens": 657065.0, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.6689342403628118, |
|
"grad_norm": 22.9122257232666, |
|
"learning_rate": 6.8538011695906435e-06, |
|
"loss": 1.4698, |
|
"num_tokens": 658255.0, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6700680272108843, |
|
"grad_norm": 23.04901123046875, |
|
"learning_rate": 6.830409356725147e-06, |
|
"loss": 1.5064, |
|
"num_tokens": 659325.0, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.671201814058957, |
|
"grad_norm": 23.730091094970703, |
|
"learning_rate": 6.8070175438596495e-06, |
|
"loss": 1.4925, |
|
"num_tokens": 660485.0, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.6723356009070295, |
|
"grad_norm": 22.81539535522461, |
|
"learning_rate": 6.783625730994152e-06, |
|
"loss": 1.3731, |
|
"num_tokens": 661513.0, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.673469387755102, |
|
"grad_norm": 23.25772476196289, |
|
"learning_rate": 6.7602339181286555e-06, |
|
"loss": 1.4893, |
|
"num_tokens": 662618.0, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.6746031746031746, |
|
"grad_norm": 23.883750915527344, |
|
"learning_rate": 6.736842105263158e-06, |
|
"loss": 1.2276, |
|
"num_tokens": 663737.0, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6757369614512472, |
|
"grad_norm": 23.94465446472168, |
|
"learning_rate": 6.7134502923976615e-06, |
|
"loss": 1.4642, |
|
"num_tokens": 664857.0, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6768707482993197, |
|
"grad_norm": 22.831388473510742, |
|
"learning_rate": 6.690058479532164e-06, |
|
"loss": 1.4457, |
|
"num_tokens": 665954.0, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.6780045351473923, |
|
"grad_norm": 22.41533851623535, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 1.5384, |
|
"num_tokens": 667045.0, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6791383219954649, |
|
"grad_norm": 21.215373992919922, |
|
"learning_rate": 6.64327485380117e-06, |
|
"loss": 1.5598, |
|
"num_tokens": 668272.0, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.6802721088435374, |
|
"grad_norm": 21.893632888793945, |
|
"learning_rate": 6.619883040935673e-06, |
|
"loss": 1.5232, |
|
"num_tokens": 669382.0, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.68140589569161, |
|
"grad_norm": 22.526578903198242, |
|
"learning_rate": 6.596491228070177e-06, |
|
"loss": 1.5125, |
|
"num_tokens": 670453.0, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.6825396825396826, |
|
"grad_norm": 22.798067092895508, |
|
"learning_rate": 6.573099415204679e-06, |
|
"loss": 1.4695, |
|
"num_tokens": 671545.0, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.6836734693877551, |
|
"grad_norm": 21.24146842956543, |
|
"learning_rate": 6.549707602339181e-06, |
|
"loss": 1.46, |
|
"num_tokens": 672697.0, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.6848072562358276, |
|
"grad_norm": 24.16074562072754, |
|
"learning_rate": 6.526315789473685e-06, |
|
"loss": 1.4916, |
|
"num_tokens": 673798.0, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6859410430839002, |
|
"grad_norm": 21.984033584594727, |
|
"learning_rate": 6.502923976608187e-06, |
|
"loss": 1.3227, |
|
"num_tokens": 674822.0, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6870748299319728, |
|
"grad_norm": 23.14649772644043, |
|
"learning_rate": 6.4795321637426915e-06, |
|
"loss": 1.5123, |
|
"num_tokens": 675953.0, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6882086167800453, |
|
"grad_norm": 21.985637664794922, |
|
"learning_rate": 6.456140350877193e-06, |
|
"loss": 1.5573, |
|
"num_tokens": 677069.0, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.6893424036281179, |
|
"grad_norm": 22.13504409790039, |
|
"learning_rate": 6.432748538011696e-06, |
|
"loss": 1.5341, |
|
"num_tokens": 678207.0, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6904761904761905, |
|
"grad_norm": 22.876258850097656, |
|
"learning_rate": 6.4093567251462e-06, |
|
"loss": 1.5077, |
|
"num_tokens": 679333.0, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.691609977324263, |
|
"grad_norm": 22.648876190185547, |
|
"learning_rate": 6.385964912280702e-06, |
|
"loss": 1.5165, |
|
"num_tokens": 680472.0, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6927437641723356, |
|
"grad_norm": 24.44489860534668, |
|
"learning_rate": 6.362573099415206e-06, |
|
"loss": 1.4595, |
|
"num_tokens": 681550.0, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.6938775510204082, |
|
"grad_norm": 23.062849044799805, |
|
"learning_rate": 6.339181286549709e-06, |
|
"loss": 1.5479, |
|
"num_tokens": 682665.0, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6950113378684807, |
|
"grad_norm": 25.6104793548584, |
|
"learning_rate": 6.31578947368421e-06, |
|
"loss": 1.4372, |
|
"num_tokens": 683797.0, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.6961451247165533, |
|
"grad_norm": 22.21469497680664, |
|
"learning_rate": 6.292397660818715e-06, |
|
"loss": 1.4886, |
|
"num_tokens": 684941.0, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.6972789115646258, |
|
"grad_norm": 22.583446502685547, |
|
"learning_rate": 6.269005847953217e-06, |
|
"loss": 1.4647, |
|
"num_tokens": 686074.0, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6984126984126984, |
|
"grad_norm": 23.686786651611328, |
|
"learning_rate": 6.245614035087721e-06, |
|
"loss": 1.3958, |
|
"num_tokens": 687271.0, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.699546485260771, |
|
"grad_norm": 23.05241584777832, |
|
"learning_rate": 6.222222222222223e-06, |
|
"loss": 1.4258, |
|
"num_tokens": 688341.0, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.7006802721088435, |
|
"grad_norm": 23.340354919433594, |
|
"learning_rate": 6.198830409356725e-06, |
|
"loss": 1.5196, |
|
"num_tokens": 689435.0, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.7018140589569161, |
|
"grad_norm": 22.070066452026367, |
|
"learning_rate": 6.175438596491229e-06, |
|
"loss": 1.3828, |
|
"num_tokens": 690569.0, |
|
"step": 619 |
|
}, |
|
{
"epoch": 0.7029478458049887,
"grad_norm": 23.374649047851562,
"learning_rate": 6.152046783625732e-06,
"loss": 1.4309,
"num_tokens": 691672.0,
"step": 620
},
{
"epoch": 0.7040816326530612,
"grad_norm": 23.68497085571289,
"learning_rate": 6.128654970760235e-06,
"loss": 1.4063,
"num_tokens": 692947.0,
"step": 621
},
{
"epoch": 0.7052154195011338,
"grad_norm": 22.70351791381836,
"learning_rate": 6.105263157894738e-06,
"loss": 1.4569,
"num_tokens": 694052.0,
"step": 622
},
{
"epoch": 0.7063492063492064,
"grad_norm": 21.42176055908203,
"learning_rate": 6.08187134502924e-06,
"loss": 1.4617,
"num_tokens": 695248.0,
"step": 623
},
{
"epoch": 0.7074829931972789,
"grad_norm": 22.26094627380371,
"learning_rate": 6.058479532163744e-06,
"loss": 1.516,
"num_tokens": 696356.0,
"step": 624
},
{
"epoch": 0.7086167800453514,
"grad_norm": 26.08213233947754,
"learning_rate": 6.035087719298246e-06,
"loss": 1.4949,
"num_tokens": 697517.0,
"step": 625
},
{
"epoch": 0.7097505668934241,
"grad_norm": 22.348121643066406,
"learning_rate": 6.01169590643275e-06,
"loss": 1.4339,
"num_tokens": 698582.0,
"step": 626
},
{
"epoch": 0.7108843537414966,
"grad_norm": 22.22833251953125,
"learning_rate": 5.988304093567252e-06,
"loss": 1.4808,
"num_tokens": 699745.0,
"step": 627
},
{
"epoch": 0.7120181405895691,
"grad_norm": 23.78014373779297,
"learning_rate": 5.964912280701755e-06,
"loss": 1.4515,
"num_tokens": 700778.0,
"step": 628
},
{
"epoch": 0.7131519274376418,
"grad_norm": 23.705251693725586,
"learning_rate": 5.941520467836258e-06,
"loss": 1.4206,
"num_tokens": 701913.0,
"step": 629
},
{
"epoch": 0.7142857142857143,
"grad_norm": 20.9232234954834,
"learning_rate": 5.918128654970761e-06,
"loss": 1.337,
"num_tokens": 703034.0,
"step": 630
},
{
"epoch": 0.7154195011337868,
"grad_norm": 20.941648483276367,
"learning_rate": 5.8947368421052634e-06,
"loss": 1.3962,
"num_tokens": 704183.0,
"step": 631
},
{
"epoch": 0.7165532879818595,
"grad_norm": 21.786916732788086,
"learning_rate": 5.871345029239767e-06,
"loss": 1.3946,
"num_tokens": 705285.0,
"step": 632
},
{
"epoch": 0.717687074829932,
"grad_norm": 24.243087768554688,
"learning_rate": 5.847953216374269e-06,
"loss": 1.4448,
"num_tokens": 706394.0,
"step": 633
},
{
"epoch": 0.7188208616780045,
"grad_norm": 24.048946380615234,
"learning_rate": 5.824561403508773e-06,
"loss": 1.5736,
"num_tokens": 707507.0,
"step": 634
},
{
"epoch": 0.719954648526077,
"grad_norm": 20.490083694458008,
"learning_rate": 5.801169590643275e-06,
"loss": 1.531,
"num_tokens": 708763.0,
"step": 635
},
{
"epoch": 0.7210884353741497,
"grad_norm": 23.876779556274414,
"learning_rate": 5.777777777777778e-06,
"loss": 1.4557,
"num_tokens": 709792.0,
"step": 636
},
{
"epoch": 0.7222222222222222,
"grad_norm": 24.66008186340332,
"learning_rate": 5.754385964912281e-06,
"loss": 1.517,
"num_tokens": 710874.0,
"step": 637
},
{
"epoch": 0.7233560090702947,
"grad_norm": 20.79994010925293,
"learning_rate": 5.730994152046784e-06,
"loss": 1.3886,
"num_tokens": 712053.0,
"step": 638
},
{
"epoch": 0.7244897959183674,
"grad_norm": 23.969280242919922,
"learning_rate": 5.707602339181287e-06,
"loss": 1.3954,
"num_tokens": 713114.0,
"step": 639
},
{
"epoch": 0.7256235827664399,
"grad_norm": 23.537185668945312,
"learning_rate": 5.68421052631579e-06,
"loss": 1.3893,
"num_tokens": 714201.0,
"step": 640
},
{
"epoch": 0.7267573696145124,
"grad_norm": 22.46196937561035,
"learning_rate": 5.6608187134502925e-06,
"loss": 1.5846,
"num_tokens": 715281.0,
"step": 641
},
{
"epoch": 0.7278911564625851,
"grad_norm": 20.890539169311523,
"learning_rate": 5.637426900584796e-06,
"loss": 1.4379,
"num_tokens": 716404.0,
"step": 642
},
{
"epoch": 0.7290249433106576,
"grad_norm": 21.008331298828125,
"learning_rate": 5.6140350877192985e-06,
"loss": 1.4799,
"num_tokens": 717488.0,
"step": 643
},
{
"epoch": 0.7301587301587301,
"grad_norm": 22.868465423583984,
"learning_rate": 5.590643274853802e-06,
"loss": 1.484,
"num_tokens": 718571.0,
"step": 644
},
{
"epoch": 0.7312925170068028,
"grad_norm": 23.07869529724121,
"learning_rate": 5.5672514619883045e-06,
"loss": 1.484,
"num_tokens": 719716.0,
"step": 645
},
{
"epoch": 0.7324263038548753,
"grad_norm": 24.570358276367188,
"learning_rate": 5.543859649122807e-06,
"loss": 1.5601,
"num_tokens": 720905.0,
"step": 646
},
{
"epoch": 0.7335600907029478,
"grad_norm": 26.3187255859375,
"learning_rate": 5.5204678362573105e-06,
"loss": 1.5891,
"num_tokens": 722076.0,
"step": 647
},
{
"epoch": 0.7346938775510204,
"grad_norm": 22.848508834838867,
"learning_rate": 5.497076023391813e-06,
"loss": 1.3089,
"num_tokens": 723183.0,
"step": 648
},
{
"epoch": 0.735827664399093,
"grad_norm": 22.79510498046875,
"learning_rate": 5.4736842105263165e-06,
"loss": 1.5494,
"num_tokens": 724329.0,
"step": 649
},
{
"epoch": 0.7369614512471655,
"grad_norm": 20.732099533081055,
"learning_rate": 5.450292397660819e-06,
"loss": 1.4599,
"num_tokens": 725447.0,
"step": 650
},
{
"epoch": 0.7380952380952381,
"grad_norm": 22.604984283447266,
"learning_rate": 5.426900584795322e-06,
"loss": 1.3155,
"num_tokens": 726477.0,
"step": 651
},
{
"epoch": 0.7392290249433107,
"grad_norm": 21.90055274963379,
"learning_rate": 5.403508771929825e-06,
"loss": 1.5524,
"num_tokens": 727606.0,
"step": 652
},
{
"epoch": 0.7403628117913832,
"grad_norm": 21.966693878173828,
"learning_rate": 5.380116959064328e-06,
"loss": 1.5046,
"num_tokens": 728718.0,
"step": 653
},
{
"epoch": 0.7414965986394558,
"grad_norm": 25.208890914916992,
"learning_rate": 5.356725146198831e-06,
"loss": 1.6362,
"num_tokens": 729745.0,
"step": 654
},
{
"epoch": 0.7426303854875284,
"grad_norm": 24.090883255004883,
"learning_rate": 5.333333333333334e-06,
"loss": 1.3422,
"num_tokens": 730924.0,
"step": 655
},
{
"epoch": 0.7437641723356009,
"grad_norm": 22.79339599609375,
"learning_rate": 5.309941520467836e-06,
"loss": 1.3421,
"num_tokens": 731980.0,
"step": 656
},
{
"epoch": 0.7448979591836735,
"grad_norm": 23.833890914916992,
"learning_rate": 5.28654970760234e-06,
"loss": 1.5008,
"num_tokens": 733073.0,
"step": 657
},
{
"epoch": 0.746031746031746,
"grad_norm": 19.78729820251465,
"learning_rate": 5.263157894736842e-06,
"loss": 1.3997,
"num_tokens": 734355.0,
"step": 658
},
{
"epoch": 0.7471655328798186,
"grad_norm": 24.19782829284668,
"learning_rate": 5.239766081871346e-06,
"loss": 1.4371,
"num_tokens": 735425.0,
"step": 659
},
{
"epoch": 0.7482993197278912,
"grad_norm": 23.320068359375,
"learning_rate": 5.216374269005848e-06,
"loss": 1.3869,
"num_tokens": 736522.0,
"step": 660
},
{
"epoch": 0.7494331065759637,
"grad_norm": 22.0587158203125,
"learning_rate": 5.192982456140351e-06,
"loss": 1.4641,
"num_tokens": 737633.0,
"step": 661
},
{
"epoch": 0.7505668934240363,
"grad_norm": 22.994808197021484,
"learning_rate": 5.169590643274854e-06,
"loss": 1.4685,
"num_tokens": 738753.0,
"step": 662
},
{
"epoch": 0.7517006802721088,
"grad_norm": 22.301300048828125,
"learning_rate": 5.146198830409357e-06,
"loss": 1.4651,
"num_tokens": 739872.0,
"step": 663
},
{
"epoch": 0.7528344671201814,
"grad_norm": 21.97808837890625,
"learning_rate": 5.12280701754386e-06,
"loss": 1.4382,
"num_tokens": 741008.0,
"step": 664
},
|
{
"epoch": 0.753968253968254,
"grad_norm": 21.360986709594727,
"learning_rate": 5.099415204678363e-06,
"loss": 1.5411,
"num_tokens": 742227.0,
"step": 665
},
{
"epoch": 0.7551020408163265,
"grad_norm": 22.520597457885742,
"learning_rate": 5.076023391812865e-06,
"loss": 1.4047,
"num_tokens": 743294.0,
"step": 666
},
{
"epoch": 0.7562358276643991,
"grad_norm": 22.39497184753418,
"learning_rate": 5.052631578947369e-06,
"loss": 1.3934,
"num_tokens": 744498.0,
"step": 667
},
{
"epoch": 0.7573696145124716,
"grad_norm": 21.658981323242188,
"learning_rate": 5.029239766081871e-06,
"loss": 1.4161,
"num_tokens": 745753.0,
"step": 668
},
{
"epoch": 0.7585034013605442,
"grad_norm": 22.147428512573242,
"learning_rate": 5.005847953216375e-06,
"loss": 1.365,
"num_tokens": 746899.0,
"step": 669
},
{
"epoch": 0.7596371882086168,
"grad_norm": 22.580663681030273,
"learning_rate": 4.982456140350877e-06,
"loss": 1.5026,
"num_tokens": 747982.0,
"step": 670
},
{
"epoch": 0.7607709750566893,
"grad_norm": 22.9129581451416,
"learning_rate": 4.959064327485381e-06,
"loss": 1.3533,
"num_tokens": 749043.0,
"step": 671
},
{
"epoch": 0.7619047619047619,
"grad_norm": 20.52309226989746,
"learning_rate": 4.935672514619883e-06,
"loss": 1.3866,
"num_tokens": 750230.0,
"step": 672
},
{
"epoch": 0.7630385487528345,
"grad_norm": 25.36422348022461,
"learning_rate": 4.912280701754386e-06,
"loss": 1.5019,
"num_tokens": 751311.0,
"step": 673
},
{
"epoch": 0.764172335600907,
"grad_norm": 20.621496200561523,
"learning_rate": 4.888888888888889e-06,
"loss": 1.4865,
"num_tokens": 752533.0,
"step": 674
},
{
"epoch": 0.7653061224489796,
"grad_norm": 25.740957260131836,
"learning_rate": 4.865497076023392e-06,
"loss": 1.363,
"num_tokens": 753621.0,
"step": 675
},
{
"epoch": 0.7664399092970522,
"grad_norm": 22.625244140625,
"learning_rate": 4.842105263157895e-06,
"loss": 1.4387,
"num_tokens": 754720.0,
"step": 676
},
{
"epoch": 0.7675736961451247,
"grad_norm": 23.55186653137207,
"learning_rate": 4.818713450292398e-06,
"loss": 1.4493,
"num_tokens": 755780.0,
"step": 677
},
{
"epoch": 0.7687074829931972,
"grad_norm": 22.29022789001465,
"learning_rate": 4.7953216374269005e-06,
"loss": 1.4572,
"num_tokens": 756845.0,
"step": 678
},
{
"epoch": 0.7698412698412699,
"grad_norm": 23.95270538330078,
"learning_rate": 4.771929824561404e-06,
"loss": 1.4394,
"num_tokens": 758014.0,
"step": 679
},
{
"epoch": 0.7709750566893424,
"grad_norm": 22.06515121459961,
"learning_rate": 4.7485380116959065e-06,
"loss": 1.5034,
"num_tokens": 759116.0,
"step": 680
},
{
"epoch": 0.7721088435374149,
"grad_norm": 21.582685470581055,
"learning_rate": 4.72514619883041e-06,
"loss": 1.4249,
"num_tokens": 760209.0,
"step": 681
},
{
"epoch": 0.7732426303854876,
"grad_norm": 23.25868797302246,
"learning_rate": 4.7017543859649125e-06,
"loss": 1.4951,
"num_tokens": 761296.0,
"step": 682
},
{
"epoch": 0.7743764172335601,
"grad_norm": 22.286924362182617,
"learning_rate": 4.678362573099415e-06,
"loss": 1.473,
"num_tokens": 762380.0,
"step": 683
},
{
"epoch": 0.7755102040816326,
"grad_norm": 22.370563507080078,
"learning_rate": 4.6549707602339184e-06,
"loss": 1.3724,
"num_tokens": 763416.0,
"step": 684
},
{
"epoch": 0.7766439909297053,
"grad_norm": 22.624408721923828,
"learning_rate": 4.631578947368421e-06,
"loss": 1.3322,
"num_tokens": 764519.0,
"step": 685
},
{
"epoch": 0.7777777777777778,
"grad_norm": 22.60655403137207,
"learning_rate": 4.6081871345029244e-06,
"loss": 1.4132,
"num_tokens": 765685.0,
"step": 686
},
{
"epoch": 0.7789115646258503,
"grad_norm": 22.81845474243164,
"learning_rate": 4.584795321637428e-06,
"loss": 1.4029,
"num_tokens": 766859.0,
"step": 687
},
{
"epoch": 0.780045351473923,
"grad_norm": 22.1308536529541,
"learning_rate": 4.56140350877193e-06,
"loss": 1.3881,
"num_tokens": 768070.0,
"step": 688
},
{
"epoch": 0.7811791383219955,
"grad_norm": 23.998682022094727,
"learning_rate": 4.538011695906433e-06,
"loss": 1.5834,
"num_tokens": 769163.0,
"step": 689
},
{
"epoch": 0.782312925170068,
"grad_norm": 23.69707489013672,
"learning_rate": 4.5146198830409364e-06,
"loss": 1.5529,
"num_tokens": 770331.0,
"step": 690
},
{
"epoch": 0.7834467120181405,
"grad_norm": 24.159103393554688,
"learning_rate": 4.491228070175439e-06,
"loss": 1.3963,
"num_tokens": 771348.0,
"step": 691
},
{
"epoch": 0.7845804988662132,
"grad_norm": 22.975770950317383,
"learning_rate": 4.467836257309942e-06,
"loss": 1.3646,
"num_tokens": 772423.0,
"step": 692
},
{
"epoch": 0.7857142857142857,
"grad_norm": 24.724035263061523,
"learning_rate": 4.444444444444444e-06,
"loss": 1.3742,
"num_tokens": 773491.0,
"step": 693
},
{
"epoch": 0.7868480725623582,
"grad_norm": 23.44858741760254,
"learning_rate": 4.4210526315789476e-06,
"loss": 1.5088,
"num_tokens": 774638.0,
"step": 694
},
{
"epoch": 0.7879818594104309,
"grad_norm": 23.948352813720703,
"learning_rate": 4.397660818713451e-06,
"loss": 1.7207,
"num_tokens": 775775.0,
"step": 695
},
{
"epoch": 0.7891156462585034,
"grad_norm": 21.847509384155273,
"learning_rate": 4.3742690058479536e-06,
"loss": 1.5141,
"num_tokens": 776879.0,
"step": 696
},
{
"epoch": 0.7902494331065759,
"grad_norm": 23.483224868774414,
"learning_rate": 4.350877192982457e-06,
"loss": 1.5361,
"num_tokens": 778025.0,
"step": 697
},
{
"epoch": 0.7913832199546486,
"grad_norm": 22.497262954711914,
"learning_rate": 4.3274853801169596e-06,
"loss": 1.5269,
"num_tokens": 779182.0,
"step": 698
},
{
"epoch": 0.7925170068027211,
"grad_norm": 24.064851760864258,
"learning_rate": 4.304093567251462e-06,
"loss": 1.4504,
"num_tokens": 780292.0,
"step": 699
},
{
"epoch": 0.7936507936507936,
"grad_norm": 25.360252380371094,
"learning_rate": 4.2807017543859656e-06,
"loss": 1.358,
"num_tokens": 781293.0,
"step": 700
},
{
"epoch": 0.7947845804988662,
"grad_norm": 22.048860549926758,
"learning_rate": 4.257309941520468e-06,
"loss": 1.4928,
"num_tokens": 782489.0,
"step": 701
},
{
"epoch": 0.7959183673469388,
"grad_norm": 25.04562759399414,
"learning_rate": 4.2339181286549715e-06,
"loss": 1.4883,
"num_tokens": 783575.0,
"step": 702
},
{
"epoch": 0.7970521541950113,
"grad_norm": 23.770830154418945,
"learning_rate": 4.210526315789474e-06,
"loss": 1.512,
"num_tokens": 784656.0,
"step": 703
},
{
"epoch": 0.7981859410430839,
"grad_norm": 22.704742431640625,
"learning_rate": 4.187134502923977e-06,
"loss": 1.3788,
"num_tokens": 785780.0,
"step": 704
},
{
"epoch": 0.7993197278911565,
"grad_norm": 21.932666778564453,
"learning_rate": 4.16374269005848e-06,
"loss": 1.381,
"num_tokens": 786993.0,
"step": 705
},
{
"epoch": 0.800453514739229,
"grad_norm": 22.353124618530273,
"learning_rate": 4.140350877192983e-06,
"loss": 1.3069,
"num_tokens": 788042.0,
"step": 706
},
{
"epoch": 0.8015873015873016,
"grad_norm": 23.022003173828125,
"learning_rate": 4.116959064327486e-06,
"loss": 1.5089,
"num_tokens": 789182.0,
"step": 707
},
{
"epoch": 0.8027210884353742,
"grad_norm": 22.555904388427734,
"learning_rate": 4.093567251461989e-06,
"loss": 1.4996,
"num_tokens": 790326.0,
"step": 708
},
{
"epoch": 0.8038548752834467,
"grad_norm": 22.719186782836914,
"learning_rate": 4.070175438596491e-06,
"loss": 1.5449,
"num_tokens": 791473.0,
"step": 709
},
|
{
"epoch": 0.8049886621315193,
"grad_norm": 21.924585342407227,
"learning_rate": 4.046783625730995e-06,
"loss": 1.3993,
"num_tokens": 792646.0,
"step": 710
},
{
"epoch": 0.8061224489795918,
"grad_norm": 20.699281692504883,
"learning_rate": 4.023391812865497e-06,
"loss": 1.4973,
"num_tokens": 793797.0,
"step": 711
},
{
"epoch": 0.8072562358276644,
"grad_norm": 22.318761825561523,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4515,
"num_tokens": 794861.0,
"step": 712
},
{
"epoch": 0.808390022675737,
"grad_norm": 21.21175765991211,
"learning_rate": 3.976608187134503e-06,
"loss": 1.3681,
"num_tokens": 796010.0,
"step": 713
},
{
"epoch": 0.8095238095238095,
"grad_norm": 22.589601516723633,
"learning_rate": 3.953216374269006e-06,
"loss": 1.4962,
"num_tokens": 797073.0,
"step": 714
},
{
"epoch": 0.8106575963718821,
"grad_norm": 21.697351455688477,
"learning_rate": 3.929824561403509e-06,
"loss": 1.4605,
"num_tokens": 798194.0,
"step": 715
},
{
"epoch": 0.8117913832199547,
"grad_norm": 21.772241592407227,
"learning_rate": 3.906432748538012e-06,
"loss": 1.4382,
"num_tokens": 799326.0,
"step": 716
},
{
"epoch": 0.8129251700680272,
"grad_norm": 21.598237991333008,
"learning_rate": 3.883040935672515e-06,
"loss": 1.4977,
"num_tokens": 800462.0,
"step": 717
},
{
"epoch": 0.8140589569160998,
"grad_norm": 21.325088500976562,
"learning_rate": 3.859649122807018e-06,
"loss": 1.5207,
"num_tokens": 801660.0,
"step": 718
},
{
"epoch": 0.8151927437641724,
"grad_norm": 24.253833770751953,
"learning_rate": 3.83625730994152e-06,
"loss": 1.4913,
"num_tokens": 802681.0,
"step": 719
},
{
"epoch": 0.8163265306122449,
"grad_norm": 22.219438552856445,
"learning_rate": 3.812865497076024e-06,
"loss": 1.4802,
"num_tokens": 803845.0,
"step": 720
},
{
"epoch": 0.8174603174603174,
"grad_norm": 22.79910659790039,
"learning_rate": 3.789473684210527e-06,
"loss": 1.4913,
"num_tokens": 804919.0,
"step": 721
},
{
"epoch": 0.81859410430839,
"grad_norm": 22.189367294311523,
"learning_rate": 3.7660818713450298e-06,
"loss": 1.4382,
"num_tokens": 806066.0,
"step": 722
},
{
"epoch": 0.8197278911564626,
"grad_norm": 21.064807891845703,
"learning_rate": 3.7426900584795324e-06,
"loss": 1.5483,
"num_tokens": 807298.0,
"step": 723
},
{
"epoch": 0.8208616780045351,
"grad_norm": 22.164886474609375,
"learning_rate": 3.7192982456140354e-06,
"loss": 1.4852,
"num_tokens": 808431.0,
"step": 724
},
{
"epoch": 0.8219954648526077,
"grad_norm": 22.567136764526367,
"learning_rate": 3.6959064327485384e-06,
"loss": 1.5195,
"num_tokens": 809525.0,
"step": 725
},
{
"epoch": 0.8231292517006803,
"grad_norm": 22.86061668395996,
"learning_rate": 3.6725146198830414e-06,
"loss": 1.4455,
"num_tokens": 810602.0,
"step": 726
},
{
"epoch": 0.8242630385487528,
"grad_norm": 24.17923927307129,
"learning_rate": 3.6491228070175443e-06,
"loss": 1.4668,
"num_tokens": 811652.0,
"step": 727
},
{
"epoch": 0.8253968253968254,
"grad_norm": 23.038942337036133,
"learning_rate": 3.625730994152047e-06,
"loss": 1.5045,
"num_tokens": 812716.0,
"step": 728
},
{
"epoch": 0.826530612244898,
"grad_norm": 21.556453704833984,
"learning_rate": 3.60233918128655e-06,
"loss": 1.3505,
"num_tokens": 813814.0,
"step": 729
},
{
"epoch": 0.8276643990929705,
"grad_norm": 21.08284568786621,
"learning_rate": 3.578947368421053e-06,
"loss": 1.3864,
"num_tokens": 814929.0,
"step": 730
},
{
"epoch": 0.828798185941043,
"grad_norm": 23.882568359375,
"learning_rate": 3.555555555555556e-06,
"loss": 1.3743,
"num_tokens": 815979.0,
"step": 731
},
{
"epoch": 0.8299319727891157,
"grad_norm": 24.24713897705078,
"learning_rate": 3.532163742690059e-06,
"loss": 1.509,
"num_tokens": 817290.0,
"step": 732
},
{
"epoch": 0.8310657596371882,
"grad_norm": 22.145536422729492,
"learning_rate": 3.5087719298245615e-06,
"loss": 1.501,
"num_tokens": 818392.0,
"step": 733
},
{
"epoch": 0.8321995464852607,
"grad_norm": 21.552515029907227,
"learning_rate": 3.4853801169590645e-06,
"loss": 1.3933,
"num_tokens": 819609.0,
"step": 734
},
{
"epoch": 0.8333333333333334,
"grad_norm": 25.56783676147461,
"learning_rate": 3.4619883040935675e-06,
"loss": 1.583,
"num_tokens": 820656.0,
"step": 735
},
{
"epoch": 0.8344671201814059,
"grad_norm": 22.9251766204834,
"learning_rate": 3.4385964912280705e-06,
"loss": 1.3717,
"num_tokens": 821762.0,
"step": 736
},
{
"epoch": 0.8356009070294784,
"grad_norm": 24.673242568969727,
"learning_rate": 3.4152046783625735e-06,
"loss": 1.5318,
"num_tokens": 822936.0,
"step": 737
},
{
"epoch": 0.8367346938775511,
"grad_norm": 22.047882080078125,
"learning_rate": 3.391812865497076e-06,
"loss": 1.4858,
"num_tokens": 824105.0,
"step": 738
},
{
"epoch": 0.8378684807256236,
"grad_norm": 26.981224060058594,
"learning_rate": 3.368421052631579e-06,
"loss": 1.6137,
"num_tokens": 825116.0,
"step": 739
},
{
"epoch": 0.8390022675736961,
"grad_norm": 24.74795913696289,
"learning_rate": 3.345029239766082e-06,
"loss": 1.4405,
"num_tokens": 826109.0,
"step": 740
},
{
"epoch": 0.8401360544217688,
"grad_norm": 23.003847122192383,
"learning_rate": 3.321637426900585e-06,
"loss": 1.4222,
"num_tokens": 827151.0,
"step": 741
},
{
"epoch": 0.8412698412698413,
"grad_norm": 25.1625919342041,
"learning_rate": 3.2982456140350885e-06,
"loss": 1.538,
"num_tokens": 828251.0,
"step": 742
},
{
"epoch": 0.8424036281179138,
"grad_norm": 23.558082580566406,
"learning_rate": 3.2748538011695906e-06,
"loss": 1.2967,
"num_tokens": 829406.0,
"step": 743
},
{
"epoch": 0.8435374149659864,
"grad_norm": 22.346006393432617,
"learning_rate": 3.2514619883040936e-06,
"loss": 1.3927,
"num_tokens": 830517.0,
"step": 744
},
{
"epoch": 0.844671201814059,
"grad_norm": 21.72648048400879,
"learning_rate": 3.2280701754385966e-06,
"loss": 1.5264,
"num_tokens": 831751.0,
"step": 745
},
{
"epoch": 0.8458049886621315,
"grad_norm": 23.65290641784668,
"learning_rate": 3.2046783625731e-06,
"loss": 1.5314,
"num_tokens": 832836.0,
"step": 746
},
{
"epoch": 0.8469387755102041,
"grad_norm": 22.38150978088379,
"learning_rate": 3.181286549707603e-06,
"loss": 1.6097,
"num_tokens": 834005.0,
"step": 747
},
{
"epoch": 0.8480725623582767,
"grad_norm": 23.84773826599121,
"learning_rate": 3.157894736842105e-06,
"loss": 1.422,
"num_tokens": 835035.0,
"step": 748
},
{
"epoch": 0.8492063492063492,
"grad_norm": 23.476564407348633,
"learning_rate": 3.1345029239766086e-06,
"loss": 1.4523,
"num_tokens": 836174.0,
"step": 749
},
{
"epoch": 0.8503401360544217,
"grad_norm": 24.975629806518555,
"learning_rate": 3.1111111111111116e-06,
"loss": 1.4704,
"num_tokens": 837345.0,
"step": 750
},
{
"epoch": 0.8514739229024944,
"grad_norm": 25.92818260192871,
"learning_rate": 3.0877192982456146e-06,
"loss": 1.354,
"num_tokens": 838470.0,
"step": 751
},
{
"epoch": 0.8526077097505669,
"grad_norm": 21.76276969909668,
"learning_rate": 3.0643274853801176e-06,
"loss": 1.2758,
"num_tokens": 839542.0,
"step": 752
},
{
"epoch": 0.8537414965986394,
"grad_norm": 23.923139572143555,
"learning_rate": 3.04093567251462e-06,
"loss": 1.5039,
"num_tokens": 840615.0,
"step": 753
},
{
"epoch": 0.854875283446712,
"grad_norm": 20.571720123291016,
"learning_rate": 3.017543859649123e-06,
"loss": 1.5064,
"num_tokens": 841800.0,
"step": 754
},
|
{
"epoch": 0.8560090702947846,
"grad_norm": 22.663013458251953,
"learning_rate": 2.994152046783626e-06,
"loss": 1.5114,
"num_tokens": 842943.0,
"step": 755
},
{
"epoch": 0.8571428571428571,
"grad_norm": 22.52815055847168,
"learning_rate": 2.970760233918129e-06,
"loss": 1.5805,
"num_tokens": 844094.0,
"step": 756
},
{
"epoch": 0.8582766439909297,
"grad_norm": 22.185199737548828,
"learning_rate": 2.9473684210526317e-06,
"loss": 1.4734,
"num_tokens": 845227.0,
"step": 757
},
{
"epoch": 0.8594104308390023,
"grad_norm": 21.498750686645508,
"learning_rate": 2.9239766081871347e-06,
"loss": 1.3279,
"num_tokens": 846379.0,
"step": 758
},
{
"epoch": 0.8605442176870748,
"grad_norm": 22.456575393676758,
"learning_rate": 2.9005847953216377e-06,
"loss": 1.3714,
"num_tokens": 847478.0,
"step": 759
},
{
"epoch": 0.8616780045351474,
"grad_norm": 25.112070083618164,
"learning_rate": 2.8771929824561407e-06,
"loss": 1.5006,
"num_tokens": 848561.0,
"step": 760
},
{
"epoch": 0.86281179138322,
"grad_norm": 21.80204963684082,
"learning_rate": 2.8538011695906437e-06,
"loss": 1.4335,
"num_tokens": 849697.0,
"step": 761
},
{
"epoch": 0.8639455782312925,
"grad_norm": 23.306922912597656,
"learning_rate": 2.8304093567251463e-06,
"loss": 1.4661,
"num_tokens": 850827.0,
"step": 762
},
{
"epoch": 0.8650793650793651,
"grad_norm": 22.614166259765625,
"learning_rate": 2.8070175438596493e-06,
"loss": 1.4819,
"num_tokens": 851967.0,
"step": 763
},
{
"epoch": 0.8662131519274376,
"grad_norm": 22.74590301513672,
"learning_rate": 2.7836257309941523e-06,
"loss": 1.4953,
"num_tokens": 853111.0,
"step": 764
},
{
"epoch": 0.8673469387755102,
"grad_norm": 22.92290687561035,
"learning_rate": 2.7602339181286553e-06,
"loss": 1.4969,
"num_tokens": 854225.0,
"step": 765
},
{
"epoch": 0.8684807256235828,
"grad_norm": 21.96657371520996,
"learning_rate": 2.7368421052631583e-06,
"loss": 1.5487,
"num_tokens": 855400.0,
"step": 766
},
{
"epoch": 0.8696145124716553,
"grad_norm": 22.646835327148438,
"learning_rate": 2.713450292397661e-06,
"loss": 1.391,
"num_tokens": 856571.0,
"step": 767
},
{
"epoch": 0.8707482993197279,
"grad_norm": 22.9639892578125,
"learning_rate": 2.690058479532164e-06,
"loss": 1.4171,
"num_tokens": 857782.0,
"step": 768
},
{
"epoch": 0.8718820861678005,
"grad_norm": 23.083837509155273,
"learning_rate": 2.666666666666667e-06,
"loss": 1.5668,
"num_tokens": 858959.0,
"step": 769
},
{
"epoch": 0.873015873015873,
"grad_norm": 22.602506637573242,
"learning_rate": 2.64327485380117e-06,
"loss": 1.4862,
"num_tokens": 860124.0,
"step": 770
},
{
"epoch": 0.8741496598639455,
"grad_norm": 23.74089241027832,
"learning_rate": 2.619883040935673e-06,
"loss": 1.4123,
"num_tokens": 861312.0,
"step": 771
},
{
"epoch": 0.8752834467120182,
"grad_norm": 22.073772430419922,
"learning_rate": 2.5964912280701754e-06,
"loss": 1.4083,
"num_tokens": 862379.0,
"step": 772
},
{
"epoch": 0.8764172335600907,
"grad_norm": 23.299543380737305,
"learning_rate": 2.5730994152046784e-06,
"loss": 1.3877,
"num_tokens": 863526.0,
"step": 773
},
{
"epoch": 0.8775510204081632,
"grad_norm": 23.912397384643555,
"learning_rate": 2.5497076023391814e-06,
"loss": 1.6234,
"num_tokens": 864663.0,
"step": 774
},
{
"epoch": 0.8786848072562359,
"grad_norm": 24.13743019104004,
"learning_rate": 2.5263157894736844e-06,
"loss": 1.6174,
"num_tokens": 865755.0,
"step": 775
},
{
"epoch": 0.8798185941043084,
"grad_norm": 23.58803939819336,
"learning_rate": 2.5029239766081874e-06,
"loss": 1.5354,
"num_tokens": 866867.0,
"step": 776
},
{
"epoch": 0.8809523809523809,
"grad_norm": 22.72237205505371,
"learning_rate": 2.4795321637426904e-06,
"loss": 1.4695,
"num_tokens": 867947.0,
"step": 777
},
{
"epoch": 0.8820861678004536,
"grad_norm": 23.858057022094727,
"learning_rate": 2.456140350877193e-06,
"loss": 1.4492,
"num_tokens": 869047.0,
"step": 778
},
{
"epoch": 0.8832199546485261,
"grad_norm": 21.552154541015625,
"learning_rate": 2.432748538011696e-06,
"loss": 1.4438,
"num_tokens": 870112.0,
"step": 779
},
{
"epoch": 0.8843537414965986,
"grad_norm": 24.152172088623047,
"learning_rate": 2.409356725146199e-06,
"loss": 1.4744,
"num_tokens": 871206.0,
"step": 780
},
{
"epoch": 0.8854875283446711,
"grad_norm": 22.032583236694336,
"learning_rate": 2.385964912280702e-06,
"loss": 1.3994,
"num_tokens": 872313.0,
"step": 781
},
{
"epoch": 0.8866213151927438,
"grad_norm": 22.46315574645996,
"learning_rate": 2.362573099415205e-06,
"loss": 1.4018,
"num_tokens": 873407.0,
"step": 782
},
{
"epoch": 0.8877551020408163,
"grad_norm": 22.419082641601562,
"learning_rate": 2.3391812865497075e-06,
"loss": 1.3361,
"num_tokens": 874507.0,
"step": 783
},
{
"epoch": 0.8888888888888888,
"grad_norm": 22.973846435546875,
"learning_rate": 2.3157894736842105e-06,
"loss": 1.5251,
"num_tokens": 875594.0,
"step": 784
},
{
"epoch": 0.8900226757369615,
"grad_norm": 21.73171043395996,
"learning_rate": 2.292397660818714e-06,
"loss": 1.4877,
"num_tokens": 876759.0,
"step": 785
},
{
"epoch": 0.891156462585034,
"grad_norm": 22.867074966430664,
"learning_rate": 2.2690058479532165e-06,
"loss": 1.4999,
"num_tokens": 877790.0,
"step": 786
},
{
"epoch": 0.8922902494331065,
"grad_norm": 23.208694458007812,
"learning_rate": 2.2456140350877195e-06,
"loss": 1.4502,
"num_tokens": 878974.0,
"step": 787
},
{
"epoch": 0.8934240362811792,
"grad_norm": 23.5717830657959,
"learning_rate": 2.222222222222222e-06,
"loss": 1.3898,
"num_tokens": 880052.0,
"step": 788
},
{
"epoch": 0.8945578231292517,
"grad_norm": 23.512678146362305,
"learning_rate": 2.1988304093567255e-06,
"loss": 1.4121,
"num_tokens": 881239.0,
"step": 789
},
{
"epoch": 0.8956916099773242,
"grad_norm": 23.14679527282715,
"learning_rate": 2.1754385964912285e-06,
"loss": 1.4851,
"num_tokens": 882281.0,
"step": 790
},
{
"epoch": 0.8968253968253969,
"grad_norm": 23.918848037719727,
"learning_rate": 2.152046783625731e-06,
"loss": 1.4963,
"num_tokens": 883346.0,
"step": 791
},
{
"epoch": 0.8979591836734694,
"grad_norm": 26.56317710876465,
"learning_rate": 2.128654970760234e-06,
"loss": 1.5296,
"num_tokens": 884352.0,
"step": 792
},
{
"epoch": 0.8990929705215419,
"grad_norm": 22.81806182861328,
"learning_rate": 2.105263157894737e-06,
"loss": 1.5657,
"num_tokens": 885502.0,
"step": 793
},
{
"epoch": 0.9002267573696145,
"grad_norm": 23.83610725402832,
"learning_rate": 2.08187134502924e-06,
"loss": 1.5682,
"num_tokens": 886626.0,
"step": 794
},
{
"epoch": 0.9013605442176871,
"grad_norm": 23.975093841552734,
"learning_rate": 2.058479532163743e-06,
"loss": 1.527,
"num_tokens": 887757.0,
"step": 795
},
{
"epoch": 0.9024943310657596,
"grad_norm": 21.868783950805664,
"learning_rate": 2.0350877192982456e-06,
"loss": 1.4765,
"num_tokens": 888939.0,
"step": 796
},
{
"epoch": 0.9036281179138322,
"grad_norm": 22.76094627380371,
"learning_rate": 2.0116959064327486e-06,
"loss": 1.3386,
"num_tokens": 890157.0,
"step": 797
},
{
"epoch": 0.9047619047619048,
"grad_norm": 23.086637496948242,
"learning_rate": 1.9883040935672516e-06,
"loss": 1.6715,
"num_tokens": 891398.0,
"step": 798
},
{
"epoch": 0.9058956916099773,
"grad_norm": 24.359127044677734,
"learning_rate": 1.9649122807017546e-06,
"loss": 1.3054,
"num_tokens": 892408.0,
"step": 799
},
|
{
"epoch": 0.9070294784580499,
"grad_norm": 25.446590423583984,
"learning_rate": 1.9415204678362576e-06,
"loss": 1.4722,
"num_tokens": 893592.0,
"step": 800
},
{
"epoch": 0.9081632653061225,
"grad_norm": 23.30593490600586,
"learning_rate": 1.91812865497076e-06,
"loss": 1.4158,
"num_tokens": 894697.0,
"step": 801
},
{
"epoch": 0.909297052154195,
"grad_norm": 21.06711769104004,
"learning_rate": 1.8947368421052634e-06,
"loss": 1.592,
"num_tokens": 895818.0,
"step": 802
},
{
"epoch": 0.9104308390022676,
"grad_norm": 23.522247314453125,
"learning_rate": 1.8713450292397662e-06,
"loss": 1.5568,
"num_tokens": 896885.0,
"step": 803
},
{
"epoch": 0.9115646258503401,
"grad_norm": 22.43680763244629,
"learning_rate": 1.8479532163742692e-06,
"loss": 1.3984,
"num_tokens": 898032.0,
"step": 804
},
{
"epoch": 0.9126984126984127,
"grad_norm": 22.576719284057617,
"learning_rate": 1.8245614035087722e-06,
"loss": 1.4604,
"num_tokens": 899163.0,
"step": 805
},
{
"epoch": 0.9138321995464853,
"grad_norm": 22.583637237548828,
"learning_rate": 1.801169590643275e-06,
"loss": 1.4348,
"num_tokens": 900304.0,
"step": 806
},
{
"epoch": 0.9149659863945578,
"grad_norm": 21.52495574951172,
"learning_rate": 1.777777777777778e-06,
"loss": 1.364,
"num_tokens": 901464.0,
"step": 807
},
{
"epoch": 0.9160997732426304,
"grad_norm": 22.536762237548828,
"learning_rate": 1.7543859649122807e-06,
"loss": 1.5089,
"num_tokens": 902652.0,
"step": 808
},
{
"epoch": 0.9172335600907029,
"grad_norm": 22.26384162902832,
"learning_rate": 1.7309941520467837e-06,
"loss": 1.4325,
"num_tokens": 903725.0,
"step": 809
},
{
"epoch": 0.9183673469387755,
"grad_norm": 20.824234008789062,
"learning_rate": 1.7076023391812867e-06,
"loss": 1.396,
"num_tokens": 904819.0,
"step": 810
},
{
"epoch": 0.9195011337868481,
"grad_norm": 21.5470027923584,
"learning_rate": 1.6842105263157895e-06,
"loss": 1.4437,
"num_tokens": 905994.0,
"step": 811
},
{
"epoch": 0.9206349206349206,
"grad_norm": 22.454166412353516,
"learning_rate": 1.6608187134502925e-06,
"loss": 1.4207,
"num_tokens": 907165.0,
"step": 812
},
{
"epoch": 0.9217687074829932,
"grad_norm": 22.63362693786621,
"learning_rate": 1.6374269005847953e-06,
"loss": 1.5246,
"num_tokens": 908315.0,
"step": 813
},
{
"epoch": 0.9229024943310657,
"grad_norm": 25.17198371887207,
"learning_rate": 1.6140350877192983e-06,
"loss": 1.527,
"num_tokens": 909504.0,
"step": 814
},
{
"epoch": 0.9240362811791383,
"grad_norm": 25.855079650878906,
"learning_rate": 1.5906432748538015e-06,
"loss": 1.6569,
"num_tokens": 910579.0,
"step": 815
},
{
"epoch": 0.9251700680272109,
"grad_norm": 22.92259979248047,
"learning_rate": 1.5672514619883043e-06,
"loss": 1.4674,
"num_tokens": 911639.0,
"step": 816
},
{
"epoch": 0.9263038548752834,
"grad_norm": 24.303924560546875,
"learning_rate": 1.5438596491228073e-06,
"loss": 1.5105,
"num_tokens": 912701.0,
"step": 817
},
{
"epoch": 0.927437641723356,
"grad_norm": 21.863985061645508,
"learning_rate": 1.52046783625731e-06,
"loss": 1.6027,
"num_tokens": 913766.0,
"step": 818
},
{
"epoch": 0.9285714285714286,
"grad_norm": 24.20486068725586,
"learning_rate": 1.497076023391813e-06,
"loss": 1.5517,
"num_tokens": 914798.0,
"step": 819
},
{
"epoch": 0.9297052154195011,
"grad_norm": 24.408193588256836,
"learning_rate": 1.4736842105263159e-06,
"loss": 1.4336,
"num_tokens": 915953.0,
"step": 820
},
{
"epoch": 0.9308390022675737,
"grad_norm": 22.91715431213379,
"learning_rate": 1.4502923976608189e-06,
"loss": 1.3284,
"num_tokens": 917153.0,
"step": 821
},
{
"epoch": 0.9319727891156463,
"grad_norm": 23.430063247680664,
"learning_rate": 1.4269005847953219e-06,
"loss": 1.5508,
"num_tokens": 918239.0,
"step": 822
},
{
"epoch": 0.9331065759637188,
"grad_norm": 24.82691764831543,
"learning_rate": 1.4035087719298246e-06,
"loss": 1.4022,
"num_tokens": 919263.0,
"step": 823
},
{
"epoch": 0.9342403628117913,
"grad_norm": 21.78477668762207,
"learning_rate": 1.3801169590643276e-06,
"loss": 1.4777,
"num_tokens": 920398.0,
"step": 824
},
{
"epoch": 0.935374149659864,
"grad_norm": 23.606664657592773,
"learning_rate": 1.3567251461988304e-06,
"loss": 1.479,
"num_tokens": 921477.0,
"step": 825
},
{
"epoch": 0.9365079365079365,
"grad_norm": 22.557498931884766,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.5736,
"num_tokens": 922580.0,
"step": 826
},
{
"epoch": 0.937641723356009,
"grad_norm": 22.220712661743164,
"learning_rate": 1.3099415204678364e-06,
"loss": 1.4134,
"num_tokens": 923731.0,
"step": 827
},
{
"epoch": 0.9387755102040817,
"grad_norm": 20.02456283569336,
"learning_rate": 1.2865497076023392e-06,
"loss": 1.4506,
"num_tokens": 924867.0,
"step": 828
},
{
"epoch": 0.9399092970521542,
"grad_norm": 23.332258224487305,
"learning_rate": 1.2631578947368422e-06,
"loss": 1.3667,
"num_tokens": 925946.0,
"step": 829
},
{
"epoch": 0.9410430839002267,
"grad_norm": 23.033876419067383,
"learning_rate": 1.2397660818713452e-06,
"loss": 1.4524,
"num_tokens": 927211.0,
"step": 830
},
{
"epoch": 0.9421768707482994,
"grad_norm": 21.960102081298828,
"learning_rate": 1.216374269005848e-06,
"loss": 1.4102,
"num_tokens": 928388.0,
"step": 831
},
{
"epoch": 0.9433106575963719,
"grad_norm": 22.837690353393555,
"learning_rate": 1.192982456140351e-06,
"loss": 1.4648,
"num_tokens": 929505.0,
"step": 832
},
{
"epoch": 0.9444444444444444,
"grad_norm": 26.00808334350586,
"learning_rate": 1.1695906432748538e-06,
"loss": 1.3195,
"num_tokens": 930491.0,
"step": 833
},
{
"epoch": 0.9455782312925171,
"grad_norm": 22.8687744140625,
"learning_rate": 1.146198830409357e-06,
"loss": 1.4152,
"num_tokens": 931679.0,
"step": 834
},
{
"epoch": 0.9467120181405896,
"grad_norm": 23.170955657958984,
"learning_rate": 1.1228070175438598e-06,
"loss": 1.3588,
"num_tokens": 932826.0,
"step": 835
},
{
"epoch": 0.9478458049886621,
"grad_norm": 21.579723358154297,
"learning_rate": 1.0994152046783627e-06,
"loss": 1.4296,
"num_tokens": 933959.0,
"step": 836
},
{
"epoch": 0.9489795918367347,
"grad_norm": 22.338029861450195,
"learning_rate": 1.0760233918128655e-06,
"loss": 1.5036,
"num_tokens": 935084.0,
"step": 837
},
{
"epoch": 0.9501133786848073,
"grad_norm": 24.644001007080078,
"learning_rate": 1.0526315789473685e-06,
"loss": 1.5511,
"num_tokens": 936207.0,
"step": 838
},
{
"epoch": 0.9512471655328798,
"grad_norm": 20.820331573486328,
"learning_rate": 1.0292397660818715e-06,
"loss": 1.4014,
"num_tokens": 937398.0,
"step": 839
},
{
"epoch": 0.9523809523809523,
"grad_norm": 22.744985580444336,
"learning_rate": 1.0058479532163743e-06,
"loss": 1.5348,
"num_tokens": 938566.0,
"step": 840
},
{
"epoch": 0.953514739229025,
"grad_norm": 21.92036247253418,
"learning_rate": 9.824561403508773e-07,
"loss": 1.5325,
"num_tokens": 939691.0,
"step": 841
},
{
"epoch": 0.9546485260770975,
"grad_norm": 21.75764274597168,
"learning_rate": 9.5906432748538e-07,
"loss": 1.468,
"num_tokens": 940827.0,
"step": 842
},
{
"epoch": 0.95578231292517,
"grad_norm": 22.83157730102539,
"learning_rate": 9.356725146198831e-07,
"loss": 1.4777,
"num_tokens": 941874.0,
"step": 843
},
{
"epoch": 0.9569160997732427,
"grad_norm": 24.487289428710938,
"learning_rate": 9.122807017543861e-07,
"loss": 1.2818,
"num_tokens": 942976.0,
"step": 844
},
|
{
"epoch": 0.9580498866213152,
"grad_norm": 22.184720993041992,
"learning_rate": 8.88888888888889e-07,
"loss": 1.4944,
"num_tokens": 944151.0,
"step": 845
},
{
"epoch": 0.9591836734693877,
"grad_norm": 24.632143020629883,
"learning_rate": 8.654970760233919e-07,
"loss": 1.4877,
"num_tokens": 945188.0,
"step": 846
},
{
"epoch": 0.9603174603174603,
"grad_norm": 23.738849639892578,
"learning_rate": 8.421052631578948e-07,
"loss": 1.5023,
"num_tokens": 946295.0,
"step": 847
},
{
"epoch": 0.9614512471655329,
"grad_norm": 23.87755012512207,
"learning_rate": 8.187134502923977e-07,
"loss": 1.5317,
"num_tokens": 947340.0,
"step": 848
},
{
"epoch": 0.9625850340136054,
"grad_norm": 23.480918884277344,
"learning_rate": 7.953216374269008e-07,
"loss": 1.4858,
"num_tokens": 948481.0,
"step": 849
},
{
"epoch": 0.963718820861678,
"grad_norm": 24.38697052001953,
"learning_rate": 7.719298245614036e-07,
"loss": 1.3701,
"num_tokens": 949547.0,
"step": 850
},
{
"epoch": 0.9648526077097506,
"grad_norm": 23.775096893310547,
"learning_rate": 7.485380116959065e-07,
"loss": 1.4726,
"num_tokens": 950588.0,
"step": 851
},
{
"epoch": 0.9659863945578231,
"grad_norm": 22.794851303100586,
"learning_rate": 7.251461988304094e-07,
"loss": 1.5002,
"num_tokens": 951774.0,
"step": 852
},
{
"epoch": 0.9671201814058957,
"grad_norm": 21.191730499267578,
"learning_rate": 7.017543859649123e-07,
"loss": 1.4518,
"num_tokens": 952871.0,
"step": 853
},
{
"epoch": 0.9682539682539683,
"grad_norm": 21.24921417236328,
"learning_rate": 6.783625730994152e-07,
"loss": 1.3679,
"num_tokens": 954031.0,
"step": 854
},
{
"epoch": 0.9693877551020408,
"grad_norm": 23.635141372680664,
"learning_rate": 6.549707602339182e-07,
"loss": 1.3911,
"num_tokens": 955054.0,
"step": 855
},
{
"epoch": 0.9705215419501134,
"grad_norm": 23.66080665588379,
"learning_rate": 6.315789473684211e-07,
"loss": 1.3363,
"num_tokens": 956174.0,
"step": 856
},
{
"epoch": 0.971655328798186,
"grad_norm": 23.329038619995117,
"learning_rate": 6.08187134502924e-07,
"loss": 1.514,
"num_tokens": 957337.0,
"step": 857
},
{
"epoch": 0.9727891156462585,
"grad_norm": 22.854223251342773,
"learning_rate": 5.847953216374269e-07,
"loss": 1.4398,
"num_tokens": 958396.0,
"step": 858
},
{
"epoch": 0.9739229024943311,
"grad_norm": 22.29954719543457,
"learning_rate": 5.614035087719299e-07,
"loss": 1.5289,
"num_tokens": 959510.0,
"step": 859
},
{
"epoch": 0.9750566893424036,
"grad_norm": 23.53312873840332,
"learning_rate": 5.380116959064328e-07,
"loss": 1.4513,
"num_tokens": 960614.0,
"step": 860
},
{
"epoch": 0.9761904761904762,
"grad_norm": 21.8758602142334,
"learning_rate": 5.146198830409358e-07,
"loss": 1.422,
"num_tokens": 961707.0,
"step": 861
},
{
"epoch": 0.9773242630385488,
"grad_norm": 23.302278518676758,
"learning_rate": 4.912280701754387e-07,
"loss": 1.4902,
"num_tokens": 962890.0,
"step": 862
},
{
"epoch": 0.9784580498866213,
"grad_norm": 23.379724502563477,
"learning_rate": 4.6783625730994155e-07,
"loss": 1.4964,
"num_tokens": 964107.0,
"step": 863
},
{
"epoch": 0.9795918367346939,
"grad_norm": 23.007959365844727,
"learning_rate": 4.444444444444445e-07,
"loss": 1.4419,
"num_tokens": 965201.0,
"step": 864
},
{
"epoch": 0.9807256235827665,
"grad_norm": 24.184219360351562,
"learning_rate": 4.210526315789474e-07,
"loss": 1.3729,
"num_tokens": 966250.0,
"step": 865
},
{
"epoch": 0.981859410430839,
"grad_norm": 22.903079986572266,
"learning_rate": 3.976608187134504e-07,
"loss": 1.4376,
"num_tokens": 967412.0,
"step": 866
},
{
"epoch": 0.9829931972789115,
"grad_norm": 21.505783081054688,
"learning_rate": 3.7426900584795327e-07,
"loss": 1.4338,
"num_tokens": 968670.0,
"step": 867
},
{
"epoch": 0.9841269841269841,
"grad_norm": 24.10959243774414,
"learning_rate": 3.5087719298245616e-07,
"loss": 1.3635,
"num_tokens": 969688.0,
"step": 868
},
{
"epoch": 0.9852607709750567,
"grad_norm": 23.527223587036133,
"learning_rate": 3.274853801169591e-07,
"loss": 1.5074,
"num_tokens": 970730.0,
"step": 869
},
{
"epoch": 0.9863945578231292,
"grad_norm": 22.13941764831543,
"learning_rate": 3.04093567251462e-07,
"loss": 1.4279,
"num_tokens": 971851.0,
"step": 870
},
{
"epoch": 0.9875283446712018,
"grad_norm": 23.4210262298584,
"learning_rate": 2.8070175438596494e-07,
"loss": 1.4491,
"num_tokens": 972955.0,
"step": 871
},
{
"epoch": 0.9886621315192744,
"grad_norm": 22.403488159179688,
"learning_rate": 2.573099415204679e-07,
"loss": 1.4925,
"num_tokens": 974119.0,
"step": 872
},
{
"epoch": 0.9897959183673469,
"grad_norm": 22.045902252197266,
"learning_rate": 2.3391812865497077e-07,
"loss": 1.5381,
"num_tokens": 975268.0,
"step": 873
},
{
"epoch": 0.9909297052154195,
"grad_norm": 22.418025970458984,
"learning_rate": 2.105263157894737e-07,
"loss": 1.3585,
"num_tokens": 976366.0,
"step": 874
},
{
"epoch": 0.9920634920634921,
"grad_norm": 24.32988739013672,
"learning_rate": 1.8713450292397663e-07,
"loss": 1.3053,
"num_tokens": 977432.0,
"step": 875
},
{
"epoch": 0.9931972789115646,
"grad_norm": 22.723827362060547,
"learning_rate": 1.6374269005847955e-07,
"loss": 1.461,
"num_tokens": 978586.0,
"step": 876
},
{
"epoch": 0.9943310657596371,
"grad_norm": 22.09205436706543,
"learning_rate": 1.4035087719298247e-07,
"loss": 1.396,
"num_tokens": 979670.0,
"step": 877
},
{
"epoch": 0.9954648526077098,
"grad_norm": 24.673280715942383,
"learning_rate": 1.1695906432748539e-07,
"loss": 1.3698,
"num_tokens": 980667.0,
"step": 878
},
{
"epoch": 0.9965986394557823,
"grad_norm": 22.821020126342773,
"learning_rate": 9.356725146198832e-08,
"loss": 1.4921,
"num_tokens": 981825.0,
"step": 879
},
{
"epoch": 0.9977324263038548,
"grad_norm": 23.99849510192871,
"learning_rate": 7.017543859649123e-08,
"loss": 1.5141,
"num_tokens": 982883.0,
"step": 880
},
{
"epoch": 0.9988662131519275,
"grad_norm": 22.911663055419922,
"learning_rate": 4.678362573099416e-08,
"loss": 1.4862,
"num_tokens": 983976.0,
"step": 881
},
{
"epoch": 1.0,
"grad_norm": 23.910690307617188,
"learning_rate": 2.339181286549708e-08,
"loss": 1.367,
"num_tokens": 984517.0,
"step": 882
},
{
"epoch": 1.0,
"step": 882,
"total_flos": 5752804923670528.0,
"train_loss": 1.5323709486022827,
"train_runtime": 132.5784,
"train_samples_per_second": 53.169,
"train_steps_per_second": 6.653
}
],
"logging_steps": 1,
"max_steps": 882,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5752804923670528.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}