diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15297 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.0, + "eval_steps": 500, + "global_step": 1908, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0031446540880503146, + "grad_norm": 1.9843024015426636, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7260034680366516, + "step": 1 + }, + { + "epoch": 0.006289308176100629, + "grad_norm": 1.9983922243118286, + "learning_rate": 8.333333333333333e-07, + "loss": 0.9796, + "mean_token_accuracy": 0.7287212610244751, + "step": 2 + }, + { + "epoch": 0.009433962264150943, + "grad_norm": 2.008734941482544, + "learning_rate": 1.25e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7293687462806702, + "step": 3 + }, + { + "epoch": 0.012578616352201259, + "grad_norm": 1.9662363529205322, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7301269769668579, + "step": 4 + }, + { + "epoch": 0.015723270440251572, + "grad_norm": 1.9593746662139893, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7250465154647827, + "step": 5 + }, + { + "epoch": 0.018867924528301886, + "grad_norm": 1.769201636314392, + "learning_rate": 2.5e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7267187833786011, + "step": 6 + }, + { + "epoch": 0.0220125786163522, + "grad_norm": 1.6486475467681885, + "learning_rate": 2.916666666666667e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7299320101737976, + "step": 7 + }, + { + "epoch": 0.025157232704402517, + "grad_norm": 1.1724086999893188, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7348803877830505, + "step": 8 + }, + { + "epoch": 0.02830188679245283, + "grad_norm": 1.0883439779281616, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7360632419586182, + "step": 9 + }, + { + "epoch": 0.031446540880503145, + "grad_norm": 1.0831983089447021, + "learning_rate": 4.166666666666667e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7367658019065857, + "step": 10 + }, + { + "epoch": 0.03459119496855346, + "grad_norm": 0.9518892765045166, + "learning_rate": 4.583333333333333e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7373512387275696, + "step": 11 + }, + { + "epoch": 0.03773584905660377, + "grad_norm": 0.9544086456298828, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7343592047691345, + "step": 12 + }, + { + "epoch": 0.040880503144654086, + "grad_norm": 0.8863177299499512, + "learning_rate": 5.416666666666667e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7427428364753723, + "step": 13 + }, + { + "epoch": 0.0440251572327044, + "grad_norm": 0.803634524345398, + "learning_rate": 5.833333333333334e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7462121844291687, + "step": 14 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 0.9318463802337646, + "learning_rate": 6.25e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7500566244125366, + "step": 15 + }, + { + "epoch": 0.050314465408805034, + "grad_norm": 0.888688325881958, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7527286410331726, + "step": 16 + }, + { + "epoch": 0.05345911949685535, + "grad_norm": 0.6901258230209351, + "learning_rate": 7.083333333333335e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7522338032722473, + "step": 17 + }, + { + "epoch": 0.05660377358490566, + "grad_norm": 0.5095185041427612, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7567635774612427, + "step": 18 + }, + { + "epoch": 0.059748427672955975, + "grad_norm": 0.41788381338119507, + "learning_rate": 7.916666666666667e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7630857825279236, + "step": 19 + }, + { + "epoch": 0.06289308176100629, + "grad_norm": 0.49949783086776733, + "learning_rate": 8.333333333333334e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7646651864051819, + "step": 20 + }, + { + "epoch": 0.0660377358490566, + "grad_norm": 0.4411962032318115, + "learning_rate": 8.750000000000001e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7669786810874939, + "step": 21 + }, + { + "epoch": 0.06918238993710692, + "grad_norm": 0.381130188703537, + "learning_rate": 9.166666666666666e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7698013782501221, + "step": 22 + }, + { + "epoch": 0.07232704402515723, + "grad_norm": 0.32305705547332764, + "learning_rate": 9.583333333333335e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7718903422355652, + "step": 23 + }, + { + "epoch": 0.07547169811320754, + "grad_norm": 0.3746313154697418, + "learning_rate": 1e-05, + "loss": 0.6906, + "mean_token_accuracy": 0.7724348306655884, + "step": 24 + }, + { + "epoch": 0.07861635220125786, + "grad_norm": 0.378999263048172, + "learning_rate": 1.0416666666666668e-05, + "loss": 0.6826, + "mean_token_accuracy": 0.7719895243644714, + "step": 25 + }, + { + "epoch": 0.08176100628930817, + "grad_norm": 0.35218483209609985, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.675, + "mean_token_accuracy": 0.7760418057441711, + "step": 26 + }, + { + "epoch": 0.08490566037735849, + "grad_norm": 0.3063074052333832, + "learning_rate": 1.125e-05, + "loss": 0.6707, + "mean_token_accuracy": 0.7769221067428589, + "step": 27 + }, + { + "epoch": 0.0880503144654088, + "grad_norm": 0.3150438070297241, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.6542, + "mean_token_accuracy": 0.7808550000190735, + "step": 28 + }, + { + "epoch": 0.09119496855345911, + "grad_norm": 0.2937377393245697, + "learning_rate": 1.2083333333333333e-05, + "loss": 0.6654, + "mean_token_accuracy": 0.7757168412208557, + "step": 29 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 0.2754563093185425, + "learning_rate": 1.25e-05, + "loss": 0.6553, + "mean_token_accuracy": 0.7805117964744568, + "step": 30 + }, + { + "epoch": 0.09748427672955975, + "grad_norm": 0.2745518386363983, + "learning_rate": 1.2916666666666668e-05, + "loss": 0.6383, + "mean_token_accuracy": 0.7834630012512207, + "step": 31 + }, + { + "epoch": 0.10062893081761007, + "grad_norm": 0.2507205307483673, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.6382, + "mean_token_accuracy": 0.7827038764953613, + "step": 32 + }, + { + "epoch": 0.10377358490566038, + "grad_norm": 0.26597172021865845, + "learning_rate": 1.375e-05, + "loss": 0.644, + "mean_token_accuracy": 0.7815650105476379, + "step": 33 + }, + { + "epoch": 0.1069182389937107, + "grad_norm": 0.2784893810749054, + "learning_rate": 1.416666666666667e-05, + "loss": 0.6175, + "mean_token_accuracy": 0.7878676056861877, + "step": 34 + }, + { + "epoch": 0.11006289308176101, + "grad_norm": 0.2109018713235855, + "learning_rate": 1.4583333333333333e-05, + "loss": 0.6174, + "mean_token_accuracy": 0.7876963019371033, + "step": 35 + }, + { + "epoch": 0.11320754716981132, + "grad_norm": 0.2487485110759735, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.6112, + "mean_token_accuracy": 0.7883297204971313, + "step": 36 + }, + { + "epoch": 0.11635220125786164, + "grad_norm": 0.21007870137691498, + "learning_rate": 1.5416666666666668e-05, + "loss": 0.6243, + "mean_token_accuracy": 0.7878017425537109, + "step": 37 + }, + { + "epoch": 0.11949685534591195, + "grad_norm": 0.19941003620624542, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.6225, + "mean_token_accuracy": 0.7866714000701904, + "step": 38 + }, + { + "epoch": 0.12264150943396226, + "grad_norm": 0.22181154787540436, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.6172, + "mean_token_accuracy": 0.7870483994483948, + "step": 39 + }, + { + "epoch": 0.12578616352201258, + "grad_norm": 0.21963359415531158, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.6145, + "mean_token_accuracy": 0.7886669039726257, + "step": 40 + }, + { + "epoch": 0.1289308176100629, + "grad_norm": 0.19179226458072662, + "learning_rate": 1.7083333333333333e-05, + "loss": 0.6039, + "mean_token_accuracy": 0.7911710739135742, + "step": 41 + }, + { + "epoch": 0.1320754716981132, + "grad_norm": 0.16477134823799133, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.6086, + "mean_token_accuracy": 0.7912865281105042, + "step": 42 + }, + { + "epoch": 0.13522012578616352, + "grad_norm": 0.22841010987758636, + "learning_rate": 1.7916666666666667e-05, + "loss": 0.6055, + "mean_token_accuracy": 0.789507269859314, + "step": 43 + }, + { + "epoch": 0.13836477987421383, + "grad_norm": 0.1824677586555481, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.6083, + "mean_token_accuracy": 0.7901920080184937, + "step": 44 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 0.2477484941482544, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.5883, + "mean_token_accuracy": 0.7944508790969849, + "step": 45 + }, + { + "epoch": 0.14465408805031446, + "grad_norm": 0.21475744247436523, + "learning_rate": 1.916666666666667e-05, + "loss": 0.6063, + "mean_token_accuracy": 0.793271005153656, + "step": 46 + }, + { + "epoch": 0.14779874213836477, + "grad_norm": 0.23494695127010345, + "learning_rate": 1.9583333333333333e-05, + "loss": 0.6035, + "mean_token_accuracy": 0.790216326713562, + "step": 47 + }, + { + "epoch": 0.1509433962264151, + "grad_norm": 0.19835183024406433, + "learning_rate": 2e-05, + "loss": 0.5912, + "mean_token_accuracy": 0.7936276197433472, + "step": 48 + }, + { + "epoch": 0.1540880503144654, + "grad_norm": 0.20389200747013092, + "learning_rate": 2.0416666666666667e-05, + "loss": 0.5883, + "mean_token_accuracy": 0.7947663068771362, + "step": 49 + }, + { + "epoch": 0.15723270440251572, + "grad_norm": 0.25244179368019104, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.593, + "mean_token_accuracy": 0.7952125072479248, + "step": 50 + }, + { + "epoch": 0.16037735849056603, + "grad_norm": 0.18739324808120728, + "learning_rate": 2.125e-05, + "loss": 0.5892, + "mean_token_accuracy": 0.7951387763023376, + "step": 51 + }, + { + "epoch": 0.16352201257861634, + "grad_norm": 0.2114628702402115, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.5838, + "mean_token_accuracy": 0.7957105040550232, + "step": 52 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.2226199507713318, + "learning_rate": 2.2083333333333336e-05, + "loss": 0.5943, + "mean_token_accuracy": 0.7944356799125671, + "step": 53 + }, + { + "epoch": 0.16981132075471697, + "grad_norm": 0.19234661757946014, + "learning_rate": 2.25e-05, + "loss": 0.5899, + "mean_token_accuracy": 0.7960485219955444, + "step": 54 + }, + { + "epoch": 0.17295597484276728, + "grad_norm": 0.25666317343711853, + "learning_rate": 2.2916666666666667e-05, + "loss": 0.5811, + "mean_token_accuracy": 0.7957990169525146, + "step": 55 + }, + { + "epoch": 0.1761006289308176, + "grad_norm": 0.2783811390399933, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.5946, + "mean_token_accuracy": 0.7940381169319153, + "step": 56 + }, + { + "epoch": 0.1792452830188679, + "grad_norm": 0.20457403361797333, + "learning_rate": 2.375e-05, + "loss": 0.5816, + "mean_token_accuracy": 0.7978816628456116, + "step": 57 + }, + { + "epoch": 0.18238993710691823, + "grad_norm": 0.3623892068862915, + "learning_rate": 2.4166666666666667e-05, + "loss": 0.5802, + "mean_token_accuracy": 0.7961991429328918, + "step": 58 + }, + { + "epoch": 0.18553459119496854, + "grad_norm": 0.24155688285827637, + "learning_rate": 2.4583333333333336e-05, + "loss": 0.5683, + "mean_token_accuracy": 0.7976905703544617, + "step": 59 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 0.377756804227829, + "learning_rate": 2.5e-05, + "loss": 0.5762, + "mean_token_accuracy": 0.8002050518989563, + "step": 60 + }, + { + "epoch": 0.1918238993710692, + "grad_norm": 0.3166685104370117, + "learning_rate": 2.5416666666666667e-05, + "loss": 0.584, + "mean_token_accuracy": 0.7962762713432312, + "step": 61 + }, + { + "epoch": 0.1949685534591195, + "grad_norm": 0.29503804445266724, + "learning_rate": 2.5833333333333336e-05, + "loss": 0.5784, + "mean_token_accuracy": 0.7990941405296326, + "step": 62 + }, + { + "epoch": 0.19811320754716982, + "grad_norm": 0.3305257558822632, + "learning_rate": 2.625e-05, + "loss": 0.5846, + "mean_token_accuracy": 0.7966292500495911, + "step": 63 + }, + { + "epoch": 0.20125786163522014, + "grad_norm": 0.24742531776428223, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.5732, + "mean_token_accuracy": 0.7980184555053711, + "step": 64 + }, + { + "epoch": 0.20440251572327045, + "grad_norm": 0.2809283137321472, + "learning_rate": 2.7083333333333335e-05, + "loss": 0.5749, + "mean_token_accuracy": 0.7999182939529419, + "step": 65 + }, + { + "epoch": 0.20754716981132076, + "grad_norm": 0.1994399130344391, + "learning_rate": 2.75e-05, + "loss": 0.5731, + "mean_token_accuracy": 0.7985623478889465, + "step": 66 + }, + { + "epoch": 0.21069182389937108, + "grad_norm": 0.28836458921432495, + "learning_rate": 2.7916666666666666e-05, + "loss": 0.5596, + "mean_token_accuracy": 0.8041678071022034, + "step": 67 + }, + { + "epoch": 0.2138364779874214, + "grad_norm": 0.25330910086631775, + "learning_rate": 2.833333333333334e-05, + "loss": 0.5756, + "mean_token_accuracy": 0.7999730110168457, + "step": 68 + }, + { + "epoch": 0.2169811320754717, + "grad_norm": 0.6575194001197815, + "learning_rate": 2.875e-05, + "loss": 0.5811, + "mean_token_accuracy": 0.7950687408447266, + "step": 69 + }, + { + "epoch": 0.22012578616352202, + "grad_norm": 0.318123996257782, + "learning_rate": 2.9166666666666666e-05, + "loss": 0.5735, + "mean_token_accuracy": 0.7979779243469238, + "step": 70 + }, + { + "epoch": 0.22327044025157233, + "grad_norm": 0.21200031042099, + "learning_rate": 2.958333333333334e-05, + "loss": 0.5595, + "mean_token_accuracy": 0.800320029258728, + "step": 71 + }, + { + "epoch": 0.22641509433962265, + "grad_norm": 0.3333161771297455, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.5587, + "mean_token_accuracy": 0.8015144467353821, + "step": 72 + }, + { + "epoch": 0.22955974842767296, + "grad_norm": 0.2528008818626404, + "learning_rate": 3.0416666666666666e-05, + "loss": 0.5552, + "mean_token_accuracy": 0.8024976253509521, + "step": 73 + }, + { + "epoch": 0.23270440251572327, + "grad_norm": 0.29139530658721924, + "learning_rate": 3.0833333333333335e-05, + "loss": 0.5737, + "mean_token_accuracy": 0.7972533106803894, + "step": 74 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 0.31739315390586853, + "learning_rate": 3.125e-05, + "loss": 0.5744, + "mean_token_accuracy": 0.7987572550773621, + "step": 75 + }, + { + "epoch": 0.2389937106918239, + "grad_norm": 0.20459042489528656, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.5678, + "mean_token_accuracy": 0.7998490929603577, + "step": 76 + }, + { + "epoch": 0.24213836477987422, + "grad_norm": 0.3210413455963135, + "learning_rate": 3.208333333333334e-05, + "loss": 0.5679, + "mean_token_accuracy": 0.8000871539115906, + "step": 77 + }, + { + "epoch": 0.24528301886792453, + "grad_norm": 0.2642245292663574, + "learning_rate": 3.2500000000000004e-05, + "loss": 0.5688, + "mean_token_accuracy": 0.8012592792510986, + "step": 78 + }, + { + "epoch": 0.24842767295597484, + "grad_norm": 0.3026284873485565, + "learning_rate": 3.291666666666667e-05, + "loss": 0.5601, + "mean_token_accuracy": 0.802608847618103, + "step": 79 + }, + { + "epoch": 0.25157232704402516, + "grad_norm": 0.2567735016345978, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.5488, + "mean_token_accuracy": 0.8038848638534546, + "step": 80 + }, + { + "epoch": 0.25471698113207547, + "grad_norm": 0.26682281494140625, + "learning_rate": 3.375e-05, + "loss": 0.5601, + "mean_token_accuracy": 0.801401674747467, + "step": 81 + }, + { + "epoch": 0.2578616352201258, + "grad_norm": 0.30776286125183105, + "learning_rate": 3.4166666666666666e-05, + "loss": 0.5619, + "mean_token_accuracy": 0.802801251411438, + "step": 82 + }, + { + "epoch": 0.2610062893081761, + "grad_norm": 0.2427256852388382, + "learning_rate": 3.458333333333334e-05, + "loss": 0.5563, + "mean_token_accuracy": 0.80390864610672, + "step": 83 + }, + { + "epoch": 0.2641509433962264, + "grad_norm": 0.2617015838623047, + "learning_rate": 3.5000000000000004e-05, + "loss": 0.5626, + "mean_token_accuracy": 0.8013002276420593, + "step": 84 + }, + { + "epoch": 0.2672955974842767, + "grad_norm": 0.34007972478866577, + "learning_rate": 3.541666666666667e-05, + "loss": 0.5618, + "mean_token_accuracy": 0.8023524880409241, + "step": 85 + }, + { + "epoch": 0.27044025157232704, + "grad_norm": 0.22972999513149261, + "learning_rate": 3.5833333333333335e-05, + "loss": 0.5605, + "mean_token_accuracy": 0.8025376796722412, + "step": 86 + }, + { + "epoch": 0.27358490566037735, + "grad_norm": 0.37038788199424744, + "learning_rate": 3.625e-05, + "loss": 0.561, + "mean_token_accuracy": 0.8009118437767029, + "step": 87 + }, + { + "epoch": 0.27672955974842767, + "grad_norm": 0.23523087799549103, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.5531, + "mean_token_accuracy": 0.8030558228492737, + "step": 88 + }, + { + "epoch": 0.279874213836478, + "grad_norm": 0.3976694941520691, + "learning_rate": 3.708333333333334e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.8050774335861206, + "step": 89 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 0.2722809314727783, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.5506, + "mean_token_accuracy": 0.8029981255531311, + "step": 90 + }, + { + "epoch": 0.2861635220125786, + "grad_norm": 0.4583202004432678, + "learning_rate": 3.791666666666667e-05, + "loss": 0.5542, + "mean_token_accuracy": 0.8037380576133728, + "step": 91 + }, + { + "epoch": 0.2893081761006289, + "grad_norm": 0.4009253978729248, + "learning_rate": 3.833333333333334e-05, + "loss": 0.5612, + "mean_token_accuracy": 0.7998917698860168, + "step": 92 + }, + { + "epoch": 0.29245283018867924, + "grad_norm": 0.320212721824646, + "learning_rate": 3.875e-05, + "loss": 0.5554, + "mean_token_accuracy": 0.8016734719276428, + "step": 93 + }, + { + "epoch": 0.29559748427672955, + "grad_norm": 0.3593039810657501, + "learning_rate": 3.9166666666666665e-05, + "loss": 0.5599, + "mean_token_accuracy": 0.8014413714408875, + "step": 94 + }, + { + "epoch": 0.29874213836477986, + "grad_norm": 0.2468472719192505, + "learning_rate": 3.958333333333334e-05, + "loss": 0.5611, + "mean_token_accuracy": 0.8008609414100647, + "step": 95 + }, + { + "epoch": 0.3018867924528302, + "grad_norm": 0.4157026410102844, + "learning_rate": 4e-05, + "loss": 0.5603, + "mean_token_accuracy": 0.801600456237793, + "step": 96 + }, + { + "epoch": 0.3050314465408805, + "grad_norm": 0.2701054513454437, + "learning_rate": 3.9999990660718234e-05, + "loss": 0.5551, + "mean_token_accuracy": 0.8028652667999268, + "step": 97 + }, + { + "epoch": 0.3081761006289308, + "grad_norm": 0.38107001781463623, + "learning_rate": 3.999996264288261e-05, + "loss": 0.5554, + "mean_token_accuracy": 0.8040093183517456, + "step": 98 + }, + { + "epoch": 0.3113207547169811, + "grad_norm": 0.2702213525772095, + "learning_rate": 3.999991594652221e-05, + "loss": 0.5479, + "mean_token_accuracy": 0.8046407699584961, + "step": 99 + }, + { + "epoch": 0.31446540880503143, + "grad_norm": 0.274862676858902, + "learning_rate": 3.999985057168549e-05, + "loss": 0.5433, + "mean_token_accuracy": 0.8047059774398804, + "step": 100 + }, + { + "epoch": 0.31761006289308175, + "grad_norm": 0.2879992723464966, + "learning_rate": 3.999976651844029e-05, + "loss": 0.5565, + "mean_token_accuracy": 0.8013796210289001, + "step": 101 + }, + { + "epoch": 0.32075471698113206, + "grad_norm": 0.3317185342311859, + "learning_rate": 3.9999663786873816e-05, + "loss": 0.5556, + "mean_token_accuracy": 0.8010111451148987, + "step": 102 + }, + { + "epoch": 0.3238993710691824, + "grad_norm": 0.2893899083137512, + "learning_rate": 3.99995423770927e-05, + "loss": 0.5392, + "mean_token_accuracy": 0.8065787553787231, + "step": 103 + }, + { + "epoch": 0.3270440251572327, + "grad_norm": 0.27327823638916016, + "learning_rate": 3.999940228922291e-05, + "loss": 0.5418, + "mean_token_accuracy": 0.8071911334991455, + "step": 104 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 0.301395058631897, + "learning_rate": 3.9999243523409826e-05, + "loss": 0.5467, + "mean_token_accuracy": 0.8041232228279114, + "step": 105 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.2282310575246811, + "learning_rate": 3.999906607981819e-05, + "loss": 0.5353, + "mean_token_accuracy": 0.8070929050445557, + "step": 106 + }, + { + "epoch": 0.33647798742138363, + "grad_norm": 0.27638840675354004, + "learning_rate": 3.999886995863214e-05, + "loss": 0.5448, + "mean_token_accuracy": 0.8050588369369507, + "step": 107 + }, + { + "epoch": 0.33962264150943394, + "grad_norm": 0.257600873708725, + "learning_rate": 3.999865516005519e-05, + "loss": 0.5502, + "mean_token_accuracy": 0.8030370473861694, + "step": 108 + }, + { + "epoch": 0.34276729559748426, + "grad_norm": 0.23493166267871857, + "learning_rate": 3.999842168431023e-05, + "loss": 0.5461, + "mean_token_accuracy": 0.8061801195144653, + "step": 109 + }, + { + "epoch": 0.34591194968553457, + "grad_norm": 0.33576369285583496, + "learning_rate": 3.9998169531639546e-05, + "loss": 0.5515, + "mean_token_accuracy": 0.8048064708709717, + "step": 110 + }, + { + "epoch": 0.3490566037735849, + "grad_norm": 0.28176188468933105, + "learning_rate": 3.999789870230479e-05, + "loss": 0.551, + "mean_token_accuracy": 0.8034514784812927, + "step": 111 + }, + { + "epoch": 0.3522012578616352, + "grad_norm": 0.27955150604248047, + "learning_rate": 3.9997609196587005e-05, + "loss": 0.5475, + "mean_token_accuracy": 0.803336501121521, + "step": 112 + }, + { + "epoch": 0.3553459119496855, + "grad_norm": 0.2750742435455322, + "learning_rate": 3.999730101478661e-05, + "loss": 0.5487, + "mean_token_accuracy": 0.8029544353485107, + "step": 113 + }, + { + "epoch": 0.3584905660377358, + "grad_norm": 0.27893856167793274, + "learning_rate": 3.9996974157223395e-05, + "loss": 0.5481, + "mean_token_accuracy": 0.8042543530464172, + "step": 114 + }, + { + "epoch": 0.36163522012578614, + "grad_norm": 0.3115741014480591, + "learning_rate": 3.9996628624236556e-05, + "loss": 0.5463, + "mean_token_accuracy": 0.8052036762237549, + "step": 115 + }, + { + "epoch": 0.36477987421383645, + "grad_norm": 0.21932333707809448, + "learning_rate": 3.999626441618464e-05, + "loss": 0.5362, + "mean_token_accuracy": 0.8065166473388672, + "step": 116 + }, + { + "epoch": 0.36792452830188677, + "grad_norm": 0.3026050329208374, + "learning_rate": 3.999588153344559e-05, + "loss": 0.5307, + "mean_token_accuracy": 0.8093136548995972, + "step": 117 + }, + { + "epoch": 0.3710691823899371, + "grad_norm": 0.3253287076950073, + "learning_rate": 3.9995479976416725e-05, + "loss": 0.5503, + "mean_token_accuracy": 0.8045260310173035, + "step": 118 + }, + { + "epoch": 0.3742138364779874, + "grad_norm": 0.30211326479911804, + "learning_rate": 3.999505974551473e-05, + "loss": 0.5492, + "mean_token_accuracy": 0.8044459223747253, + "step": 119 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.2566331923007965, + "learning_rate": 3.9994620841175694e-05, + "loss": 0.5533, + "mean_token_accuracy": 0.8031808733940125, + "step": 120 + }, + { + "epoch": 0.3805031446540881, + "grad_norm": 0.32716792821884155, + "learning_rate": 3.999416326385505e-05, + "loss": 0.5421, + "mean_token_accuracy": 0.8031218647956848, + "step": 121 + }, + { + "epoch": 0.3836477987421384, + "grad_norm": 0.2625165581703186, + "learning_rate": 3.999368701402763e-05, + "loss": 0.5465, + "mean_token_accuracy": 0.8032469749450684, + "step": 122 + }, + { + "epoch": 0.3867924528301887, + "grad_norm": 0.334791362285614, + "learning_rate": 3.9993192092187644e-05, + "loss": 0.5469, + "mean_token_accuracy": 0.8045857548713684, + "step": 123 + }, + { + "epoch": 0.389937106918239, + "grad_norm": 0.28799203038215637, + "learning_rate": 3.9992678498848664e-05, + "loss": 0.544, + "mean_token_accuracy": 0.8061292171478271, + "step": 124 + }, + { + "epoch": 0.39308176100628933, + "grad_norm": 0.3053019642829895, + "learning_rate": 3.9992146234543645e-05, + "loss": 0.5359, + "mean_token_accuracy": 0.807460606098175, + "step": 125 + }, + { + "epoch": 0.39622641509433965, + "grad_norm": 0.26873913407325745, + "learning_rate": 3.999159529982493e-05, + "loss": 0.5514, + "mean_token_accuracy": 0.8021828532218933, + "step": 126 + }, + { + "epoch": 0.39937106918238996, + "grad_norm": 0.2870161533355713, + "learning_rate": 3.9991025695264205e-05, + "loss": 0.539, + "mean_token_accuracy": 0.8052948713302612, + "step": 127 + }, + { + "epoch": 0.4025157232704403, + "grad_norm": 0.2600720226764679, + "learning_rate": 3.9990437421452556e-05, + "loss": 0.5475, + "mean_token_accuracy": 0.8043171763420105, + "step": 128 + }, + { + "epoch": 0.4056603773584906, + "grad_norm": 0.31923291087150574, + "learning_rate": 3.9989830479000435e-05, + "loss": 0.536, + "mean_token_accuracy": 0.8079201579093933, + "step": 129 + }, + { + "epoch": 0.4088050314465409, + "grad_norm": 0.2548767924308777, + "learning_rate": 3.9989204868537654e-05, + "loss": 0.5412, + "mean_token_accuracy": 0.8052085041999817, + "step": 130 + }, + { + "epoch": 0.4119496855345912, + "grad_norm": 0.2763267755508423, + "learning_rate": 3.998856059071342e-05, + "loss": 0.5434, + "mean_token_accuracy": 0.8055262565612793, + "step": 131 + }, + { + "epoch": 0.41509433962264153, + "grad_norm": 0.2880299985408783, + "learning_rate": 3.99878976461963e-05, + "loss": 0.5349, + "mean_token_accuracy": 0.8085739016532898, + "step": 132 + }, + { + "epoch": 0.41823899371069184, + "grad_norm": 0.2466239184141159, + "learning_rate": 3.998721603567422e-05, + "loss": 0.5464, + "mean_token_accuracy": 0.8047433495521545, + "step": 133 + }, + { + "epoch": 0.42138364779874216, + "grad_norm": 0.3075404763221741, + "learning_rate": 3.9986515759854495e-05, + "loss": 0.5366, + "mean_token_accuracy": 0.8063505291938782, + "step": 134 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 0.2623771131038666, + "learning_rate": 3.9985796819463795e-05, + "loss": 0.5354, + "mean_token_accuracy": 0.8071600198745728, + "step": 135 + }, + { + "epoch": 0.4276729559748428, + "grad_norm": 0.2850979268550873, + "learning_rate": 3.998505921524816e-05, + "loss": 0.5383, + "mean_token_accuracy": 0.8051438331604004, + "step": 136 + }, + { + "epoch": 0.4308176100628931, + "grad_norm": 0.21707168221473694, + "learning_rate": 3.9984302947973005e-05, + "loss": 0.5326, + "mean_token_accuracy": 0.8092407584190369, + "step": 137 + }, + { + "epoch": 0.4339622641509434, + "grad_norm": 0.2864557206630707, + "learning_rate": 3.9983528018423106e-05, + "loss": 0.5376, + "mean_token_accuracy": 0.8068653345108032, + "step": 138 + }, + { + "epoch": 0.4371069182389937, + "grad_norm": 0.227940633893013, + "learning_rate": 3.998273442740261e-05, + "loss": 0.5355, + "mean_token_accuracy": 0.8075131177902222, + "step": 139 + }, + { + "epoch": 0.44025157232704404, + "grad_norm": 0.2534600496292114, + "learning_rate": 3.9981922175735014e-05, + "loss": 0.536, + "mean_token_accuracy": 0.8077684640884399, + "step": 140 + }, + { + "epoch": 0.44339622641509435, + "grad_norm": 0.3271535038948059, + "learning_rate": 3.9981091264263205e-05, + "loss": 0.5227, + "mean_token_accuracy": 0.8108183145523071, + "step": 141 + }, + { + "epoch": 0.44654088050314467, + "grad_norm": 0.2163369208574295, + "learning_rate": 3.998024169384941e-05, + "loss": 0.5402, + "mean_token_accuracy": 0.8060819506645203, + "step": 142 + }, + { + "epoch": 0.449685534591195, + "grad_norm": 0.4553479850292206, + "learning_rate": 3.997937346537522e-05, + "loss": 0.5296, + "mean_token_accuracy": 0.8097839951515198, + "step": 143 + }, + { + "epoch": 0.4528301886792453, + "grad_norm": 0.31248709559440613, + "learning_rate": 3.9978486579741596e-05, + "loss": 0.5374, + "mean_token_accuracy": 0.8072243928909302, + "step": 144 + }, + { + "epoch": 0.4559748427672956, + "grad_norm": 0.5377129316329956, + "learning_rate": 3.9977581037868874e-05, + "loss": 0.5358, + "mean_token_accuracy": 0.8072550296783447, + "step": 145 + }, + { + "epoch": 0.4591194968553459, + "grad_norm": 0.443276584148407, + "learning_rate": 3.997665684069671e-05, + "loss": 0.5346, + "mean_token_accuracy": 0.8065415024757385, + "step": 146 + }, + { + "epoch": 0.46226415094339623, + "grad_norm": 0.4334987699985504, + "learning_rate": 3.997571398918415e-05, + "loss": 0.5322, + "mean_token_accuracy": 0.8077476620674133, + "step": 147 + }, + { + "epoch": 0.46540880503144655, + "grad_norm": 0.4879336655139923, + "learning_rate": 3.99747524843096e-05, + "loss": 0.5402, + "mean_token_accuracy": 0.8054540157318115, + "step": 148 + }, + { + "epoch": 0.46855345911949686, + "grad_norm": 0.3232817053794861, + "learning_rate": 3.9973772327070805e-05, + "loss": 0.5281, + "mean_token_accuracy": 0.8070737719535828, + "step": 149 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 0.373993843793869, + "learning_rate": 3.997277351848486e-05, + "loss": 0.542, + "mean_token_accuracy": 0.8036733865737915, + "step": 150 + }, + { + "epoch": 0.4748427672955975, + "grad_norm": 0.2840160131454468, + "learning_rate": 3.997175605958825e-05, + "loss": 0.525, + "mean_token_accuracy": 0.8079817891120911, + "step": 151 + }, + { + "epoch": 0.4779874213836478, + "grad_norm": 0.34674006700515747, + "learning_rate": 3.997071995143676e-05, + "loss": 0.5432, + "mean_token_accuracy": 0.8057137727737427, + "step": 152 + }, + { + "epoch": 0.4811320754716981, + "grad_norm": 0.2753613591194153, + "learning_rate": 3.9969665195105585e-05, + "loss": 0.5401, + "mean_token_accuracy": 0.8051686882972717, + "step": 153 + }, + { + "epoch": 0.48427672955974843, + "grad_norm": 0.338226854801178, + "learning_rate": 3.996859179168923e-05, + "loss": 0.5343, + "mean_token_accuracy": 0.8090754747390747, + "step": 154 + }, + { + "epoch": 0.48742138364779874, + "grad_norm": 0.32556068897247314, + "learning_rate": 3.996749974230157e-05, + "loss": 0.5375, + "mean_token_accuracy": 0.8059588670730591, + "step": 155 + }, + { + "epoch": 0.49056603773584906, + "grad_norm": 0.2867760956287384, + "learning_rate": 3.9966389048075815e-05, + "loss": 0.5243, + "mean_token_accuracy": 0.8106968998908997, + "step": 156 + }, + { + "epoch": 0.4937106918238994, + "grad_norm": 0.33583030104637146, + "learning_rate": 3.996525971016453e-05, + "loss": 0.5333, + "mean_token_accuracy": 0.8073939681053162, + "step": 157 + }, + { + "epoch": 0.4968553459119497, + "grad_norm": 0.25750523805618286, + "learning_rate": 3.9964111729739636e-05, + "loss": 0.5366, + "mean_token_accuracy": 0.8069300651550293, + "step": 158 + }, + { + "epoch": 0.5, + "grad_norm": 0.3350702226161957, + "learning_rate": 3.9962945107992384e-05, + "loss": 0.5297, + "mean_token_accuracy": 0.8082327246665955, + "step": 159 + }, + { + "epoch": 0.5031446540880503, + "grad_norm": 0.25913313031196594, + "learning_rate": 3.996175984613337e-05, + "loss": 0.5346, + "mean_token_accuracy": 0.8069829940795898, + "step": 160 + }, + { + "epoch": 0.5062893081761006, + "grad_norm": 0.26537439227104187, + "learning_rate": 3.996055594539255e-05, + "loss": 0.5227, + "mean_token_accuracy": 0.8109566569328308, + "step": 161 + }, + { + "epoch": 0.5094339622641509, + "grad_norm": 0.29593321681022644, + "learning_rate": 3.99593334070192e-05, + "loss": 0.523, + "mean_token_accuracy": 0.8093637228012085, + "step": 162 + }, + { + "epoch": 0.5125786163522013, + "grad_norm": 0.2575869858264923, + "learning_rate": 3.995809223228195e-05, + "loss": 0.5406, + "mean_token_accuracy": 0.8029673099517822, + "step": 163 + }, + { + "epoch": 0.5157232704402516, + "grad_norm": 0.301028847694397, + "learning_rate": 3.995683242246876e-05, + "loss": 0.5331, + "mean_token_accuracy": 0.8078096508979797, + "step": 164 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 0.29553452134132385, + "learning_rate": 3.995555397888693e-05, + "loss": 0.5213, + "mean_token_accuracy": 0.8104479312896729, + "step": 165 + }, + { + "epoch": 0.5220125786163522, + "grad_norm": 0.30347883701324463, + "learning_rate": 3.995425690286311e-05, + "loss": 0.5386, + "mean_token_accuracy": 0.8064728379249573, + "step": 166 + }, + { + "epoch": 0.5251572327044025, + "grad_norm": 0.3146721422672272, + "learning_rate": 3.995294119574326e-05, + "loss": 0.5259, + "mean_token_accuracy": 0.8080021739006042, + "step": 167 + }, + { + "epoch": 0.5283018867924528, + "grad_norm": 0.26118120551109314, + "learning_rate": 3.99516068588927e-05, + "loss": 0.5187, + "mean_token_accuracy": 0.812569260597229, + "step": 168 + }, + { + "epoch": 0.5314465408805031, + "grad_norm": 0.3231758177280426, + "learning_rate": 3.995025389369606e-05, + "loss": 0.5239, + "mean_token_accuracy": 0.8090735673904419, + "step": 169 + }, + { + "epoch": 0.5345911949685535, + "grad_norm": 0.21853643655776978, + "learning_rate": 3.9948882301557306e-05, + "loss": 0.5162, + "mean_token_accuracy": 0.8130766153335571, + "step": 170 + }, + { + "epoch": 0.5377358490566038, + "grad_norm": 0.2971287667751312, + "learning_rate": 3.9947492083899746e-05, + "loss": 0.5249, + "mean_token_accuracy": 0.8092015385627747, + "step": 171 + }, + { + "epoch": 0.5408805031446541, + "grad_norm": 0.2766264081001282, + "learning_rate": 3.9946083242166e-05, + "loss": 0.5202, + "mean_token_accuracy": 0.8101739287376404, + "step": 172 + }, + { + "epoch": 0.5440251572327044, + "grad_norm": 0.22849610447883606, + "learning_rate": 3.9944655777818024e-05, + "loss": 0.5173, + "mean_token_accuracy": 0.8131073117256165, + "step": 173 + }, + { + "epoch": 0.5471698113207547, + "grad_norm": 0.3846280872821808, + "learning_rate": 3.994320969233709e-05, + "loss": 0.5295, + "mean_token_accuracy": 0.807365357875824, + "step": 174 + }, + { + "epoch": 0.550314465408805, + "grad_norm": 0.2353799194097519, + "learning_rate": 3.9941744987223796e-05, + "loss": 0.5241, + "mean_token_accuracy": 0.810299813747406, + "step": 175 + }, + { + "epoch": 0.5534591194968553, + "grad_norm": 0.36173367500305176, + "learning_rate": 3.994026166399808e-05, + "loss": 0.531, + "mean_token_accuracy": 0.8072293996810913, + "step": 176 + }, + { + "epoch": 0.5566037735849056, + "grad_norm": 0.25161418318748474, + "learning_rate": 3.993875972419916e-05, + "loss": 0.5245, + "mean_token_accuracy": 0.8084467053413391, + "step": 177 + }, + { + "epoch": 0.559748427672956, + "grad_norm": 0.411323219537735, + "learning_rate": 3.993723916938561e-05, + "loss": 0.5297, + "mean_token_accuracy": 0.8063352704048157, + "step": 178 + }, + { + "epoch": 0.5628930817610063, + "grad_norm": 0.28566470742225647, + "learning_rate": 3.9935700001135307e-05, + "loss": 0.5246, + "mean_token_accuracy": 0.8076804280281067, + "step": 179 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 0.3744773864746094, + "learning_rate": 3.9934142221045434e-05, + "loss": 0.5219, + "mean_token_accuracy": 0.8113803863525391, + "step": 180 + }, + { + "epoch": 0.5691823899371069, + "grad_norm": 0.3109516501426697, + "learning_rate": 3.9932565830732505e-05, + "loss": 0.5152, + "mean_token_accuracy": 0.8134464025497437, + "step": 181 + }, + { + "epoch": 0.5723270440251572, + "grad_norm": 0.40750232338905334, + "learning_rate": 3.993097083183233e-05, + "loss": 0.5119, + "mean_token_accuracy": 0.8139127492904663, + "step": 182 + }, + { + "epoch": 0.5754716981132075, + "grad_norm": 0.3474627733230591, + "learning_rate": 3.9929357226000045e-05, + "loss": 0.538, + "mean_token_accuracy": 0.8060604333877563, + "step": 183 + }, + { + "epoch": 0.5786163522012578, + "grad_norm": 0.39326900243759155, + "learning_rate": 3.992772501491008e-05, + "loss": 0.5209, + "mean_token_accuracy": 0.8119919300079346, + "step": 184 + }, + { + "epoch": 0.5817610062893082, + "grad_norm": 0.390663743019104, + "learning_rate": 3.992607420025618e-05, + "loss": 0.5165, + "mean_token_accuracy": 0.8117203712463379, + "step": 185 + }, + { + "epoch": 0.5849056603773585, + "grad_norm": 0.2819702923297882, + "learning_rate": 3.9924404783751385e-05, + "loss": 0.5287, + "mean_token_accuracy": 0.8078507781028748, + "step": 186 + }, + { + "epoch": 0.5880503144654088, + "grad_norm": 0.3405737280845642, + "learning_rate": 3.992271676712805e-05, + "loss": 0.5264, + "mean_token_accuracy": 0.8081623315811157, + "step": 187 + }, + { + "epoch": 0.5911949685534591, + "grad_norm": 0.24368081986904144, + "learning_rate": 3.9921010152137824e-05, + "loss": 0.5298, + "mean_token_accuracy": 0.8077815771102905, + "step": 188 + }, + { + "epoch": 0.5943396226415094, + "grad_norm": 0.2949293553829193, + "learning_rate": 3.991928494055166e-05, + "loss": 0.5169, + "mean_token_accuracy": 0.8117429614067078, + "step": 189 + }, + { + "epoch": 0.5974842767295597, + "grad_norm": 0.2298242747783661, + "learning_rate": 3.991754113415981e-05, + "loss": 0.5209, + "mean_token_accuracy": 0.8099380731582642, + "step": 190 + }, + { + "epoch": 0.60062893081761, + "grad_norm": 0.2921850383281708, + "learning_rate": 3.9915778734771816e-05, + "loss": 0.5249, + "mean_token_accuracy": 0.8096524477005005, + "step": 191 + }, + { + "epoch": 0.6037735849056604, + "grad_norm": 0.19392161071300507, + "learning_rate": 3.991399774421651e-05, + "loss": 0.5114, + "mean_token_accuracy": 0.8131522536277771, + "step": 192 + }, + { + "epoch": 0.6069182389937107, + "grad_norm": 0.3399420976638794, + "learning_rate": 3.991219816434204e-05, + "loss": 0.5156, + "mean_token_accuracy": 0.8106433749198914, + "step": 193 + }, + { + "epoch": 0.610062893081761, + "grad_norm": 0.2603597939014435, + "learning_rate": 3.99103799970158e-05, + "loss": 0.5152, + "mean_token_accuracy": 0.8112037777900696, + "step": 194 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 0.2951433062553406, + "learning_rate": 3.990854324412453e-05, + "loss": 0.5243, + "mean_token_accuracy": 0.8115384578704834, + "step": 195 + }, + { + "epoch": 0.6163522012578616, + "grad_norm": 0.2732630968093872, + "learning_rate": 3.9906687907574186e-05, + "loss": 0.525, + "mean_token_accuracy": 0.8100634813308716, + "step": 196 + }, + { + "epoch": 0.6194968553459119, + "grad_norm": 0.24679310619831085, + "learning_rate": 3.9904813989290084e-05, + "loss": 0.5178, + "mean_token_accuracy": 0.8117086291313171, + "step": 197 + }, + { + "epoch": 0.6226415094339622, + "grad_norm": 0.2945667505264282, + "learning_rate": 3.990292149121675e-05, + "loss": 0.5088, + "mean_token_accuracy": 0.8141837120056152, + "step": 198 + }, + { + "epoch": 0.6257861635220126, + "grad_norm": 0.24672532081604004, + "learning_rate": 3.9901010415318066e-05, + "loss": 0.5146, + "mean_token_accuracy": 0.8105906844139099, + "step": 199 + }, + { + "epoch": 0.6289308176100629, + "grad_norm": 0.32529643177986145, + "learning_rate": 3.9899080763577126e-05, + "loss": 0.5253, + "mean_token_accuracy": 0.8105737566947937, + "step": 200 + }, + { + "epoch": 0.6320754716981132, + "grad_norm": 0.2720421552658081, + "learning_rate": 3.9897132537996326e-05, + "loss": 0.5182, + "mean_token_accuracy": 0.8114479780197144, + "step": 201 + }, + { + "epoch": 0.6352201257861635, + "grad_norm": 0.2893914580345154, + "learning_rate": 3.9895165740597336e-05, + "loss": 0.5259, + "mean_token_accuracy": 0.8097965121269226, + "step": 202 + }, + { + "epoch": 0.6383647798742138, + "grad_norm": 0.2724241018295288, + "learning_rate": 3.989318037342111e-05, + "loss": 0.5149, + "mean_token_accuracy": 0.8123500347137451, + "step": 203 + }, + { + "epoch": 0.6415094339622641, + "grad_norm": 0.2764907479286194, + "learning_rate": 3.989117643852785e-05, + "loss": 0.5153, + "mean_token_accuracy": 0.8120966553688049, + "step": 204 + }, + { + "epoch": 0.6446540880503144, + "grad_norm": 0.24415266513824463, + "learning_rate": 3.988915393799703e-05, + "loss": 0.5111, + "mean_token_accuracy": 0.8132697343826294, + "step": 205 + }, + { + "epoch": 0.6477987421383647, + "grad_norm": 0.24454502761363983, + "learning_rate": 3.988711287392741e-05, + "loss": 0.5063, + "mean_token_accuracy": 0.8129962682723999, + "step": 206 + }, + { + "epoch": 0.6509433962264151, + "grad_norm": 0.25321948528289795, + "learning_rate": 3.9885053248436986e-05, + "loss": 0.5117, + "mean_token_accuracy": 0.8123859763145447, + "step": 207 + }, + { + "epoch": 0.6540880503144654, + "grad_norm": 0.21559487283229828, + "learning_rate": 3.9882975063663026e-05, + "loss": 0.5242, + "mean_token_accuracy": 0.8082608580589294, + "step": 208 + }, + { + "epoch": 0.6572327044025157, + "grad_norm": 0.2384122759103775, + "learning_rate": 3.9880878321762066e-05, + "loss": 0.5156, + "mean_token_accuracy": 0.8102776408195496, + "step": 209 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 0.24567940831184387, + "learning_rate": 3.9878763024909884e-05, + "loss": 0.5135, + "mean_token_accuracy": 0.8105263113975525, + "step": 210 + }, + { + "epoch": 0.6635220125786163, + "grad_norm": 0.2328757643699646, + "learning_rate": 3.987662917530153e-05, + "loss": 0.5117, + "mean_token_accuracy": 0.812843918800354, + "step": 211 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.22722886502742767, + "learning_rate": 3.987447677515128e-05, + "loss": 0.5118, + "mean_token_accuracy": 0.8124209642410278, + "step": 212 + }, + { + "epoch": 0.6698113207547169, + "grad_norm": 0.24911589920520782, + "learning_rate": 3.9872305826692686e-05, + "loss": 0.5164, + "mean_token_accuracy": 0.809599757194519, + "step": 213 + }, + { + "epoch": 0.6729559748427673, + "grad_norm": 0.21305599808692932, + "learning_rate": 3.987011633217853e-05, + "loss": 0.5184, + "mean_token_accuracy": 0.8106415271759033, + "step": 214 + }, + { + "epoch": 0.6761006289308176, + "grad_norm": 0.2465692013502121, + "learning_rate": 3.986790829388086e-05, + "loss": 0.5182, + "mean_token_accuracy": 0.8113357424736023, + "step": 215 + }, + { + "epoch": 0.6792452830188679, + "grad_norm": 0.23801495134830475, + "learning_rate": 3.986568171409094e-05, + "loss": 0.5161, + "mean_token_accuracy": 0.8109778761863708, + "step": 216 + }, + { + "epoch": 0.6823899371069182, + "grad_norm": 0.22071239352226257, + "learning_rate": 3.9863436595119284e-05, + "loss": 0.5156, + "mean_token_accuracy": 0.8117619156837463, + "step": 217 + }, + { + "epoch": 0.6855345911949685, + "grad_norm": 0.21508371829986572, + "learning_rate": 3.986117293929566e-05, + "loss": 0.4984, + "mean_token_accuracy": 0.8161824345588684, + "step": 218 + }, + { + "epoch": 0.6886792452830188, + "grad_norm": 0.21123456954956055, + "learning_rate": 3.985889074896904e-05, + "loss": 0.5068, + "mean_token_accuracy": 0.8125125169754028, + "step": 219 + }, + { + "epoch": 0.6918238993710691, + "grad_norm": 0.2076350450515747, + "learning_rate": 3.985659002650767e-05, + "loss": 0.5173, + "mean_token_accuracy": 0.8120942115783691, + "step": 220 + }, + { + "epoch": 0.6949685534591195, + "grad_norm": 0.20349815487861633, + "learning_rate": 3.9854270774299e-05, + "loss": 0.5088, + "mean_token_accuracy": 0.8142285943031311, + "step": 221 + }, + { + "epoch": 0.6981132075471698, + "grad_norm": 0.2199343591928482, + "learning_rate": 3.9851932994749704e-05, + "loss": 0.5158, + "mean_token_accuracy": 0.811479926109314, + "step": 222 + }, + { + "epoch": 0.7012578616352201, + "grad_norm": 0.2165432721376419, + "learning_rate": 3.98495766902857e-05, + "loss": 0.5092, + "mean_token_accuracy": 0.8122998476028442, + "step": 223 + }, + { + "epoch": 0.7044025157232704, + "grad_norm": 0.21602460741996765, + "learning_rate": 3.984720186335211e-05, + "loss": 0.5192, + "mean_token_accuracy": 0.811272382736206, + "step": 224 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 0.2049703598022461, + "learning_rate": 3.9844808516413305e-05, + "loss": 0.5076, + "mean_token_accuracy": 0.8121902346611023, + "step": 225 + }, + { + "epoch": 0.710691823899371, + "grad_norm": 0.22164130210876465, + "learning_rate": 3.9842396651952836e-05, + "loss": 0.5131, + "mean_token_accuracy": 0.8126029968261719, + "step": 226 + }, + { + "epoch": 0.7138364779874213, + "grad_norm": 0.19871044158935547, + "learning_rate": 3.98399662724735e-05, + "loss": 0.5163, + "mean_token_accuracy": 0.8120744228363037, + "step": 227 + }, + { + "epoch": 0.7169811320754716, + "grad_norm": 0.25331369042396545, + "learning_rate": 3.98375173804973e-05, + "loss": 0.512, + "mean_token_accuracy": 0.811780571937561, + "step": 228 + }, + { + "epoch": 0.720125786163522, + "grad_norm": 0.21906249225139618, + "learning_rate": 3.983504997856544e-05, + "loss": 0.5169, + "mean_token_accuracy": 0.8110173344612122, + "step": 229 + }, + { + "epoch": 0.7232704402515723, + "grad_norm": 0.22297775745391846, + "learning_rate": 3.983256406923835e-05, + "loss": 0.5185, + "mean_token_accuracy": 0.8114131689071655, + "step": 230 + }, + { + "epoch": 0.7264150943396226, + "grad_norm": 0.19183681905269623, + "learning_rate": 3.9830059655095625e-05, + "loss": 0.516, + "mean_token_accuracy": 0.8132800459861755, + "step": 231 + }, + { + "epoch": 0.7295597484276729, + "grad_norm": 0.24273158609867096, + "learning_rate": 3.9827536738736115e-05, + "loss": 0.5179, + "mean_token_accuracy": 0.8097060322761536, + "step": 232 + }, + { + "epoch": 0.7327044025157232, + "grad_norm": 0.21596857905387878, + "learning_rate": 3.982499532277785e-05, + "loss": 0.491, + "mean_token_accuracy": 0.8167231678962708, + "step": 233 + }, + { + "epoch": 0.7358490566037735, + "grad_norm": 0.23281732201576233, + "learning_rate": 3.982243540985803e-05, + "loss": 0.5211, + "mean_token_accuracy": 0.8096528649330139, + "step": 234 + }, + { + "epoch": 0.7389937106918238, + "grad_norm": 0.1850733757019043, + "learning_rate": 3.9819857002633084e-05, + "loss": 0.5119, + "mean_token_accuracy": 0.812339723110199, + "step": 235 + }, + { + "epoch": 0.7421383647798742, + "grad_norm": 0.2426994889974594, + "learning_rate": 3.981726010377862e-05, + "loss": 0.5155, + "mean_token_accuracy": 0.8118655681610107, + "step": 236 + }, + { + "epoch": 0.7452830188679245, + "grad_norm": 0.20813558995723724, + "learning_rate": 3.981464471598943e-05, + "loss": 0.5147, + "mean_token_accuracy": 0.8122422099113464, + "step": 237 + }, + { + "epoch": 0.7484276729559748, + "grad_norm": 0.25706005096435547, + "learning_rate": 3.98120108419795e-05, + "loss": 0.5029, + "mean_token_accuracy": 0.814775824546814, + "step": 238 + }, + { + "epoch": 0.7515723270440252, + "grad_norm": 0.22012631595134735, + "learning_rate": 3.9809358484482e-05, + "loss": 0.5086, + "mean_token_accuracy": 0.8135779500007629, + "step": 239 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.2591609060764313, + "learning_rate": 3.980668764624927e-05, + "loss": 0.5166, + "mean_token_accuracy": 0.811547577381134, + "step": 240 + }, + { + "epoch": 0.7578616352201258, + "grad_norm": 0.2425551414489746, + "learning_rate": 3.9803998330052834e-05, + "loss": 0.5106, + "mean_token_accuracy": 0.8126974105834961, + "step": 241 + }, + { + "epoch": 0.7610062893081762, + "grad_norm": 0.19677698612213135, + "learning_rate": 3.980129053868339e-05, + "loss": 0.5025, + "mean_token_accuracy": 0.8132906556129456, + "step": 242 + }, + { + "epoch": 0.7641509433962265, + "grad_norm": 0.24536176025867462, + "learning_rate": 3.9798564274950815e-05, + "loss": 0.4983, + "mean_token_accuracy": 0.8168281316757202, + "step": 243 + }, + { + "epoch": 0.7672955974842768, + "grad_norm": 0.23609694838523865, + "learning_rate": 3.979581954168414e-05, + "loss": 0.4942, + "mean_token_accuracy": 0.8176018595695496, + "step": 244 + }, + { + "epoch": 0.7704402515723271, + "grad_norm": 0.2168692797422409, + "learning_rate": 3.9793056341731556e-05, + "loss": 0.5168, + "mean_token_accuracy": 0.8108464479446411, + "step": 245 + }, + { + "epoch": 0.7735849056603774, + "grad_norm": 0.21630840003490448, + "learning_rate": 3.979027467796046e-05, + "loss": 0.501, + "mean_token_accuracy": 0.8146473169326782, + "step": 246 + }, + { + "epoch": 0.7767295597484277, + "grad_norm": 0.20055299997329712, + "learning_rate": 3.978747455325736e-05, + "loss": 0.5082, + "mean_token_accuracy": 0.8135594129562378, + "step": 247 + }, + { + "epoch": 0.779874213836478, + "grad_norm": 0.282925546169281, + "learning_rate": 3.9784655970527934e-05, + "loss": 0.5144, + "mean_token_accuracy": 0.8131219744682312, + "step": 248 + }, + { + "epoch": 0.7830188679245284, + "grad_norm": 0.21999655663967133, + "learning_rate": 3.978181893269703e-05, + "loss": 0.5046, + "mean_token_accuracy": 0.8133974671363831, + "step": 249 + }, + { + "epoch": 0.7861635220125787, + "grad_norm": 0.2439885139465332, + "learning_rate": 3.977896344270864e-05, + "loss": 0.5052, + "mean_token_accuracy": 0.8127160668373108, + "step": 250 + }, + { + "epoch": 0.789308176100629, + "grad_norm": 0.2688886225223541, + "learning_rate": 3.9776089503525895e-05, + "loss": 0.5109, + "mean_token_accuracy": 0.81061190366745, + "step": 251 + }, + { + "epoch": 0.7924528301886793, + "grad_norm": 0.2625892162322998, + "learning_rate": 3.977319711813107e-05, + "loss": 0.5172, + "mean_token_accuracy": 0.8132520914077759, + "step": 252 + }, + { + "epoch": 0.7955974842767296, + "grad_norm": 0.1982739269733429, + "learning_rate": 3.97702862895256e-05, + "loss": 0.499, + "mean_token_accuracy": 0.8147685527801514, + "step": 253 + }, + { + "epoch": 0.7987421383647799, + "grad_norm": 0.2820964455604553, + "learning_rate": 3.976735702073003e-05, + "loss": 0.5134, + "mean_token_accuracy": 0.8140535354614258, + "step": 254 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 0.19811579585075378, + "learning_rate": 3.9764409314784074e-05, + "loss": 0.5176, + "mean_token_accuracy": 0.8116709589958191, + "step": 255 + }, + { + "epoch": 0.8050314465408805, + "grad_norm": 0.26263049244880676, + "learning_rate": 3.9761443174746556e-05, + "loss": 0.5162, + "mean_token_accuracy": 0.8113158345222473, + "step": 256 + }, + { + "epoch": 0.8081761006289309, + "grad_norm": 0.24067486822605133, + "learning_rate": 3.975845860369542e-05, + "loss": 0.5055, + "mean_token_accuracy": 0.814015805721283, + "step": 257 + }, + { + "epoch": 0.8113207547169812, + "grad_norm": 0.19591133296489716, + "learning_rate": 3.975545560472776e-05, + "loss": 0.5074, + "mean_token_accuracy": 0.8133794069290161, + "step": 258 + }, + { + "epoch": 0.8144654088050315, + "grad_norm": 0.24649196863174438, + "learning_rate": 3.975243418095978e-05, + "loss": 0.5141, + "mean_token_accuracy": 0.8121534585952759, + "step": 259 + }, + { + "epoch": 0.8176100628930818, + "grad_norm": 0.20178139209747314, + "learning_rate": 3.9749394335526806e-05, + "loss": 0.5016, + "mean_token_accuracy": 0.8149169087409973, + "step": 260 + }, + { + "epoch": 0.8207547169811321, + "grad_norm": 0.2396860122680664, + "learning_rate": 3.9746336071583284e-05, + "loss": 0.5112, + "mean_token_accuracy": 0.8116641640663147, + "step": 261 + }, + { + "epoch": 0.8238993710691824, + "grad_norm": 0.2169322371482849, + "learning_rate": 3.9743259392302765e-05, + "loss": 0.5033, + "mean_token_accuracy": 0.8144574761390686, + "step": 262 + }, + { + "epoch": 0.8270440251572327, + "grad_norm": 0.27799248695373535, + "learning_rate": 3.9740164300877905e-05, + "loss": 0.5134, + "mean_token_accuracy": 0.8135626316070557, + "step": 263 + }, + { + "epoch": 0.8301886792452831, + "grad_norm": 0.2029547095298767, + "learning_rate": 3.9737050800520484e-05, + "loss": 0.4998, + "mean_token_accuracy": 0.8158847689628601, + "step": 264 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.23742541670799255, + "learning_rate": 3.973391889446137e-05, + "loss": 0.5002, + "mean_token_accuracy": 0.8158310651779175, + "step": 265 + }, + { + "epoch": 0.8364779874213837, + "grad_norm": 0.23540346324443817, + "learning_rate": 3.973076858595054e-05, + "loss": 0.5075, + "mean_token_accuracy": 0.8140506148338318, + "step": 266 + }, + { + "epoch": 0.839622641509434, + "grad_norm": 0.21212562918663025, + "learning_rate": 3.972759987825706e-05, + "loss": 0.4964, + "mean_token_accuracy": 0.817358672618866, + "step": 267 + }, + { + "epoch": 0.8427672955974843, + "grad_norm": 0.2811741828918457, + "learning_rate": 3.972441277466909e-05, + "loss": 0.5045, + "mean_token_accuracy": 0.8151137828826904, + "step": 268 + }, + { + "epoch": 0.8459119496855346, + "grad_norm": 0.22324548661708832, + "learning_rate": 3.972120727849388e-05, + "loss": 0.5184, + "mean_token_accuracy": 0.8102690577507019, + "step": 269 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 0.2826754152774811, + "learning_rate": 3.9717983393057774e-05, + "loss": 0.5033, + "mean_token_accuracy": 0.8149948120117188, + "step": 270 + }, + { + "epoch": 0.8522012578616353, + "grad_norm": 0.20113104581832886, + "learning_rate": 3.971474112170618e-05, + "loss": 0.4963, + "mean_token_accuracy": 0.8169792890548706, + "step": 271 + }, + { + "epoch": 0.8553459119496856, + "grad_norm": 0.31522729992866516, + "learning_rate": 3.971148046780361e-05, + "loss": 0.5043, + "mean_token_accuracy": 0.8139733672142029, + "step": 272 + }, + { + "epoch": 0.8584905660377359, + "grad_norm": 0.24440452456474304, + "learning_rate": 3.970820143473363e-05, + "loss": 0.506, + "mean_token_accuracy": 0.8141810297966003, + "step": 273 + }, + { + "epoch": 0.8616352201257862, + "grad_norm": 0.34243300557136536, + "learning_rate": 3.970490402589889e-05, + "loss": 0.5009, + "mean_token_accuracy": 0.8145892024040222, + "step": 274 + }, + { + "epoch": 0.8647798742138365, + "grad_norm": 0.3184676468372345, + "learning_rate": 3.970158824472109e-05, + "loss": 0.5019, + "mean_token_accuracy": 0.8156532049179077, + "step": 275 + }, + { + "epoch": 0.8679245283018868, + "grad_norm": 0.2528303861618042, + "learning_rate": 3.969825409464103e-05, + "loss": 0.5099, + "mean_token_accuracy": 0.8132652044296265, + "step": 276 + }, + { + "epoch": 0.8710691823899371, + "grad_norm": 0.2652378976345062, + "learning_rate": 3.969490157911854e-05, + "loss": 0.4965, + "mean_token_accuracy": 0.8170227408409119, + "step": 277 + }, + { + "epoch": 0.8742138364779874, + "grad_norm": 0.2514651119709015, + "learning_rate": 3.9691530701632525e-05, + "loss": 0.4947, + "mean_token_accuracy": 0.8181447386741638, + "step": 278 + }, + { + "epoch": 0.8773584905660378, + "grad_norm": 0.24551190435886383, + "learning_rate": 3.968814146568093e-05, + "loss": 0.5039, + "mean_token_accuracy": 0.8154646158218384, + "step": 279 + }, + { + "epoch": 0.8805031446540881, + "grad_norm": 0.250418096780777, + "learning_rate": 3.9684733874780764e-05, + "loss": 0.5057, + "mean_token_accuracy": 0.8137528896331787, + "step": 280 + }, + { + "epoch": 0.8836477987421384, + "grad_norm": 0.24162472784519196, + "learning_rate": 3.9681307932468066e-05, + "loss": 0.499, + "mean_token_accuracy": 0.8132847547531128, + "step": 281 + }, + { + "epoch": 0.8867924528301887, + "grad_norm": 0.25049126148223877, + "learning_rate": 3.967786364229794e-05, + "loss": 0.5016, + "mean_token_accuracy": 0.8135824203491211, + "step": 282 + }, + { + "epoch": 0.889937106918239, + "grad_norm": 0.21691341698169708, + "learning_rate": 3.9674401007844525e-05, + "loss": 0.4909, + "mean_token_accuracy": 0.8152883052825928, + "step": 283 + }, + { + "epoch": 0.8930817610062893, + "grad_norm": 0.22291503846645355, + "learning_rate": 3.967092003270098e-05, + "loss": 0.4959, + "mean_token_accuracy": 0.8131790161132812, + "step": 284 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 0.21946825087070465, + "learning_rate": 3.96674207204795e-05, + "loss": 0.4943, + "mean_token_accuracy": 0.8152416348457336, + "step": 285 + }, + { + "epoch": 0.89937106918239, + "grad_norm": 0.21576789021492004, + "learning_rate": 3.966390307481133e-05, + "loss": 0.4963, + "mean_token_accuracy": 0.8158524036407471, + "step": 286 + }, + { + "epoch": 0.9025157232704403, + "grad_norm": 0.18200084567070007, + "learning_rate": 3.966036709934671e-05, + "loss": 0.4983, + "mean_token_accuracy": 0.8171305656433105, + "step": 287 + }, + { + "epoch": 0.9056603773584906, + "grad_norm": 0.20305195450782776, + "learning_rate": 3.9656812797754924e-05, + "loss": 0.5004, + "mean_token_accuracy": 0.8154840469360352, + "step": 288 + }, + { + "epoch": 0.9088050314465409, + "grad_norm": 0.17034946382045746, + "learning_rate": 3.965324017372426e-05, + "loss": 0.4975, + "mean_token_accuracy": 0.8167921304702759, + "step": 289 + }, + { + "epoch": 0.9119496855345912, + "grad_norm": 0.2208290696144104, + "learning_rate": 3.964964923096202e-05, + "loss": 0.5022, + "mean_token_accuracy": 0.815450131893158, + "step": 290 + }, + { + "epoch": 0.9150943396226415, + "grad_norm": 0.18164092302322388, + "learning_rate": 3.964603997319452e-05, + "loss": 0.4964, + "mean_token_accuracy": 0.8163471221923828, + "step": 291 + }, + { + "epoch": 0.9182389937106918, + "grad_norm": 0.18271009624004364, + "learning_rate": 3.964241240416708e-05, + "loss": 0.489, + "mean_token_accuracy": 0.8168913125991821, + "step": 292 + }, + { + "epoch": 0.9213836477987422, + "grad_norm": 0.19366014003753662, + "learning_rate": 3.963876652764402e-05, + "loss": 0.5002, + "mean_token_accuracy": 0.815049946308136, + "step": 293 + }, + { + "epoch": 0.9245283018867925, + "grad_norm": 0.21568985283374786, + "learning_rate": 3.963510234740866e-05, + "loss": 0.5057, + "mean_token_accuracy": 0.8131378293037415, + "step": 294 + }, + { + "epoch": 0.9276729559748428, + "grad_norm": 0.16388444602489471, + "learning_rate": 3.963141986726332e-05, + "loss": 0.4991, + "mean_token_accuracy": 0.8145990371704102, + "step": 295 + }, + { + "epoch": 0.9308176100628931, + "grad_norm": 0.21059076488018036, + "learning_rate": 3.962771909102928e-05, + "loss": 0.5068, + "mean_token_accuracy": 0.8122072815895081, + "step": 296 + }, + { + "epoch": 0.9339622641509434, + "grad_norm": 0.20338748395442963, + "learning_rate": 3.962400002254685e-05, + "loss": 0.5021, + "mean_token_accuracy": 0.8148384690284729, + "step": 297 + }, + { + "epoch": 0.9371069182389937, + "grad_norm": 0.2134150117635727, + "learning_rate": 3.962026266567529e-05, + "loss": 0.5077, + "mean_token_accuracy": 0.8135893940925598, + "step": 298 + }, + { + "epoch": 0.940251572327044, + "grad_norm": 0.2366155982017517, + "learning_rate": 3.961650702429285e-05, + "loss": 0.5028, + "mean_token_accuracy": 0.8151034712791443, + "step": 299 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 0.19872386753559113, + "learning_rate": 3.9612733102296757e-05, + "loss": 0.49, + "mean_token_accuracy": 0.818320095539093, + "step": 300 + }, + { + "epoch": 0.9465408805031447, + "grad_norm": 0.24557925760746002, + "learning_rate": 3.960894090360319e-05, + "loss": 0.4978, + "mean_token_accuracy": 0.8179511427879333, + "step": 301 + }, + { + "epoch": 0.949685534591195, + "grad_norm": 0.18634836375713348, + "learning_rate": 3.960513043214731e-05, + "loss": 0.508, + "mean_token_accuracy": 0.8148860335350037, + "step": 302 + }, + { + "epoch": 0.9528301886792453, + "grad_norm": 0.2479529231786728, + "learning_rate": 3.960130169188324e-05, + "loss": 0.5061, + "mean_token_accuracy": 0.8134167194366455, + "step": 303 + }, + { + "epoch": 0.9559748427672956, + "grad_norm": 0.2314770370721817, + "learning_rate": 3.959745468678407e-05, + "loss": 0.506, + "mean_token_accuracy": 0.8139018416404724, + "step": 304 + }, + { + "epoch": 0.9591194968553459, + "grad_norm": 0.18175768852233887, + "learning_rate": 3.959358942084179e-05, + "loss": 0.5025, + "mean_token_accuracy": 0.814060389995575, + "step": 305 + }, + { + "epoch": 0.9622641509433962, + "grad_norm": 0.26930001378059387, + "learning_rate": 3.958970589806741e-05, + "loss": 0.4899, + "mean_token_accuracy": 0.8199323415756226, + "step": 306 + }, + { + "epoch": 0.9654088050314465, + "grad_norm": 0.18011023104190826, + "learning_rate": 3.958580412249085e-05, + "loss": 0.4998, + "mean_token_accuracy": 0.8157727122306824, + "step": 307 + }, + { + "epoch": 0.9685534591194969, + "grad_norm": 0.20771653950214386, + "learning_rate": 3.9581884098160965e-05, + "loss": 0.4946, + "mean_token_accuracy": 0.8169547915458679, + "step": 308 + }, + { + "epoch": 0.9716981132075472, + "grad_norm": 0.20119163393974304, + "learning_rate": 3.9577945829145565e-05, + "loss": 0.493, + "mean_token_accuracy": 0.816156268119812, + "step": 309 + }, + { + "epoch": 0.9748427672955975, + "grad_norm": 0.1962110996246338, + "learning_rate": 3.9573989319531376e-05, + "loss": 0.4838, + "mean_token_accuracy": 0.8181387782096863, + "step": 310 + }, + { + "epoch": 0.9779874213836478, + "grad_norm": 0.19966855645179749, + "learning_rate": 3.957001457342407e-05, + "loss": 0.494, + "mean_token_accuracy": 0.8177944421768188, + "step": 311 + }, + { + "epoch": 0.9811320754716981, + "grad_norm": 0.16196459531784058, + "learning_rate": 3.956602159494822e-05, + "loss": 0.4999, + "mean_token_accuracy": 0.8170042037963867, + "step": 312 + }, + { + "epoch": 0.9842767295597484, + "grad_norm": 0.2277836948633194, + "learning_rate": 3.956201038824734e-05, + "loss": 0.5079, + "mean_token_accuracy": 0.8125689625740051, + "step": 313 + }, + { + "epoch": 0.9874213836477987, + "grad_norm": 0.15822745859622955, + "learning_rate": 3.9557980957483857e-05, + "loss": 0.4944, + "mean_token_accuracy": 0.8152970671653748, + "step": 314 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 0.23701541125774384, + "learning_rate": 3.955393330683909e-05, + "loss": 0.4949, + "mean_token_accuracy": 0.8175249695777893, + "step": 315 + }, + { + "epoch": 0.9937106918238994, + "grad_norm": 0.17451459169387817, + "learning_rate": 3.954986744051329e-05, + "loss": 0.4903, + "mean_token_accuracy": 0.8196095824241638, + "step": 316 + }, + { + "epoch": 0.9968553459119497, + "grad_norm": 0.20685537159442902, + "learning_rate": 3.954578336272559e-05, + "loss": 0.4872, + "mean_token_accuracy": 0.8189926147460938, + "step": 317 + }, + { + "epoch": 1.0, + "grad_norm": 0.24290406703948975, + "learning_rate": 3.954168107771402e-05, + "loss": 0.4891, + "mean_token_accuracy": 0.8178141713142395, + "step": 318 + }, + { + "epoch": 1.0031446540880504, + "grad_norm": 0.17915529012680054, + "learning_rate": 3.953756058973554e-05, + "loss": 0.4626, + "mean_token_accuracy": 0.8239535689353943, + "step": 319 + }, + { + "epoch": 1.0062893081761006, + "grad_norm": 0.24146446585655212, + "learning_rate": 3.9533421903065946e-05, + "loss": 0.4629, + "mean_token_accuracy": 0.8243134617805481, + "step": 320 + }, + { + "epoch": 1.009433962264151, + "grad_norm": 0.24396361410617828, + "learning_rate": 3.9529265021999965e-05, + "loss": 0.4522, + "mean_token_accuracy": 0.8263998031616211, + "step": 321 + }, + { + "epoch": 1.0125786163522013, + "grad_norm": 0.21227137744426727, + "learning_rate": 3.952508995085117e-05, + "loss": 0.4574, + "mean_token_accuracy": 0.8258650898933411, + "step": 322 + }, + { + "epoch": 1.0157232704402517, + "grad_norm": 0.26239192485809326, + "learning_rate": 3.952089669395203e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.826561689376831, + "step": 323 + }, + { + "epoch": 1.0188679245283019, + "grad_norm": 0.21801580488681793, + "learning_rate": 3.951668525565387e-05, + "loss": 0.4621, + "mean_token_accuracy": 0.8246572017669678, + "step": 324 + }, + { + "epoch": 1.0220125786163523, + "grad_norm": 0.24545596539974213, + "learning_rate": 3.95124556403269e-05, + "loss": 0.442, + "mean_token_accuracy": 0.8272949457168579, + "step": 325 + }, + { + "epoch": 1.0251572327044025, + "grad_norm": 0.21632397174835205, + "learning_rate": 3.950820785236018e-05, + "loss": 0.4616, + "mean_token_accuracy": 0.8258722424507141, + "step": 326 + }, + { + "epoch": 1.028301886792453, + "grad_norm": 0.25987663865089417, + "learning_rate": 3.950394189616164e-05, + "loss": 0.4433, + "mean_token_accuracy": 0.8266246318817139, + "step": 327 + }, + { + "epoch": 1.0314465408805031, + "grad_norm": 0.21071231365203857, + "learning_rate": 3.949965777615804e-05, + "loss": 0.4587, + "mean_token_accuracy": 0.8251431584358215, + "step": 328 + }, + { + "epoch": 1.0345911949685536, + "grad_norm": 0.25732994079589844, + "learning_rate": 3.949535549679502e-05, + "loss": 0.4542, + "mean_token_accuracy": 0.825411856174469, + "step": 329 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 0.2170933187007904, + "learning_rate": 3.9491035062537026e-05, + "loss": 0.4488, + "mean_token_accuracy": 0.8288849592208862, + "step": 330 + }, + { + "epoch": 1.0408805031446542, + "grad_norm": 0.23197437822818756, + "learning_rate": 3.948669647786738e-05, + "loss": 0.4475, + "mean_token_accuracy": 0.8281545042991638, + "step": 331 + }, + { + "epoch": 1.0440251572327044, + "grad_norm": 0.2003556787967682, + "learning_rate": 3.948233974728821e-05, + "loss": 0.4581, + "mean_token_accuracy": 0.8240156173706055, + "step": 332 + }, + { + "epoch": 1.0471698113207548, + "grad_norm": 0.24654971063137054, + "learning_rate": 3.94779648753205e-05, + "loss": 0.4593, + "mean_token_accuracy": 0.8252534866333008, + "step": 333 + }, + { + "epoch": 1.050314465408805, + "grad_norm": 0.1977737993001938, + "learning_rate": 3.947357186650403e-05, + "loss": 0.4569, + "mean_token_accuracy": 0.826411247253418, + "step": 334 + }, + { + "epoch": 1.0534591194968554, + "grad_norm": 0.1900518238544464, + "learning_rate": 3.9469160725397426e-05, + "loss": 0.4448, + "mean_token_accuracy": 0.8280865550041199, + "step": 335 + }, + { + "epoch": 1.0566037735849056, + "grad_norm": 0.20230652391910553, + "learning_rate": 3.946473145657813e-05, + "loss": 0.461, + "mean_token_accuracy": 0.8244823813438416, + "step": 336 + }, + { + "epoch": 1.059748427672956, + "grad_norm": 0.21481932699680328, + "learning_rate": 3.946028406464236e-05, + "loss": 0.4429, + "mean_token_accuracy": 0.8295682668685913, + "step": 337 + }, + { + "epoch": 1.0628930817610063, + "grad_norm": 0.20926858484745026, + "learning_rate": 3.9455818554205186e-05, + "loss": 0.4472, + "mean_token_accuracy": 0.8260708451271057, + "step": 338 + }, + { + "epoch": 1.0660377358490567, + "grad_norm": 0.19324615597724915, + "learning_rate": 3.945133492990045e-05, + "loss": 0.4695, + "mean_token_accuracy": 0.8230428099632263, + "step": 339 + }, + { + "epoch": 1.069182389937107, + "grad_norm": 0.22138728201389313, + "learning_rate": 3.944683319638081e-05, + "loss": 0.4436, + "mean_token_accuracy": 0.8289992213249207, + "step": 340 + }, + { + "epoch": 1.0723270440251573, + "grad_norm": 0.18856871128082275, + "learning_rate": 3.9442313358317694e-05, + "loss": 0.4393, + "mean_token_accuracy": 0.8271635174751282, + "step": 341 + }, + { + "epoch": 1.0754716981132075, + "grad_norm": 0.19933389127254486, + "learning_rate": 3.943777542040134e-05, + "loss": 0.4654, + "mean_token_accuracy": 0.8237259387969971, + "step": 342 + }, + { + "epoch": 1.078616352201258, + "grad_norm": 0.24634775519371033, + "learning_rate": 3.943321938734074e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.829694926738739, + "step": 343 + }, + { + "epoch": 1.0817610062893082, + "grad_norm": 0.19513890147209167, + "learning_rate": 3.942864526386369e-05, + "loss": 0.4558, + "mean_token_accuracy": 0.825875997543335, + "step": 344 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 0.23661689460277557, + "learning_rate": 3.9424053054716755e-05, + "loss": 0.4544, + "mean_token_accuracy": 0.8247132301330566, + "step": 345 + }, + { + "epoch": 1.0880503144654088, + "grad_norm": 0.23907917737960815, + "learning_rate": 3.941944276466526e-05, + "loss": 0.464, + "mean_token_accuracy": 0.8243969082832336, + "step": 346 + }, + { + "epoch": 1.0911949685534592, + "grad_norm": 0.21444515883922577, + "learning_rate": 3.941481439849328e-05, + "loss": 0.4479, + "mean_token_accuracy": 0.8274782299995422, + "step": 347 + }, + { + "epoch": 1.0943396226415094, + "grad_norm": 0.23370276391506195, + "learning_rate": 3.941016796100368e-05, + "loss": 0.4586, + "mean_token_accuracy": 0.8257689476013184, + "step": 348 + }, + { + "epoch": 1.0974842767295598, + "grad_norm": 0.24292296171188354, + "learning_rate": 3.9405503457018045e-05, + "loss": 0.4514, + "mean_token_accuracy": 0.826076865196228, + "step": 349 + }, + { + "epoch": 1.10062893081761, + "grad_norm": 0.2037438154220581, + "learning_rate": 3.940082089137673e-05, + "loss": 0.4601, + "mean_token_accuracy": 0.8253549933433533, + "step": 350 + }, + { + "epoch": 1.1037735849056605, + "grad_norm": 0.2162904292345047, + "learning_rate": 3.939612026893881e-05, + "loss": 0.4383, + "mean_token_accuracy": 0.8296216726303101, + "step": 351 + }, + { + "epoch": 1.1069182389937107, + "grad_norm": 0.254822313785553, + "learning_rate": 3.939140159458213e-05, + "loss": 0.4668, + "mean_token_accuracy": 0.8235269784927368, + "step": 352 + }, + { + "epoch": 1.110062893081761, + "grad_norm": 0.1879785656929016, + "learning_rate": 3.938666487320323e-05, + "loss": 0.4435, + "mean_token_accuracy": 0.8316510915756226, + "step": 353 + }, + { + "epoch": 1.1132075471698113, + "grad_norm": 0.19271206855773926, + "learning_rate": 3.9381910109717415e-05, + "loss": 0.4598, + "mean_token_accuracy": 0.823648989200592, + "step": 354 + }, + { + "epoch": 1.1163522012578617, + "grad_norm": 0.22165369987487793, + "learning_rate": 3.937713730905868e-05, + "loss": 0.4611, + "mean_token_accuracy": 0.8243672847747803, + "step": 355 + }, + { + "epoch": 1.119496855345912, + "grad_norm": 0.17029280960559845, + "learning_rate": 3.937234647617975e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.8255760073661804, + "step": 356 + }, + { + "epoch": 1.1226415094339623, + "grad_norm": 0.21115252375602722, + "learning_rate": 3.936753761605208e-05, + "loss": 0.4593, + "mean_token_accuracy": 0.8264986276626587, + "step": 357 + }, + { + "epoch": 1.1257861635220126, + "grad_norm": 0.19516626000404358, + "learning_rate": 3.936271073366579e-05, + "loss": 0.4509, + "mean_token_accuracy": 0.8269539475440979, + "step": 358 + }, + { + "epoch": 1.128930817610063, + "grad_norm": 0.1787671595811844, + "learning_rate": 3.935786583402975e-05, + "loss": 0.4511, + "mean_token_accuracy": 0.8269217014312744, + "step": 359 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 0.19312171638011932, + "learning_rate": 3.935300292217148e-05, + "loss": 0.4382, + "mean_token_accuracy": 0.8278734087944031, + "step": 360 + }, + { + "epoch": 1.1352201257861636, + "grad_norm": 0.1753867268562317, + "learning_rate": 3.9348122003137224e-05, + "loss": 0.4494, + "mean_token_accuracy": 0.8283815979957581, + "step": 361 + }, + { + "epoch": 1.1383647798742138, + "grad_norm": 0.17436008155345917, + "learning_rate": 3.9343223081991904e-05, + "loss": 0.4375, + "mean_token_accuracy": 0.8322175145149231, + "step": 362 + }, + { + "epoch": 1.1415094339622642, + "grad_norm": 0.17123448848724365, + "learning_rate": 3.933830616381912e-05, + "loss": 0.4603, + "mean_token_accuracy": 0.8241879940032959, + "step": 363 + }, + { + "epoch": 1.1446540880503144, + "grad_norm": 0.17863644659519196, + "learning_rate": 3.933337125372115e-05, + "loss": 0.443, + "mean_token_accuracy": 0.8289136290550232, + "step": 364 + }, + { + "epoch": 1.1477987421383649, + "grad_norm": 0.17425286769866943, + "learning_rate": 3.932841835681893e-05, + "loss": 0.4463, + "mean_token_accuracy": 0.8275046348571777, + "step": 365 + }, + { + "epoch": 1.150943396226415, + "grad_norm": 0.18034741282463074, + "learning_rate": 3.932344747825208e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.8258575201034546, + "step": 366 + }, + { + "epoch": 1.1540880503144655, + "grad_norm": 0.166525736451149, + "learning_rate": 3.931845862317887e-05, + "loss": 0.4585, + "mean_token_accuracy": 0.8256432414054871, + "step": 367 + }, + { + "epoch": 1.1572327044025157, + "grad_norm": 0.1865280121564865, + "learning_rate": 3.931345179677622e-05, + "loss": 0.4384, + "mean_token_accuracy": 0.8310089111328125, + "step": 368 + }, + { + "epoch": 1.1603773584905661, + "grad_norm": 0.15174388885498047, + "learning_rate": 3.930842700423971e-05, + "loss": 0.4501, + "mean_token_accuracy": 0.8273698091506958, + "step": 369 + }, + { + "epoch": 1.1635220125786163, + "grad_norm": 0.19328919053077698, + "learning_rate": 3.9303384250783555e-05, + "loss": 0.4473, + "mean_token_accuracy": 0.8262863755226135, + "step": 370 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.19845743477344513, + "learning_rate": 3.929832354164061e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.8262023329734802, + "step": 371 + }, + { + "epoch": 1.169811320754717, + "grad_norm": 0.18804894387722015, + "learning_rate": 3.929324488206235e-05, + "loss": 0.4416, + "mean_token_accuracy": 0.8311719298362732, + "step": 372 + }, + { + "epoch": 1.1729559748427674, + "grad_norm": 0.2016197293996811, + "learning_rate": 3.928814827731892e-05, + "loss": 0.4496, + "mean_token_accuracy": 0.8284710049629211, + "step": 373 + }, + { + "epoch": 1.1761006289308176, + "grad_norm": 0.1766403168439865, + "learning_rate": 3.928303373269903e-05, + "loss": 0.4613, + "mean_token_accuracy": 0.826279878616333, + "step": 374 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 0.19094939529895782, + "learning_rate": 3.927790125351004e-05, + "loss": 0.446, + "mean_token_accuracy": 0.8275184035301208, + "step": 375 + }, + { + "epoch": 1.1823899371069182, + "grad_norm": 0.18953315913677216, + "learning_rate": 3.927275084507791e-05, + "loss": 0.4448, + "mean_token_accuracy": 0.8282947540283203, + "step": 376 + }, + { + "epoch": 1.1855345911949686, + "grad_norm": 0.19597028195858002, + "learning_rate": 3.9267582512747214e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.825032114982605, + "step": 377 + }, + { + "epoch": 1.1886792452830188, + "grad_norm": 0.23007996380329132, + "learning_rate": 3.926239626188112e-05, + "loss": 0.4504, + "mean_token_accuracy": 0.8239325284957886, + "step": 378 + }, + { + "epoch": 1.1918238993710693, + "grad_norm": 0.15653976798057556, + "learning_rate": 3.925719209786139e-05, + "loss": 0.4599, + "mean_token_accuracy": 0.8246617317199707, + "step": 379 + }, + { + "epoch": 1.1949685534591195, + "grad_norm": 0.21206559240818024, + "learning_rate": 3.925197002608837e-05, + "loss": 0.4627, + "mean_token_accuracy": 0.8227834701538086, + "step": 380 + }, + { + "epoch": 1.1981132075471699, + "grad_norm": 0.15478071570396423, + "learning_rate": 3.924673005198099e-05, + "loss": 0.4491, + "mean_token_accuracy": 0.8279451727867126, + "step": 381 + }, + { + "epoch": 1.20125786163522, + "grad_norm": 0.19152425229549408, + "learning_rate": 3.924147218097678e-05, + "loss": 0.4454, + "mean_token_accuracy": 0.8282057046890259, + "step": 382 + }, + { + "epoch": 1.2044025157232705, + "grad_norm": 0.18045583367347717, + "learning_rate": 3.923619641853179e-05, + "loss": 0.4457, + "mean_token_accuracy": 0.8302314877510071, + "step": 383 + }, + { + "epoch": 1.2075471698113207, + "grad_norm": 0.17766040563583374, + "learning_rate": 3.9230902770120705e-05, + "loss": 0.4428, + "mean_token_accuracy": 0.8286284804344177, + "step": 384 + }, + { + "epoch": 1.2106918238993711, + "grad_norm": 0.21487727761268616, + "learning_rate": 3.9225591241236706e-05, + "loss": 0.4427, + "mean_token_accuracy": 0.829619824886322, + "step": 385 + }, + { + "epoch": 1.2138364779874213, + "grad_norm": 0.1838655024766922, + "learning_rate": 3.922026183739156e-05, + "loss": 0.4597, + "mean_token_accuracy": 0.823946475982666, + "step": 386 + }, + { + "epoch": 1.2169811320754718, + "grad_norm": 0.19127146899700165, + "learning_rate": 3.921491456411559e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8261735439300537, + "step": 387 + }, + { + "epoch": 1.220125786163522, + "grad_norm": 0.22842063009738922, + "learning_rate": 3.920954942695764e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8252214789390564, + "step": 388 + }, + { + "epoch": 1.2232704402515724, + "grad_norm": 0.15844713151454926, + "learning_rate": 3.9204166431485116e-05, + "loss": 0.4488, + "mean_token_accuracy": 0.8267233967781067, + "step": 389 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 0.19568133354187012, + "learning_rate": 3.9198765583283915e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8291775584220886, + "step": 390 + }, + { + "epoch": 1.229559748427673, + "grad_norm": 0.1693885773420334, + "learning_rate": 3.919334688795851e-05, + "loss": 0.4669, + "mean_token_accuracy": 0.8229762315750122, + "step": 391 + }, + { + "epoch": 1.2327044025157232, + "grad_norm": 0.18789900839328766, + "learning_rate": 3.918791035113187e-05, + "loss": 0.4546, + "mean_token_accuracy": 0.8253647685050964, + "step": 392 + }, + { + "epoch": 1.2358490566037736, + "grad_norm": 0.16432693600654602, + "learning_rate": 3.918245597844546e-05, + "loss": 0.4563, + "mean_token_accuracy": 0.8253912329673767, + "step": 393 + }, + { + "epoch": 1.2389937106918238, + "grad_norm": 0.1900831013917923, + "learning_rate": 3.9176983775559285e-05, + "loss": 0.4435, + "mean_token_accuracy": 0.8277944326400757, + "step": 394 + }, + { + "epoch": 1.2421383647798743, + "grad_norm": 0.17624549567699432, + "learning_rate": 3.9171493748151836e-05, + "loss": 0.4594, + "mean_token_accuracy": 0.8259245753288269, + "step": 395 + }, + { + "epoch": 1.2452830188679245, + "grad_norm": 0.1721552014350891, + "learning_rate": 3.91659859019201e-05, + "loss": 0.464, + "mean_token_accuracy": 0.8246681094169617, + "step": 396 + }, + { + "epoch": 1.248427672955975, + "grad_norm": 0.17541447281837463, + "learning_rate": 3.916046024257957e-05, + "loss": 0.4527, + "mean_token_accuracy": 0.8268889784812927, + "step": 397 + }, + { + "epoch": 1.251572327044025, + "grad_norm": 0.17931130528450012, + "learning_rate": 3.91549167758642e-05, + "loss": 0.4416, + "mean_token_accuracy": 0.8300866484642029, + "step": 398 + }, + { + "epoch": 1.2547169811320755, + "grad_norm": 0.17346803843975067, + "learning_rate": 3.914935550752643e-05, + "loss": 0.4532, + "mean_token_accuracy": 0.825555145740509, + "step": 399 + }, + { + "epoch": 1.2578616352201257, + "grad_norm": 0.17272527515888214, + "learning_rate": 3.91437764433372e-05, + "loss": 0.4552, + "mean_token_accuracy": 0.8265715837478638, + "step": 400 + }, + { + "epoch": 1.2610062893081762, + "grad_norm": 0.18587706983089447, + "learning_rate": 3.913817958908587e-05, + "loss": 0.4512, + "mean_token_accuracy": 0.8276833295822144, + "step": 401 + }, + { + "epoch": 1.2641509433962264, + "grad_norm": 0.1864849030971527, + "learning_rate": 3.9132564950580286e-05, + "loss": 0.4439, + "mean_token_accuracy": 0.8282519578933716, + "step": 402 + }, + { + "epoch": 1.2672955974842768, + "grad_norm": 0.19797006249427795, + "learning_rate": 3.9126932533646756e-05, + "loss": 0.4452, + "mean_token_accuracy": 0.8268986344337463, + "step": 403 + }, + { + "epoch": 1.270440251572327, + "grad_norm": 0.2109120786190033, + "learning_rate": 3.912128234413002e-05, + "loss": 0.4516, + "mean_token_accuracy": 0.8287456631660461, + "step": 404 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 0.20502309501171112, + "learning_rate": 3.9115614387893284e-05, + "loss": 0.4585, + "mean_token_accuracy": 0.8249683380126953, + "step": 405 + }, + { + "epoch": 1.2767295597484276, + "grad_norm": 0.195379376411438, + "learning_rate": 3.910992867081815e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8271308541297913, + "step": 406 + }, + { + "epoch": 1.279874213836478, + "grad_norm": 0.19706301391124725, + "learning_rate": 3.9104225198804697e-05, + "loss": 0.4538, + "mean_token_accuracy": 0.8262473344802856, + "step": 407 + }, + { + "epoch": 1.2830188679245282, + "grad_norm": 0.22555312514305115, + "learning_rate": 3.90985039777714e-05, + "loss": 0.4541, + "mean_token_accuracy": 0.8253132104873657, + "step": 408 + }, + { + "epoch": 1.2861635220125787, + "grad_norm": 0.1870194524526596, + "learning_rate": 3.909276501365515e-05, + "loss": 0.4496, + "mean_token_accuracy": 0.8271132111549377, + "step": 409 + }, + { + "epoch": 1.2893081761006289, + "grad_norm": 0.20130424201488495, + "learning_rate": 3.9087008312411266e-05, + "loss": 0.4508, + "mean_token_accuracy": 0.8274027705192566, + "step": 410 + }, + { + "epoch": 1.2924528301886793, + "grad_norm": 0.19089150428771973, + "learning_rate": 3.908123388001347e-05, + "loss": 0.4437, + "mean_token_accuracy": 0.8283129930496216, + "step": 411 + }, + { + "epoch": 1.2955974842767295, + "grad_norm": 0.18513773381710052, + "learning_rate": 3.907544172245386e-05, + "loss": 0.4512, + "mean_token_accuracy": 0.8256914615631104, + "step": 412 + }, + { + "epoch": 1.29874213836478, + "grad_norm": 0.2461722195148468, + "learning_rate": 3.906963184574297e-05, + "loss": 0.4398, + "mean_token_accuracy": 0.8291003108024597, + "step": 413 + }, + { + "epoch": 1.3018867924528301, + "grad_norm": 0.16066083312034607, + "learning_rate": 3.906380425590969e-05, + "loss": 0.4622, + "mean_token_accuracy": 0.8245102167129517, + "step": 414 + }, + { + "epoch": 1.3050314465408805, + "grad_norm": 0.23007777333259583, + "learning_rate": 3.905795895900129e-05, + "loss": 0.4508, + "mean_token_accuracy": 0.8270963430404663, + "step": 415 + }, + { + "epoch": 1.3081761006289307, + "grad_norm": 0.1997261345386505, + "learning_rate": 3.905209596108342e-05, + "loss": 0.4428, + "mean_token_accuracy": 0.8287739753723145, + "step": 416 + }, + { + "epoch": 1.3113207547169812, + "grad_norm": 0.19289380311965942, + "learning_rate": 3.904621526824011e-05, + "loss": 0.4494, + "mean_token_accuracy": 0.8278332948684692, + "step": 417 + }, + { + "epoch": 1.3144654088050314, + "grad_norm": 0.20662692189216614, + "learning_rate": 3.904031688657375e-05, + "loss": 0.4481, + "mean_token_accuracy": 0.828018844127655, + "step": 418 + }, + { + "epoch": 1.3176100628930818, + "grad_norm": 0.17998790740966797, + "learning_rate": 3.903440082220506e-05, + "loss": 0.4627, + "mean_token_accuracy": 0.8233857154846191, + "step": 419 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 0.18025898933410645, + "learning_rate": 3.902846708127315e-05, + "loss": 0.4435, + "mean_token_accuracy": 0.8296754956245422, + "step": 420 + }, + { + "epoch": 1.3238993710691824, + "grad_norm": 0.1748049110174179, + "learning_rate": 3.902251566993543e-05, + "loss": 0.4504, + "mean_token_accuracy": 0.8265761137008667, + "step": 421 + }, + { + "epoch": 1.3270440251572326, + "grad_norm": 0.1689056158065796, + "learning_rate": 3.901654659436768e-05, + "loss": 0.4475, + "mean_token_accuracy": 0.8279913067817688, + "step": 422 + }, + { + "epoch": 1.330188679245283, + "grad_norm": 0.18498939275741577, + "learning_rate": 3.901055986076399e-05, + "loss": 0.4612, + "mean_token_accuracy": 0.8243342638015747, + "step": 423 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.20584383606910706, + "learning_rate": 3.900455547533679e-05, + "loss": 0.4561, + "mean_token_accuracy": 0.8269695043563843, + "step": 424 + }, + { + "epoch": 1.3364779874213837, + "grad_norm": 0.19053617119789124, + "learning_rate": 3.899853344431681e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.8270792961120605, + "step": 425 + }, + { + "epoch": 1.3396226415094339, + "grad_norm": 0.23964080214500427, + "learning_rate": 3.8992493773953103e-05, + "loss": 0.444, + "mean_token_accuracy": 0.8272242546081543, + "step": 426 + }, + { + "epoch": 1.3427672955974843, + "grad_norm": 0.17868532240390778, + "learning_rate": 3.898643647051303e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8284726738929749, + "step": 427 + }, + { + "epoch": 1.3459119496855345, + "grad_norm": 0.2097913771867752, + "learning_rate": 3.8980361540282226e-05, + "loss": 0.449, + "mean_token_accuracy": 0.8262530565261841, + "step": 428 + }, + { + "epoch": 1.349056603773585, + "grad_norm": 0.1811225712299347, + "learning_rate": 3.8974268989564655e-05, + "loss": 0.4539, + "mean_token_accuracy": 0.8266646265983582, + "step": 429 + }, + { + "epoch": 1.3522012578616351, + "grad_norm": 0.1976926028728485, + "learning_rate": 3.896815882468253e-05, + "loss": 0.4537, + "mean_token_accuracy": 0.8261896967887878, + "step": 430 + }, + { + "epoch": 1.3553459119496856, + "grad_norm": 0.17476551234722137, + "learning_rate": 3.8962031051976356e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8286691904067993, + "step": 431 + }, + { + "epoch": 1.3584905660377358, + "grad_norm": 0.1752653270959854, + "learning_rate": 3.895588567780491e-05, + "loss": 0.4391, + "mean_token_accuracy": 0.830578088760376, + "step": 432 + }, + { + "epoch": 1.3616352201257862, + "grad_norm": 0.22124166786670685, + "learning_rate": 3.894972270854525e-05, + "loss": 0.4442, + "mean_token_accuracy": 0.8274651169776917, + "step": 433 + }, + { + "epoch": 1.3647798742138364, + "grad_norm": 0.16567222774028778, + "learning_rate": 3.894354215059265e-05, + "loss": 0.4586, + "mean_token_accuracy": 0.8255594968795776, + "step": 434 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 0.2491973489522934, + "learning_rate": 3.893734401036069e-05, + "loss": 0.4567, + "mean_token_accuracy": 0.825359046459198, + "step": 435 + }, + { + "epoch": 1.371069182389937, + "grad_norm": 0.16659407317638397, + "learning_rate": 3.8931128294281154e-05, + "loss": 0.4487, + "mean_token_accuracy": 0.8270349502563477, + "step": 436 + }, + { + "epoch": 1.3742138364779874, + "grad_norm": 0.21300457417964935, + "learning_rate": 3.892489500880408e-05, + "loss": 0.4584, + "mean_token_accuracy": 0.8247126936912537, + "step": 437 + }, + { + "epoch": 1.3773584905660377, + "grad_norm": 0.22345586121082306, + "learning_rate": 3.891864416039772e-05, + "loss": 0.4463, + "mean_token_accuracy": 0.8268424868583679, + "step": 438 + }, + { + "epoch": 1.380503144654088, + "grad_norm": 0.18328611552715302, + "learning_rate": 3.89123757555486e-05, + "loss": 0.441, + "mean_token_accuracy": 0.8294317126274109, + "step": 439 + }, + { + "epoch": 1.3836477987421385, + "grad_norm": 0.21173825860023499, + "learning_rate": 3.89060898007614e-05, + "loss": 0.4532, + "mean_token_accuracy": 0.826548159122467, + "step": 440 + }, + { + "epoch": 1.3867924528301887, + "grad_norm": 0.14746391773223877, + "learning_rate": 3.889978630255907e-05, + "loss": 0.4555, + "mean_token_accuracy": 0.826274037361145, + "step": 441 + }, + { + "epoch": 1.389937106918239, + "grad_norm": 0.2072405368089676, + "learning_rate": 3.8893465267482705e-05, + "loss": 0.4512, + "mean_token_accuracy": 0.8269615173339844, + "step": 442 + }, + { + "epoch": 1.3930817610062893, + "grad_norm": 0.18459253013134003, + "learning_rate": 3.8887126702091665e-05, + "loss": 0.4499, + "mean_token_accuracy": 0.8274534940719604, + "step": 443 + }, + { + "epoch": 1.3962264150943398, + "grad_norm": 0.1969769448041916, + "learning_rate": 3.8880770612963436e-05, + "loss": 0.4515, + "mean_token_accuracy": 0.8273787498474121, + "step": 444 + }, + { + "epoch": 1.39937106918239, + "grad_norm": 0.1519605964422226, + "learning_rate": 3.887439700669373e-05, + "loss": 0.4438, + "mean_token_accuracy": 0.8282424211502075, + "step": 445 + }, + { + "epoch": 1.4025157232704402, + "grad_norm": 0.2099246382713318, + "learning_rate": 3.8868005889896434e-05, + "loss": 0.4606, + "mean_token_accuracy": 0.8256029486656189, + "step": 446 + }, + { + "epoch": 1.4056603773584906, + "grad_norm": 0.16457189619541168, + "learning_rate": 3.886159726920359e-05, + "loss": 0.4593, + "mean_token_accuracy": 0.8245981335639954, + "step": 447 + }, + { + "epoch": 1.408805031446541, + "grad_norm": 0.2020101696252823, + "learning_rate": 3.8855171151265415e-05, + "loss": 0.4628, + "mean_token_accuracy": 0.8243246674537659, + "step": 448 + }, + { + "epoch": 1.4119496855345912, + "grad_norm": 0.1827785223722458, + "learning_rate": 3.884872754275027e-05, + "loss": 0.462, + "mean_token_accuracy": 0.8254520297050476, + "step": 449 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 0.1720219850540161, + "learning_rate": 3.8842266450344676e-05, + "loss": 0.4429, + "mean_token_accuracy": 0.8271845579147339, + "step": 450 + }, + { + "epoch": 1.4182389937106918, + "grad_norm": 0.1863139569759369, + "learning_rate": 3.8835787880753305e-05, + "loss": 0.4505, + "mean_token_accuracy": 0.8274635076522827, + "step": 451 + }, + { + "epoch": 1.4213836477987423, + "grad_norm": 0.16976246237754822, + "learning_rate": 3.882929184069894e-05, + "loss": 0.4666, + "mean_token_accuracy": 0.8241984844207764, + "step": 452 + }, + { + "epoch": 1.4245283018867925, + "grad_norm": 0.1592787802219391, + "learning_rate": 3.882277833692253e-05, + "loss": 0.4547, + "mean_token_accuracy": 0.8265584111213684, + "step": 453 + }, + { + "epoch": 1.4276729559748427, + "grad_norm": 0.18003453314304352, + "learning_rate": 3.8816247376183105e-05, + "loss": 0.456, + "mean_token_accuracy": 0.8237389922142029, + "step": 454 + }, + { + "epoch": 1.430817610062893, + "grad_norm": 0.15752705931663513, + "learning_rate": 3.880969896525784e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.8278123140335083, + "step": 455 + }, + { + "epoch": 1.4339622641509435, + "grad_norm": 0.15204060077667236, + "learning_rate": 3.8803133110942e-05, + "loss": 0.4504, + "mean_token_accuracy": 0.8270664215087891, + "step": 456 + }, + { + "epoch": 1.4371069182389937, + "grad_norm": 0.1571756899356842, + "learning_rate": 3.879654982004897e-05, + "loss": 0.4558, + "mean_token_accuracy": 0.8265605568885803, + "step": 457 + }, + { + "epoch": 1.440251572327044, + "grad_norm": 0.1584654152393341, + "learning_rate": 3.8789949099410206e-05, + "loss": 0.4518, + "mean_token_accuracy": 0.8270092606544495, + "step": 458 + }, + { + "epoch": 1.4433962264150944, + "grad_norm": 0.15152078866958618, + "learning_rate": 3.878333095587527e-05, + "loss": 0.4461, + "mean_token_accuracy": 0.8276041150093079, + "step": 459 + }, + { + "epoch": 1.4465408805031448, + "grad_norm": 0.15868431329727173, + "learning_rate": 3.87766953963118e-05, + "loss": 0.4438, + "mean_token_accuracy": 0.8289068937301636, + "step": 460 + }, + { + "epoch": 1.449685534591195, + "grad_norm": 0.14925162494182587, + "learning_rate": 3.8770042427605486e-05, + "loss": 0.4513, + "mean_token_accuracy": 0.8267138600349426, + "step": 461 + }, + { + "epoch": 1.4528301886792452, + "grad_norm": 0.15520182251930237, + "learning_rate": 3.876337205666011e-05, + "loss": 0.4627, + "mean_token_accuracy": 0.825510561466217, + "step": 462 + }, + { + "epoch": 1.4559748427672956, + "grad_norm": 0.15460386872291565, + "learning_rate": 3.875668429039751e-05, + "loss": 0.4524, + "mean_token_accuracy": 0.8280324935913086, + "step": 463 + }, + { + "epoch": 1.459119496855346, + "grad_norm": 0.1757625788450241, + "learning_rate": 3.8749979135757564e-05, + "loss": 0.4476, + "mean_token_accuracy": 0.8284898996353149, + "step": 464 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 0.17886289954185486, + "learning_rate": 3.874325659969819e-05, + "loss": 0.4525, + "mean_token_accuracy": 0.8280431628227234, + "step": 465 + }, + { + "epoch": 1.4654088050314464, + "grad_norm": 0.17093369364738464, + "learning_rate": 3.873651668919535e-05, + "loss": 0.461, + "mean_token_accuracy": 0.8263329267501831, + "step": 466 + }, + { + "epoch": 1.4685534591194969, + "grad_norm": 0.1534358263015747, + "learning_rate": 3.872975941124305e-05, + "loss": 0.4662, + "mean_token_accuracy": 0.8242008090019226, + "step": 467 + }, + { + "epoch": 1.4716981132075473, + "grad_norm": 0.16316460072994232, + "learning_rate": 3.8722984772853276e-05, + "loss": 0.4401, + "mean_token_accuracy": 0.8306291103363037, + "step": 468 + }, + { + "epoch": 1.4748427672955975, + "grad_norm": 0.14488621056079865, + "learning_rate": 3.8716192781056086e-05, + "loss": 0.4529, + "mean_token_accuracy": 0.8279455900192261, + "step": 469 + }, + { + "epoch": 1.4779874213836477, + "grad_norm": 0.15546061098575592, + "learning_rate": 3.870938344289951e-05, + "loss": 0.4524, + "mean_token_accuracy": 0.8272173404693604, + "step": 470 + }, + { + "epoch": 1.4811320754716981, + "grad_norm": 0.14502915740013123, + "learning_rate": 3.8702556765449564e-05, + "loss": 0.4365, + "mean_token_accuracy": 0.8309008479118347, + "step": 471 + }, + { + "epoch": 1.4842767295597485, + "grad_norm": 0.14981509745121002, + "learning_rate": 3.8695712755790296e-05, + "loss": 0.4553, + "mean_token_accuracy": 0.8265926241874695, + "step": 472 + }, + { + "epoch": 1.4874213836477987, + "grad_norm": 0.1419067680835724, + "learning_rate": 3.8688851421023724e-05, + "loss": 0.4467, + "mean_token_accuracy": 0.8291380405426025, + "step": 473 + }, + { + "epoch": 1.490566037735849, + "grad_norm": 0.14329911768436432, + "learning_rate": 3.868197276826983e-05, + "loss": 0.4404, + "mean_token_accuracy": 0.8298222422599792, + "step": 474 + }, + { + "epoch": 1.4937106918238994, + "grad_norm": 0.14086921513080597, + "learning_rate": 3.8675076804666574e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.8272147178649902, + "step": 475 + }, + { + "epoch": 1.4968553459119498, + "grad_norm": 0.13164976239204407, + "learning_rate": 3.86681635373699e-05, + "loss": 0.4543, + "mean_token_accuracy": 0.8258587121963501, + "step": 476 + }, + { + "epoch": 1.5, + "grad_norm": 0.14340752363204956, + "learning_rate": 3.866123297355368e-05, + "loss": 0.4547, + "mean_token_accuracy": 0.8274818062782288, + "step": 477 + }, + { + "epoch": 1.5031446540880502, + "grad_norm": 0.15563100576400757, + "learning_rate": 3.865428512040975e-05, + "loss": 0.4546, + "mean_token_accuracy": 0.8245983123779297, + "step": 478 + }, + { + "epoch": 1.5062893081761006, + "grad_norm": 0.15186692774295807, + "learning_rate": 3.864731998514788e-05, + "loss": 0.4471, + "mean_token_accuracy": 0.8284518718719482, + "step": 479 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 0.1551363319158554, + "learning_rate": 3.864033757499578e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8288803696632385, + "step": 480 + }, + { + "epoch": 1.5125786163522013, + "grad_norm": 0.19744457304477692, + "learning_rate": 3.863333789719908e-05, + "loss": 0.4459, + "mean_token_accuracy": 0.8297019600868225, + "step": 481 + }, + { + "epoch": 1.5157232704402515, + "grad_norm": 0.1606452614068985, + "learning_rate": 3.8626320959021336e-05, + "loss": 0.4574, + "mean_token_accuracy": 0.8262441754341125, + "step": 482 + }, + { + "epoch": 1.5188679245283019, + "grad_norm": 0.14951243996620178, + "learning_rate": 3.8619286767744e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.8295007348060608, + "step": 483 + }, + { + "epoch": 1.5220125786163523, + "grad_norm": 0.1706402450799942, + "learning_rate": 3.8612235330666455e-05, + "loss": 0.4522, + "mean_token_accuracy": 0.8276010751724243, + "step": 484 + }, + { + "epoch": 1.5251572327044025, + "grad_norm": 0.18848121166229248, + "learning_rate": 3.860516665510595e-05, + "loss": 0.4478, + "mean_token_accuracy": 0.8272507190704346, + "step": 485 + }, + { + "epoch": 1.5283018867924527, + "grad_norm": 0.18110370635986328, + "learning_rate": 3.859808074839764e-05, + "loss": 0.4633, + "mean_token_accuracy": 0.8253755569458008, + "step": 486 + }, + { + "epoch": 1.5314465408805031, + "grad_norm": 0.15408791601657867, + "learning_rate": 3.859097761789455e-05, + "loss": 0.4608, + "mean_token_accuracy": 0.8244016766548157, + "step": 487 + }, + { + "epoch": 1.5345911949685536, + "grad_norm": 0.209492027759552, + "learning_rate": 3.858385727096759e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.8256571292877197, + "step": 488 + }, + { + "epoch": 1.5377358490566038, + "grad_norm": 0.13831724226474762, + "learning_rate": 3.8576719715005534e-05, + "loss": 0.4471, + "mean_token_accuracy": 0.8289450407028198, + "step": 489 + }, + { + "epoch": 1.540880503144654, + "grad_norm": 0.1807355135679245, + "learning_rate": 3.856956495741501e-05, + "loss": 0.457, + "mean_token_accuracy": 0.8249189853668213, + "step": 490 + }, + { + "epoch": 1.5440251572327044, + "grad_norm": 0.14947283267974854, + "learning_rate": 3.856239300562047e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.827312171459198, + "step": 491 + }, + { + "epoch": 1.5471698113207548, + "grad_norm": 0.1528450846672058, + "learning_rate": 3.855520386706427e-05, + "loss": 0.4469, + "mean_token_accuracy": 0.8281989097595215, + "step": 492 + }, + { + "epoch": 1.550314465408805, + "grad_norm": 0.1903686374425888, + "learning_rate": 3.854799754920654e-05, + "loss": 0.4517, + "mean_token_accuracy": 0.8267768025398254, + "step": 493 + }, + { + "epoch": 1.5534591194968552, + "grad_norm": 0.14876671135425568, + "learning_rate": 3.854077405952527e-05, + "loss": 0.4471, + "mean_token_accuracy": 0.8272234797477722, + "step": 494 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 0.15654216706752777, + "learning_rate": 3.853353340551626e-05, + "loss": 0.4426, + "mean_token_accuracy": 0.8292421698570251, + "step": 495 + }, + { + "epoch": 1.559748427672956, + "grad_norm": 0.15252597630023956, + "learning_rate": 3.852627559469313e-05, + "loss": 0.4466, + "mean_token_accuracy": 0.828421413898468, + "step": 496 + }, + { + "epoch": 1.5628930817610063, + "grad_norm": 0.15407709777355194, + "learning_rate": 3.8519000634587274e-05, + "loss": 0.4522, + "mean_token_accuracy": 0.8288334608078003, + "step": 497 + }, + { + "epoch": 1.5660377358490565, + "grad_norm": 0.15139752626419067, + "learning_rate": 3.851170853274793e-05, + "loss": 0.4591, + "mean_token_accuracy": 0.8242106437683105, + "step": 498 + }, + { + "epoch": 1.569182389937107, + "grad_norm": 0.16278305649757385, + "learning_rate": 3.8504399296742076e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.8283582329750061, + "step": 499 + }, + { + "epoch": 1.5723270440251573, + "grad_norm": 0.21534794569015503, + "learning_rate": 3.84970729341545e-05, + "loss": 0.4464, + "mean_token_accuracy": 0.8293619155883789, + "step": 500 + }, + { + "epoch": 1.5754716981132075, + "grad_norm": 0.16945937275886536, + "learning_rate": 3.848972945258776e-05, + "loss": 0.4366, + "mean_token_accuracy": 0.8301461338996887, + "step": 501 + }, + { + "epoch": 1.5786163522012577, + "grad_norm": 0.19320879876613617, + "learning_rate": 3.8482368859662156e-05, + "loss": 0.4546, + "mean_token_accuracy": 0.8255838751792908, + "step": 502 + }, + { + "epoch": 1.5817610062893082, + "grad_norm": 0.16316726803779602, + "learning_rate": 3.847499116301577e-05, + "loss": 0.4535, + "mean_token_accuracy": 0.8292089104652405, + "step": 503 + }, + { + "epoch": 1.5849056603773586, + "grad_norm": 0.20331525802612305, + "learning_rate": 3.846759637030443e-05, + "loss": 0.4455, + "mean_token_accuracy": 0.8304234743118286, + "step": 504 + }, + { + "epoch": 1.5880503144654088, + "grad_norm": 0.14327698945999146, + "learning_rate": 3.846018448920168e-05, + "loss": 0.4388, + "mean_token_accuracy": 0.8320124745368958, + "step": 505 + }, + { + "epoch": 1.591194968553459, + "grad_norm": 0.1536184698343277, + "learning_rate": 3.845275552739883e-05, + "loss": 0.4611, + "mean_token_accuracy": 0.8233290910720825, + "step": 506 + }, + { + "epoch": 1.5943396226415094, + "grad_norm": 0.17109455168247223, + "learning_rate": 3.844530949260489e-05, + "loss": 0.4581, + "mean_token_accuracy": 0.8267108798027039, + "step": 507 + }, + { + "epoch": 1.5974842767295598, + "grad_norm": 0.1569495052099228, + "learning_rate": 3.8437846392546603e-05, + "loss": 0.454, + "mean_token_accuracy": 0.8257220387458801, + "step": 508 + }, + { + "epoch": 1.60062893081761, + "grad_norm": 0.15163108706474304, + "learning_rate": 3.84303662349684e-05, + "loss": 0.4459, + "mean_token_accuracy": 0.8274716734886169, + "step": 509 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 0.15992477536201477, + "learning_rate": 3.842286902763245e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8289050459861755, + "step": 510 + }, + { + "epoch": 1.6069182389937107, + "grad_norm": 0.16461218893527985, + "learning_rate": 3.841535477831855e-05, + "loss": 0.4523, + "mean_token_accuracy": 0.8275653123855591, + "step": 511 + }, + { + "epoch": 1.610062893081761, + "grad_norm": 0.15126198530197144, + "learning_rate": 3.840782349482426e-05, + "loss": 0.4442, + "mean_token_accuracy": 0.8301685452461243, + "step": 512 + }, + { + "epoch": 1.6132075471698113, + "grad_norm": 0.19573630392551422, + "learning_rate": 3.840027518496475e-05, + "loss": 0.4575, + "mean_token_accuracy": 0.8253160715103149, + "step": 513 + }, + { + "epoch": 1.6163522012578615, + "grad_norm": 0.16733072698116302, + "learning_rate": 3.8392709856572904e-05, + "loss": 0.4481, + "mean_token_accuracy": 0.8281794190406799, + "step": 514 + }, + { + "epoch": 1.619496855345912, + "grad_norm": 0.1560571938753128, + "learning_rate": 3.838512751749924e-05, + "loss": 0.4472, + "mean_token_accuracy": 0.8281599879264832, + "step": 515 + }, + { + "epoch": 1.6226415094339623, + "grad_norm": 0.17163477838039398, + "learning_rate": 3.837752817561194e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8275222778320312, + "step": 516 + }, + { + "epoch": 1.6257861635220126, + "grad_norm": 0.16268257796764374, + "learning_rate": 3.8369911838796816e-05, + "loss": 0.4483, + "mean_token_accuracy": 0.8267813920974731, + "step": 517 + }, + { + "epoch": 1.6289308176100628, + "grad_norm": 0.16738560795783997, + "learning_rate": 3.8362278514957336e-05, + "loss": 0.4375, + "mean_token_accuracy": 0.8297300934791565, + "step": 518 + }, + { + "epoch": 1.6320754716981132, + "grad_norm": 0.16072408854961395, + "learning_rate": 3.8354628212014587e-05, + "loss": 0.4417, + "mean_token_accuracy": 0.8280481100082397, + "step": 519 + }, + { + "epoch": 1.6352201257861636, + "grad_norm": 0.17322075366973877, + "learning_rate": 3.8346960937907264e-05, + "loss": 0.4569, + "mean_token_accuracy": 0.8253117799758911, + "step": 520 + }, + { + "epoch": 1.6383647798742138, + "grad_norm": 0.1540379822254181, + "learning_rate": 3.833927670059168e-05, + "loss": 0.4352, + "mean_token_accuracy": 0.8304820656776428, + "step": 521 + }, + { + "epoch": 1.641509433962264, + "grad_norm": 0.20093458890914917, + "learning_rate": 3.833157550804176e-05, + "loss": 0.4502, + "mean_token_accuracy": 0.8276907801628113, + "step": 522 + }, + { + "epoch": 1.6446540880503144, + "grad_norm": 0.17706024646759033, + "learning_rate": 3.8323857368249014e-05, + "loss": 0.4472, + "mean_token_accuracy": 0.8285207748413086, + "step": 523 + }, + { + "epoch": 1.6477987421383649, + "grad_norm": 0.2173389196395874, + "learning_rate": 3.8316122289222535e-05, + "loss": 0.4482, + "mean_token_accuracy": 0.8268563747406006, + "step": 524 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 0.14905473589897156, + "learning_rate": 3.8308370278989e-05, + "loss": 0.445, + "mean_token_accuracy": 0.8302363157272339, + "step": 525 + }, + { + "epoch": 1.6540880503144653, + "grad_norm": 0.1895207017660141, + "learning_rate": 3.8300601345592675e-05, + "loss": 0.455, + "mean_token_accuracy": 0.8297334909439087, + "step": 526 + }, + { + "epoch": 1.6572327044025157, + "grad_norm": 0.190410777926445, + "learning_rate": 3.829281549709533e-05, + "loss": 0.4456, + "mean_token_accuracy": 0.8273657560348511, + "step": 527 + }, + { + "epoch": 1.6603773584905661, + "grad_norm": 0.15508361160755157, + "learning_rate": 3.828501274157635e-05, + "loss": 0.4444, + "mean_token_accuracy": 0.8293198347091675, + "step": 528 + }, + { + "epoch": 1.6635220125786163, + "grad_norm": 0.17064864933490753, + "learning_rate": 3.8277193087132634e-05, + "loss": 0.4534, + "mean_token_accuracy": 0.8277645707130432, + "step": 529 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.18299253284931183, + "learning_rate": 3.826935654187861e-05, + "loss": 0.4529, + "mean_token_accuracy": 0.8259576559066772, + "step": 530 + }, + { + "epoch": 1.669811320754717, + "grad_norm": 0.14436650276184082, + "learning_rate": 3.826150311394627e-05, + "loss": 0.4424, + "mean_token_accuracy": 0.8301630020141602, + "step": 531 + }, + { + "epoch": 1.6729559748427674, + "grad_norm": 0.16234935820102692, + "learning_rate": 3.825363281148507e-05, + "loss": 0.4414, + "mean_token_accuracy": 0.8282803297042847, + "step": 532 + }, + { + "epoch": 1.6761006289308176, + "grad_norm": 0.1424507051706314, + "learning_rate": 3.8245745642662025e-05, + "loss": 0.45, + "mean_token_accuracy": 0.8285747170448303, + "step": 533 + }, + { + "epoch": 1.6792452830188678, + "grad_norm": 0.17243961989879608, + "learning_rate": 3.8237841615661636e-05, + "loss": 0.4519, + "mean_token_accuracy": 0.8274285197257996, + "step": 534 + }, + { + "epoch": 1.6823899371069182, + "grad_norm": 0.15060077607631683, + "learning_rate": 3.8229920738685886e-05, + "loss": 0.4485, + "mean_token_accuracy": 0.827806293964386, + "step": 535 + }, + { + "epoch": 1.6855345911949686, + "grad_norm": 0.16962994635105133, + "learning_rate": 3.8221983019954254e-05, + "loss": 0.4526, + "mean_token_accuracy": 0.8264437317848206, + "step": 536 + }, + { + "epoch": 1.6886792452830188, + "grad_norm": 0.16717560589313507, + "learning_rate": 3.82140284677037e-05, + "loss": 0.4453, + "mean_token_accuracy": 0.8285322785377502, + "step": 537 + }, + { + "epoch": 1.691823899371069, + "grad_norm": 0.15772481262683868, + "learning_rate": 3.820605709018865e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8271482586860657, + "step": 538 + }, + { + "epoch": 1.6949685534591195, + "grad_norm": 0.17806661128997803, + "learning_rate": 3.819806889568098e-05, + "loss": 0.4482, + "mean_token_accuracy": 0.8278267979621887, + "step": 539 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 0.18837103247642517, + "learning_rate": 3.819006389247002e-05, + "loss": 0.4491, + "mean_token_accuracy": 0.8294506669044495, + "step": 540 + }, + { + "epoch": 1.70125786163522, + "grad_norm": 0.18263404071331024, + "learning_rate": 3.8182042088862555e-05, + "loss": 0.4549, + "mean_token_accuracy": 0.8264862895011902, + "step": 541 + }, + { + "epoch": 1.7044025157232703, + "grad_norm": 0.17201568186283112, + "learning_rate": 3.8174003493182784e-05, + "loss": 0.4479, + "mean_token_accuracy": 0.826499879360199, + "step": 542 + }, + { + "epoch": 1.7075471698113207, + "grad_norm": 0.2106884866952896, + "learning_rate": 3.816594811377235e-05, + "loss": 0.448, + "mean_token_accuracy": 0.8292680382728577, + "step": 543 + }, + { + "epoch": 1.7106918238993711, + "grad_norm": 0.17805208265781403, + "learning_rate": 3.81578759589903e-05, + "loss": 0.451, + "mean_token_accuracy": 0.8258050680160522, + "step": 544 + }, + { + "epoch": 1.7138364779874213, + "grad_norm": 0.20148631930351257, + "learning_rate": 3.814978703721309e-05, + "loss": 0.4491, + "mean_token_accuracy": 0.8281079530715942, + "step": 545 + }, + { + "epoch": 1.7169811320754715, + "grad_norm": 0.2121734768152237, + "learning_rate": 3.8141681356834587e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.8274951577186584, + "step": 546 + }, + { + "epoch": 1.720125786163522, + "grad_norm": 0.16927511990070343, + "learning_rate": 3.813355892626603e-05, + "loss": 0.4358, + "mean_token_accuracy": 0.8312507271766663, + "step": 547 + }, + { + "epoch": 1.7232704402515724, + "grad_norm": 0.2374507337808609, + "learning_rate": 3.8125419753936055e-05, + "loss": 0.4433, + "mean_token_accuracy": 0.8288567662239075, + "step": 548 + }, + { + "epoch": 1.7264150943396226, + "grad_norm": 0.14739219844341278, + "learning_rate": 3.8117263848290656e-05, + "loss": 0.4432, + "mean_token_accuracy": 0.8277235627174377, + "step": 549 + }, + { + "epoch": 1.7295597484276728, + "grad_norm": 0.22958379983901978, + "learning_rate": 3.810909121779321e-05, + "loss": 0.4522, + "mean_token_accuracy": 0.8266805410385132, + "step": 550 + }, + { + "epoch": 1.7327044025157232, + "grad_norm": 0.1478036642074585, + "learning_rate": 3.810090187092443e-05, + "loss": 0.4362, + "mean_token_accuracy": 0.8309704065322876, + "step": 551 + }, + { + "epoch": 1.7358490566037736, + "grad_norm": 0.2060670554637909, + "learning_rate": 3.809269581618239e-05, + "loss": 0.4463, + "mean_token_accuracy": 0.8310072422027588, + "step": 552 + }, + { + "epoch": 1.7389937106918238, + "grad_norm": 0.157624289393425, + "learning_rate": 3.80844730620825e-05, + "loss": 0.4489, + "mean_token_accuracy": 0.82773357629776, + "step": 553 + }, + { + "epoch": 1.742138364779874, + "grad_norm": 0.18612465262413025, + "learning_rate": 3.8076233617157486e-05, + "loss": 0.4506, + "mean_token_accuracy": 0.8275358080863953, + "step": 554 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 0.15709009766578674, + "learning_rate": 3.806797748995741e-05, + "loss": 0.452, + "mean_token_accuracy": 0.829462468624115, + "step": 555 + }, + { + "epoch": 1.748427672955975, + "grad_norm": 0.17328409850597382, + "learning_rate": 3.805970468904964e-05, + "loss": 0.4439, + "mean_token_accuracy": 0.8290309906005859, + "step": 556 + }, + { + "epoch": 1.751572327044025, + "grad_norm": 0.17372116446495056, + "learning_rate": 3.805141522301884e-05, + "loss": 0.4524, + "mean_token_accuracy": 0.8271963000297546, + "step": 557 + }, + { + "epoch": 1.7547169811320755, + "grad_norm": 0.161929190158844, + "learning_rate": 3.804310910046697e-05, + "loss": 0.4399, + "mean_token_accuracy": 0.8296563029289246, + "step": 558 + }, + { + "epoch": 1.757861635220126, + "grad_norm": 0.1848653256893158, + "learning_rate": 3.803478633001328e-05, + "loss": 0.4576, + "mean_token_accuracy": 0.8277556896209717, + "step": 559 + }, + { + "epoch": 1.7610062893081762, + "grad_norm": 0.1677953600883484, + "learning_rate": 3.8026446920294295e-05, + "loss": 0.4554, + "mean_token_accuracy": 0.8266874551773071, + "step": 560 + }, + { + "epoch": 1.7641509433962264, + "grad_norm": 0.15664350986480713, + "learning_rate": 3.80180908799638e-05, + "loss": 0.4499, + "mean_token_accuracy": 0.8271031379699707, + "step": 561 + }, + { + "epoch": 1.7672955974842768, + "grad_norm": 0.1716042309999466, + "learning_rate": 3.800971821769284e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.8271938562393188, + "step": 562 + }, + { + "epoch": 1.7704402515723272, + "grad_norm": 0.15244047343730927, + "learning_rate": 3.800132894216971e-05, + "loss": 0.4373, + "mean_token_accuracy": 0.8322244882583618, + "step": 563 + }, + { + "epoch": 1.7735849056603774, + "grad_norm": 0.1561097949743271, + "learning_rate": 3.799292306209995e-05, + "loss": 0.4342, + "mean_token_accuracy": 0.8296236395835876, + "step": 564 + }, + { + "epoch": 1.7767295597484276, + "grad_norm": 0.13890111446380615, + "learning_rate": 3.7984500586206324e-05, + "loss": 0.4552, + "mean_token_accuracy": 0.825447678565979, + "step": 565 + }, + { + "epoch": 1.779874213836478, + "grad_norm": 0.16650985181331635, + "learning_rate": 3.79760615232288e-05, + "loss": 0.4474, + "mean_token_accuracy": 0.8286757469177246, + "step": 566 + }, + { + "epoch": 1.7830188679245285, + "grad_norm": 0.14276254177093506, + "learning_rate": 3.7967605881924605e-05, + "loss": 0.4608, + "mean_token_accuracy": 0.8258764147758484, + "step": 567 + }, + { + "epoch": 1.7861635220125787, + "grad_norm": 0.13229092955589294, + "learning_rate": 3.795913367106812e-05, + "loss": 0.4489, + "mean_token_accuracy": 0.8294503688812256, + "step": 568 + }, + { + "epoch": 1.7893081761006289, + "grad_norm": 0.15833866596221924, + "learning_rate": 3.795064489945095e-05, + "loss": 0.4442, + "mean_token_accuracy": 0.829393744468689, + "step": 569 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 0.14272266626358032, + "learning_rate": 3.7942139575881875e-05, + "loss": 0.4488, + "mean_token_accuracy": 0.8297043442726135, + "step": 570 + }, + { + "epoch": 1.7955974842767297, + "grad_norm": 0.15338532626628876, + "learning_rate": 3.7933617709186845e-05, + "loss": 0.4637, + "mean_token_accuracy": 0.8241843581199646, + "step": 571 + }, + { + "epoch": 1.79874213836478, + "grad_norm": 0.16218815743923187, + "learning_rate": 3.7925079308209e-05, + "loss": 0.4581, + "mean_token_accuracy": 0.8253941535949707, + "step": 572 + }, + { + "epoch": 1.8018867924528301, + "grad_norm": 0.1723911464214325, + "learning_rate": 3.7916524381808606e-05, + "loss": 0.4485, + "mean_token_accuracy": 0.8298059701919556, + "step": 573 + }, + { + "epoch": 1.8050314465408805, + "grad_norm": 0.1315835565328598, + "learning_rate": 3.7907952938863095e-05, + "loss": 0.4409, + "mean_token_accuracy": 0.8307161331176758, + "step": 574 + }, + { + "epoch": 1.808176100628931, + "grad_norm": 0.1815989911556244, + "learning_rate": 3.7899364988267045e-05, + "loss": 0.4533, + "mean_token_accuracy": 0.8261933326721191, + "step": 575 + }, + { + "epoch": 1.8113207547169812, + "grad_norm": 0.13026003539562225, + "learning_rate": 3.789076053893214e-05, + "loss": 0.4425, + "mean_token_accuracy": 0.8303501009941101, + "step": 576 + }, + { + "epoch": 1.8144654088050314, + "grad_norm": 0.15672063827514648, + "learning_rate": 3.788213959978722e-05, + "loss": 0.4492, + "mean_token_accuracy": 0.8289505839347839, + "step": 577 + }, + { + "epoch": 1.8176100628930818, + "grad_norm": 0.1376017928123474, + "learning_rate": 3.7873502179778204e-05, + "loss": 0.4531, + "mean_token_accuracy": 0.8275585174560547, + "step": 578 + }, + { + "epoch": 1.8207547169811322, + "grad_norm": 0.16440635919570923, + "learning_rate": 3.786484828786812e-05, + "loss": 0.4357, + "mean_token_accuracy": 0.8302841782569885, + "step": 579 + }, + { + "epoch": 1.8238993710691824, + "grad_norm": 0.15636101365089417, + "learning_rate": 3.78561779330371e-05, + "loss": 0.4612, + "mean_token_accuracy": 0.8249790072441101, + "step": 580 + }, + { + "epoch": 1.8270440251572326, + "grad_norm": 0.14285589754581451, + "learning_rate": 3.7847491124282354e-05, + "loss": 0.4456, + "mean_token_accuracy": 0.828575611114502, + "step": 581 + }, + { + "epoch": 1.830188679245283, + "grad_norm": 0.16128715872764587, + "learning_rate": 3.783878787061817e-05, + "loss": 0.4499, + "mean_token_accuracy": 0.8283258676528931, + "step": 582 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.14683718979358673, + "learning_rate": 3.783006818107589e-05, + "loss": 0.4493, + "mean_token_accuracy": 0.8294415473937988, + "step": 583 + }, + { + "epoch": 1.8364779874213837, + "grad_norm": 0.17366930842399597, + "learning_rate": 3.782133206470392e-05, + "loss": 0.4569, + "mean_token_accuracy": 0.8255544900894165, + "step": 584 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 0.18530210852622986, + "learning_rate": 3.781257953056771e-05, + "loss": 0.4469, + "mean_token_accuracy": 0.8293200731277466, + "step": 585 + }, + { + "epoch": 1.8427672955974843, + "grad_norm": 0.1758362352848053, + "learning_rate": 3.780381058774975e-05, + "loss": 0.4444, + "mean_token_accuracy": 0.8289125561714172, + "step": 586 + }, + { + "epoch": 1.8459119496855347, + "grad_norm": 0.17143899202346802, + "learning_rate": 3.779502524534955e-05, + "loss": 0.4437, + "mean_token_accuracy": 0.8296340703964233, + "step": 587 + }, + { + "epoch": 1.849056603773585, + "grad_norm": 0.15893438458442688, + "learning_rate": 3.778622351248365e-05, + "loss": 0.461, + "mean_token_accuracy": 0.8271942734718323, + "step": 588 + }, + { + "epoch": 1.8522012578616351, + "grad_norm": 0.15634989738464355, + "learning_rate": 3.777740539828558e-05, + "loss": 0.4646, + "mean_token_accuracy": 0.8230259418487549, + "step": 589 + }, + { + "epoch": 1.8553459119496856, + "grad_norm": 0.15147466957569122, + "learning_rate": 3.776857091190588e-05, + "loss": 0.4628, + "mean_token_accuracy": 0.8259610533714294, + "step": 590 + }, + { + "epoch": 1.858490566037736, + "grad_norm": 0.1858360767364502, + "learning_rate": 3.775972006251209e-05, + "loss": 0.4608, + "mean_token_accuracy": 0.8260266184806824, + "step": 591 + }, + { + "epoch": 1.8616352201257862, + "grad_norm": 0.1573314070701599, + "learning_rate": 3.77508528592887e-05, + "loss": 0.4638, + "mean_token_accuracy": 0.8259332180023193, + "step": 592 + }, + { + "epoch": 1.8647798742138364, + "grad_norm": 0.1616523116827011, + "learning_rate": 3.7741969311437203e-05, + "loss": 0.4354, + "mean_token_accuracy": 0.8312195539474487, + "step": 593 + }, + { + "epoch": 1.8679245283018868, + "grad_norm": 0.161224365234375, + "learning_rate": 3.7733069428176044e-05, + "loss": 0.4606, + "mean_token_accuracy": 0.8260555267333984, + "step": 594 + }, + { + "epoch": 1.8710691823899372, + "grad_norm": 0.14999954402446747, + "learning_rate": 3.772415321874059e-05, + "loss": 0.4526, + "mean_token_accuracy": 0.829216480255127, + "step": 595 + }, + { + "epoch": 1.8742138364779874, + "grad_norm": 0.13627400994300842, + "learning_rate": 3.7715220692383206e-05, + "loss": 0.4451, + "mean_token_accuracy": 0.8285694718360901, + "step": 596 + }, + { + "epoch": 1.8773584905660377, + "grad_norm": 0.1767715960741043, + "learning_rate": 3.770627185837313e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.8299650549888611, + "step": 597 + }, + { + "epoch": 1.880503144654088, + "grad_norm": 0.17853030562400818, + "learning_rate": 3.769730672599655e-05, + "loss": 0.4455, + "mean_token_accuracy": 0.8283320665359497, + "step": 598 + }, + { + "epoch": 1.8836477987421385, + "grad_norm": 0.1425485908985138, + "learning_rate": 3.768832530455658e-05, + "loss": 0.4537, + "mean_token_accuracy": 0.8282998204231262, + "step": 599 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 0.14554312825202942, + "learning_rate": 3.7679327603373224e-05, + "loss": 0.4365, + "mean_token_accuracy": 0.830973744392395, + "step": 600 + }, + { + "epoch": 1.889937106918239, + "grad_norm": 0.14549976587295532, + "learning_rate": 3.767031363178336e-05, + "loss": 0.4476, + "mean_token_accuracy": 0.8286392092704773, + "step": 601 + }, + { + "epoch": 1.8930817610062893, + "grad_norm": 0.1506306231021881, + "learning_rate": 3.766128339914079e-05, + "loss": 0.4344, + "mean_token_accuracy": 0.8309949636459351, + "step": 602 + }, + { + "epoch": 1.8962264150943398, + "grad_norm": 0.18175862729549408, + "learning_rate": 3.765223691481615e-05, + "loss": 0.4414, + "mean_token_accuracy": 0.8330844044685364, + "step": 603 + }, + { + "epoch": 1.89937106918239, + "grad_norm": 0.14774861931800842, + "learning_rate": 3.764317418819696e-05, + "loss": 0.447, + "mean_token_accuracy": 0.8281767964363098, + "step": 604 + }, + { + "epoch": 1.9025157232704402, + "grad_norm": 0.2164774239063263, + "learning_rate": 3.7634095228687606e-05, + "loss": 0.449, + "mean_token_accuracy": 0.827027440071106, + "step": 605 + }, + { + "epoch": 1.9056603773584906, + "grad_norm": 0.1532975137233734, + "learning_rate": 3.762500004570929e-05, + "loss": 0.439, + "mean_token_accuracy": 0.8315529823303223, + "step": 606 + }, + { + "epoch": 1.908805031446541, + "grad_norm": 0.17712901532649994, + "learning_rate": 3.761588864870009e-05, + "loss": 0.4586, + "mean_token_accuracy": 0.8272659182548523, + "step": 607 + }, + { + "epoch": 1.9119496855345912, + "grad_norm": 0.1497354805469513, + "learning_rate": 3.760676104711485e-05, + "loss": 0.4487, + "mean_token_accuracy": 0.8286811709403992, + "step": 608 + }, + { + "epoch": 1.9150943396226414, + "grad_norm": 0.16662414371967316, + "learning_rate": 3.759761725042529e-05, + "loss": 0.4597, + "mean_token_accuracy": 0.8276802897453308, + "step": 609 + }, + { + "epoch": 1.9182389937106918, + "grad_norm": 0.17103828489780426, + "learning_rate": 3.75884572681199e-05, + "loss": 0.4465, + "mean_token_accuracy": 0.8278771638870239, + "step": 610 + }, + { + "epoch": 1.9213836477987423, + "grad_norm": 0.18412946164608002, + "learning_rate": 3.7579281109703976e-05, + "loss": 0.4396, + "mean_token_accuracy": 0.8283287882804871, + "step": 611 + }, + { + "epoch": 1.9245283018867925, + "grad_norm": 0.21794135868549347, + "learning_rate": 3.757008878469959e-05, + "loss": 0.4531, + "mean_token_accuracy": 0.8257644772529602, + "step": 612 + }, + { + "epoch": 1.9276729559748427, + "grad_norm": 0.16859892010688782, + "learning_rate": 3.756088030264562e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.8286134600639343, + "step": 613 + }, + { + "epoch": 1.930817610062893, + "grad_norm": 0.16001489758491516, + "learning_rate": 3.7551655673097664e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.8258501887321472, + "step": 614 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 0.16016210615634918, + "learning_rate": 3.7542414905628125e-05, + "loss": 0.4464, + "mean_token_accuracy": 0.8278330564498901, + "step": 615 + }, + { + "epoch": 1.9371069182389937, + "grad_norm": 0.16711488366127014, + "learning_rate": 3.753315800982611e-05, + "loss": 0.4529, + "mean_token_accuracy": 0.8275772333145142, + "step": 616 + }, + { + "epoch": 1.940251572327044, + "grad_norm": 0.14468665421009064, + "learning_rate": 3.75238849952975e-05, + "loss": 0.4498, + "mean_token_accuracy": 0.8276384472846985, + "step": 617 + }, + { + "epoch": 1.9433962264150944, + "grad_norm": 0.16502094268798828, + "learning_rate": 3.751459587166486e-05, + "loss": 0.4418, + "mean_token_accuracy": 0.8288312554359436, + "step": 618 + }, + { + "epoch": 1.9465408805031448, + "grad_norm": 0.15773820877075195, + "learning_rate": 3.750529064856752e-05, + "loss": 0.4467, + "mean_token_accuracy": 0.8294534683227539, + "step": 619 + }, + { + "epoch": 1.949685534591195, + "grad_norm": 0.15135358273983002, + "learning_rate": 3.749596933566146e-05, + "loss": 0.4547, + "mean_token_accuracy": 0.826626181602478, + "step": 620 + }, + { + "epoch": 1.9528301886792452, + "grad_norm": 0.14081326127052307, + "learning_rate": 3.748663194261942e-05, + "loss": 0.4491, + "mean_token_accuracy": 0.8284742832183838, + "step": 621 + }, + { + "epoch": 1.9559748427672956, + "grad_norm": 0.14700694382190704, + "learning_rate": 3.7477278479130774e-05, + "loss": 0.4505, + "mean_token_accuracy": 0.828678548336029, + "step": 622 + }, + { + "epoch": 1.959119496855346, + "grad_norm": 0.14334739744663239, + "learning_rate": 3.7467908954901607e-05, + "loss": 0.4515, + "mean_token_accuracy": 0.8276963829994202, + "step": 623 + }, + { + "epoch": 1.9622641509433962, + "grad_norm": 0.13543659448623657, + "learning_rate": 3.745852337965463e-05, + "loss": 0.4372, + "mean_token_accuracy": 0.8290265202522278, + "step": 624 + }, + { + "epoch": 1.9654088050314464, + "grad_norm": 0.13767287135124207, + "learning_rate": 3.744912176312926e-05, + "loss": 0.4536, + "mean_token_accuracy": 0.829569935798645, + "step": 625 + }, + { + "epoch": 1.9685534591194969, + "grad_norm": 0.14214669167995453, + "learning_rate": 3.743970411508153e-05, + "loss": 0.4492, + "mean_token_accuracy": 0.8299821019172668, + "step": 626 + }, + { + "epoch": 1.9716981132075473, + "grad_norm": 0.1446889191865921, + "learning_rate": 3.7430270445284113e-05, + "loss": 0.4507, + "mean_token_accuracy": 0.8302611112594604, + "step": 627 + }, + { + "epoch": 1.9748427672955975, + "grad_norm": 0.1604827344417572, + "learning_rate": 3.74208207635263e-05, + "loss": 0.4503, + "mean_token_accuracy": 0.8277794122695923, + "step": 628 + }, + { + "epoch": 1.9779874213836477, + "grad_norm": 0.15117383003234863, + "learning_rate": 3.741135507961402e-05, + "loss": 0.4459, + "mean_token_accuracy": 0.8283612132072449, + "step": 629 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 0.152221217751503, + "learning_rate": 3.740187340336978e-05, + "loss": 0.4501, + "mean_token_accuracy": 0.8274828791618347, + "step": 630 + }, + { + "epoch": 1.9842767295597485, + "grad_norm": 0.15416257083415985, + "learning_rate": 3.7392375744632705e-05, + "loss": 0.4314, + "mean_token_accuracy": 0.8337308764457703, + "step": 631 + }, + { + "epoch": 1.9874213836477987, + "grad_norm": 0.15964354574680328, + "learning_rate": 3.73828621132585e-05, + "loss": 0.4477, + "mean_token_accuracy": 0.8281906843185425, + "step": 632 + }, + { + "epoch": 1.990566037735849, + "grad_norm": 0.176462784409523, + "learning_rate": 3.737333251911943e-05, + "loss": 0.458, + "mean_token_accuracy": 0.8277231454849243, + "step": 633 + }, + { + "epoch": 1.9937106918238994, + "grad_norm": 0.14236986637115479, + "learning_rate": 3.736378697210435e-05, + "loss": 0.4433, + "mean_token_accuracy": 0.830397367477417, + "step": 634 + }, + { + "epoch": 1.9968553459119498, + "grad_norm": 0.20474541187286377, + "learning_rate": 3.735422548211865e-05, + "loss": 0.4443, + "mean_token_accuracy": 0.8304517269134521, + "step": 635 + }, + { + "epoch": 2.0, + "grad_norm": 0.1430058777332306, + "learning_rate": 3.7344648059084254e-05, + "loss": 0.4409, + "mean_token_accuracy": 0.8248613476753235, + "step": 636 + }, + { + "epoch": 2.00314465408805, + "grad_norm": 0.2259976714849472, + "learning_rate": 3.733505471293965e-05, + "loss": 0.3875, + "mean_token_accuracy": 0.8429651260375977, + "step": 637 + }, + { + "epoch": 2.006289308176101, + "grad_norm": 0.7415010929107666, + "learning_rate": 3.732544545363983e-05, + "loss": 0.396, + "mean_token_accuracy": 0.8418735861778259, + "step": 638 + }, + { + "epoch": 2.009433962264151, + "grad_norm": 0.20141229033470154, + "learning_rate": 3.73158202911563e-05, + "loss": 0.3975, + "mean_token_accuracy": 0.8419850468635559, + "step": 639 + }, + { + "epoch": 2.0125786163522013, + "grad_norm": 0.29176196455955505, + "learning_rate": 3.730617923547708e-05, + "loss": 0.38, + "mean_token_accuracy": 0.8438968062400818, + "step": 640 + }, + { + "epoch": 2.0157232704402515, + "grad_norm": 0.19412219524383545, + "learning_rate": 3.7296522296606663e-05, + "loss": 0.3857, + "mean_token_accuracy": 0.843908965587616, + "step": 641 + }, + { + "epoch": 2.018867924528302, + "grad_norm": 0.33550146222114563, + "learning_rate": 3.7286849484566046e-05, + "loss": 0.3931, + "mean_token_accuracy": 0.8432913422584534, + "step": 642 + }, + { + "epoch": 2.0220125786163523, + "grad_norm": 0.2596316635608673, + "learning_rate": 3.727716080939268e-05, + "loss": 0.3962, + "mean_token_accuracy": 0.841339111328125, + "step": 643 + }, + { + "epoch": 2.0251572327044025, + "grad_norm": 0.2917231321334839, + "learning_rate": 3.726745628114048e-05, + "loss": 0.3957, + "mean_token_accuracy": 0.8408085703849792, + "step": 644 + }, + { + "epoch": 2.0283018867924527, + "grad_norm": 0.22424006462097168, + "learning_rate": 3.725773590987982e-05, + "loss": 0.3824, + "mean_token_accuracy": 0.8433759808540344, + "step": 645 + }, + { + "epoch": 2.0314465408805034, + "grad_norm": 0.31289535760879517, + "learning_rate": 3.7247999705697515e-05, + "loss": 0.3867, + "mean_token_accuracy": 0.8449568152427673, + "step": 646 + }, + { + "epoch": 2.0345911949685536, + "grad_norm": 0.28442052006721497, + "learning_rate": 3.723824767869679e-05, + "loss": 0.4006, + "mean_token_accuracy": 0.8402043581008911, + "step": 647 + }, + { + "epoch": 2.0377358490566038, + "grad_norm": 0.282670795917511, + "learning_rate": 3.722847983899732e-05, + "loss": 0.388, + "mean_token_accuracy": 0.8409184217453003, + "step": 648 + }, + { + "epoch": 2.040880503144654, + "grad_norm": 0.26842135190963745, + "learning_rate": 3.7218696196735165e-05, + "loss": 0.3722, + "mean_token_accuracy": 0.8447781801223755, + "step": 649 + }, + { + "epoch": 2.0440251572327046, + "grad_norm": 0.22568707168102264, + "learning_rate": 3.720889676206279e-05, + "loss": 0.3783, + "mean_token_accuracy": 0.8424438834190369, + "step": 650 + }, + { + "epoch": 2.047169811320755, + "grad_norm": 0.23483210802078247, + "learning_rate": 3.719908154514904e-05, + "loss": 0.4051, + "mean_token_accuracy": 0.8413841128349304, + "step": 651 + }, + { + "epoch": 2.050314465408805, + "grad_norm": 0.19200192391872406, + "learning_rate": 3.7189250556179156e-05, + "loss": 0.391, + "mean_token_accuracy": 0.8443261384963989, + "step": 652 + }, + { + "epoch": 2.0534591194968552, + "grad_norm": 0.2197944074869156, + "learning_rate": 3.717940380535474e-05, + "loss": 0.3733, + "mean_token_accuracy": 0.8451419472694397, + "step": 653 + }, + { + "epoch": 2.056603773584906, + "grad_norm": 0.15395909547805786, + "learning_rate": 3.716954130289374e-05, + "loss": 0.396, + "mean_token_accuracy": 0.8405272364616394, + "step": 654 + }, + { + "epoch": 2.059748427672956, + "grad_norm": 0.2504603862762451, + "learning_rate": 3.7159663059030446e-05, + "loss": 0.3819, + "mean_token_accuracy": 0.8421579599380493, + "step": 655 + }, + { + "epoch": 2.0628930817610063, + "grad_norm": 0.16238710284233093, + "learning_rate": 3.7149769084015514e-05, + "loss": 0.3874, + "mean_token_accuracy": 0.8433874249458313, + "step": 656 + }, + { + "epoch": 2.0660377358490565, + "grad_norm": 0.25545817613601685, + "learning_rate": 3.713985938811588e-05, + "loss": 0.3916, + "mean_token_accuracy": 0.841235876083374, + "step": 657 + }, + { + "epoch": 2.069182389937107, + "grad_norm": 0.2000405341386795, + "learning_rate": 3.7129933981614816e-05, + "loss": 0.3717, + "mean_token_accuracy": 0.8454606533050537, + "step": 658 + }, + { + "epoch": 2.0723270440251573, + "grad_norm": 0.2362697422504425, + "learning_rate": 3.711999287481191e-05, + "loss": 0.3962, + "mean_token_accuracy": 0.840667724609375, + "step": 659 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 0.21723434329032898, + "learning_rate": 3.7110036078023024e-05, + "loss": 0.3906, + "mean_token_accuracy": 0.8431429266929626, + "step": 660 + }, + { + "epoch": 2.0786163522012577, + "grad_norm": 0.17892718315124512, + "learning_rate": 3.71000636015803e-05, + "loss": 0.3848, + "mean_token_accuracy": 0.8448073863983154, + "step": 661 + }, + { + "epoch": 2.0817610062893084, + "grad_norm": 0.18332098424434662, + "learning_rate": 3.709007545583217e-05, + "loss": 0.393, + "mean_token_accuracy": 0.8400642275810242, + "step": 662 + }, + { + "epoch": 2.0849056603773586, + "grad_norm": 0.1824181079864502, + "learning_rate": 3.708007165114329e-05, + "loss": 0.398, + "mean_token_accuracy": 0.8406155705451965, + "step": 663 + }, + { + "epoch": 2.088050314465409, + "grad_norm": 0.18380111455917358, + "learning_rate": 3.70700521978946e-05, + "loss": 0.3918, + "mean_token_accuracy": 0.8429200649261475, + "step": 664 + }, + { + "epoch": 2.091194968553459, + "grad_norm": 0.1803388148546219, + "learning_rate": 3.706001710648327e-05, + "loss": 0.4006, + "mean_token_accuracy": 0.8405814170837402, + "step": 665 + }, + { + "epoch": 2.0943396226415096, + "grad_norm": 0.17774419486522675, + "learning_rate": 3.7049966387322694e-05, + "loss": 0.3817, + "mean_token_accuracy": 0.842662513256073, + "step": 666 + }, + { + "epoch": 2.09748427672956, + "grad_norm": 0.19682133197784424, + "learning_rate": 3.703990005084248e-05, + "loss": 0.3837, + "mean_token_accuracy": 0.8443858027458191, + "step": 667 + }, + { + "epoch": 2.10062893081761, + "grad_norm": 0.19912737607955933, + "learning_rate": 3.702981810748844e-05, + "loss": 0.3818, + "mean_token_accuracy": 0.8457018136978149, + "step": 668 + }, + { + "epoch": 2.1037735849056602, + "grad_norm": 0.17018529772758484, + "learning_rate": 3.7019720567722595e-05, + "loss": 0.375, + "mean_token_accuracy": 0.8442913293838501, + "step": 669 + }, + { + "epoch": 2.106918238993711, + "grad_norm": 0.18541787564754486, + "learning_rate": 3.700960744202313e-05, + "loss": 0.3678, + "mean_token_accuracy": 0.8443611264228821, + "step": 670 + }, + { + "epoch": 2.110062893081761, + "grad_norm": 0.15378877520561218, + "learning_rate": 3.6999478740884435e-05, + "loss": 0.3852, + "mean_token_accuracy": 0.8438538312911987, + "step": 671 + }, + { + "epoch": 2.1132075471698113, + "grad_norm": 0.1910746991634369, + "learning_rate": 3.6989334474817026e-05, + "loss": 0.3812, + "mean_token_accuracy": 0.8418902158737183, + "step": 672 + }, + { + "epoch": 2.1163522012578615, + "grad_norm": 0.15623098611831665, + "learning_rate": 3.697917465434759e-05, + "loss": 0.3934, + "mean_token_accuracy": 0.8424828052520752, + "step": 673 + }, + { + "epoch": 2.119496855345912, + "grad_norm": 0.17167270183563232, + "learning_rate": 3.6968999290018953e-05, + "loss": 0.3905, + "mean_token_accuracy": 0.8438811898231506, + "step": 674 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 0.1417340338230133, + "learning_rate": 3.695880839239007e-05, + "loss": 0.3953, + "mean_token_accuracy": 0.8393605947494507, + "step": 675 + }, + { + "epoch": 2.1257861635220126, + "grad_norm": 0.19678044319152832, + "learning_rate": 3.694860197203602e-05, + "loss": 0.3838, + "mean_token_accuracy": 0.8427770137786865, + "step": 676 + }, + { + "epoch": 2.1289308176100628, + "grad_norm": 0.16106431186199188, + "learning_rate": 3.693838003954798e-05, + "loss": 0.397, + "mean_token_accuracy": 0.8402181267738342, + "step": 677 + }, + { + "epoch": 2.1320754716981134, + "grad_norm": 0.15794852375984192, + "learning_rate": 3.692814260553323e-05, + "loss": 0.3899, + "mean_token_accuracy": 0.8398270606994629, + "step": 678 + }, + { + "epoch": 2.1352201257861636, + "grad_norm": 0.1485368013381958, + "learning_rate": 3.691788968061512e-05, + "loss": 0.3834, + "mean_token_accuracy": 0.8433291912078857, + "step": 679 + }, + { + "epoch": 2.138364779874214, + "grad_norm": 0.16451844573020935, + "learning_rate": 3.690762127543312e-05, + "loss": 0.388, + "mean_token_accuracy": 0.8428193926811218, + "step": 680 + }, + { + "epoch": 2.141509433962264, + "grad_norm": 0.1383827030658722, + "learning_rate": 3.6897337400642706e-05, + "loss": 0.3863, + "mean_token_accuracy": 0.844539225101471, + "step": 681 + }, + { + "epoch": 2.1446540880503147, + "grad_norm": 0.16487407684326172, + "learning_rate": 3.688703806691545e-05, + "loss": 0.3806, + "mean_token_accuracy": 0.8434612154960632, + "step": 682 + }, + { + "epoch": 2.147798742138365, + "grad_norm": 0.16573506593704224, + "learning_rate": 3.6876723284938944e-05, + "loss": 0.3753, + "mean_token_accuracy": 0.8452996611595154, + "step": 683 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.16341538727283478, + "learning_rate": 3.686639306541681e-05, + "loss": 0.3854, + "mean_token_accuracy": 0.8425849676132202, + "step": 684 + }, + { + "epoch": 2.1540880503144653, + "grad_norm": 0.17833516001701355, + "learning_rate": 3.685604741906871e-05, + "loss": 0.3863, + "mean_token_accuracy": 0.8424407243728638, + "step": 685 + }, + { + "epoch": 2.157232704402516, + "grad_norm": 0.15904675424098969, + "learning_rate": 3.684568635663029e-05, + "loss": 0.3786, + "mean_token_accuracy": 0.8449070453643799, + "step": 686 + }, + { + "epoch": 2.160377358490566, + "grad_norm": 0.14655998349189758, + "learning_rate": 3.683530988885321e-05, + "loss": 0.3935, + "mean_token_accuracy": 0.8412083387374878, + "step": 687 + }, + { + "epoch": 2.1635220125786163, + "grad_norm": 0.16055339574813843, + "learning_rate": 3.6824918026505094e-05, + "loss": 0.3858, + "mean_token_accuracy": 0.8450068831443787, + "step": 688 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.1480380743741989, + "learning_rate": 3.681451078036957e-05, + "loss": 0.3874, + "mean_token_accuracy": 0.8429787158966064, + "step": 689 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 0.1642938256263733, + "learning_rate": 3.680408816124622e-05, + "loss": 0.4035, + "mean_token_accuracy": 0.8393082022666931, + "step": 690 + }, + { + "epoch": 2.1729559748427674, + "grad_norm": 0.15204781293869019, + "learning_rate": 3.6793650179950574e-05, + "loss": 0.369, + "mean_token_accuracy": 0.8482940793037415, + "step": 691 + }, + { + "epoch": 2.1761006289308176, + "grad_norm": 0.18579435348510742, + "learning_rate": 3.67831968473141e-05, + "loss": 0.3856, + "mean_token_accuracy": 0.8452662229537964, + "step": 692 + }, + { + "epoch": 2.1792452830188678, + "grad_norm": 0.16767072677612305, + "learning_rate": 3.6772728174184185e-05, + "loss": 0.3887, + "mean_token_accuracy": 0.8428248167037964, + "step": 693 + }, + { + "epoch": 2.1823899371069184, + "grad_norm": 0.1605173796415329, + "learning_rate": 3.676224417142417e-05, + "loss": 0.3735, + "mean_token_accuracy": 0.8427917957305908, + "step": 694 + }, + { + "epoch": 2.1855345911949686, + "grad_norm": 0.15939317643642426, + "learning_rate": 3.675174484991328e-05, + "loss": 0.3782, + "mean_token_accuracy": 0.8429858088493347, + "step": 695 + }, + { + "epoch": 2.188679245283019, + "grad_norm": 0.17312584817409515, + "learning_rate": 3.674123022054664e-05, + "loss": 0.3846, + "mean_token_accuracy": 0.843677818775177, + "step": 696 + }, + { + "epoch": 2.191823899371069, + "grad_norm": 0.14854241907596588, + "learning_rate": 3.6730700294235255e-05, + "loss": 0.401, + "mean_token_accuracy": 0.8407949209213257, + "step": 697 + }, + { + "epoch": 2.1949685534591197, + "grad_norm": 0.15429075062274933, + "learning_rate": 3.6720155081906004e-05, + "loss": 0.383, + "mean_token_accuracy": 0.8440051078796387, + "step": 698 + }, + { + "epoch": 2.19811320754717, + "grad_norm": 0.19384504854679108, + "learning_rate": 3.670959459450164e-05, + "loss": 0.3876, + "mean_token_accuracy": 0.8426343202590942, + "step": 699 + }, + { + "epoch": 2.20125786163522, + "grad_norm": 0.17279024422168732, + "learning_rate": 3.669901884298077e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.8408414721488953, + "step": 700 + }, + { + "epoch": 2.2044025157232703, + "grad_norm": 0.14874425530433655, + "learning_rate": 3.6688427838317805e-05, + "loss": 0.3851, + "mean_token_accuracy": 0.843673586845398, + "step": 701 + }, + { + "epoch": 2.207547169811321, + "grad_norm": 0.1604325771331787, + "learning_rate": 3.667782159150302e-05, + "loss": 0.3934, + "mean_token_accuracy": 0.840538740158081, + "step": 702 + }, + { + "epoch": 2.210691823899371, + "grad_norm": 0.1530817151069641, + "learning_rate": 3.666720011354251e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.8400828838348389, + "step": 703 + }, + { + "epoch": 2.2138364779874213, + "grad_norm": 0.14883075654506683, + "learning_rate": 3.6656563415458144e-05, + "loss": 0.3965, + "mean_token_accuracy": 0.8402729034423828, + "step": 704 + }, + { + "epoch": 2.2169811320754715, + "grad_norm": 0.13267891108989716, + "learning_rate": 3.66459115082876e-05, + "loss": 0.3857, + "mean_token_accuracy": 0.8434920310974121, + "step": 705 + }, + { + "epoch": 2.220125786163522, + "grad_norm": 0.17050376534461975, + "learning_rate": 3.663524440308436e-05, + "loss": 0.3995, + "mean_token_accuracy": 0.8391640186309814, + "step": 706 + }, + { + "epoch": 2.2232704402515724, + "grad_norm": 0.16033872961997986, + "learning_rate": 3.6624562110917634e-05, + "loss": 0.3791, + "mean_token_accuracy": 0.8431466817855835, + "step": 707 + }, + { + "epoch": 2.2264150943396226, + "grad_norm": 0.16110140085220337, + "learning_rate": 3.6613864642872433e-05, + "loss": 0.3815, + "mean_token_accuracy": 0.84393709897995, + "step": 708 + }, + { + "epoch": 2.229559748427673, + "grad_norm": 0.1578216254711151, + "learning_rate": 3.660315201004949e-05, + "loss": 0.3901, + "mean_token_accuracy": 0.8419655561447144, + "step": 709 + }, + { + "epoch": 2.2327044025157234, + "grad_norm": 0.15616242587566376, + "learning_rate": 3.659242422356528e-05, + "loss": 0.3755, + "mean_token_accuracy": 0.847722589969635, + "step": 710 + }, + { + "epoch": 2.2358490566037736, + "grad_norm": 0.16292689740657806, + "learning_rate": 3.658168129455201e-05, + "loss": 0.3778, + "mean_token_accuracy": 0.8432061076164246, + "step": 711 + }, + { + "epoch": 2.238993710691824, + "grad_norm": 0.14753198623657227, + "learning_rate": 3.657092323415759e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8414838314056396, + "step": 712 + }, + { + "epoch": 2.242138364779874, + "grad_norm": 0.14525368809700012, + "learning_rate": 3.656015005354565e-05, + "loss": 0.3864, + "mean_token_accuracy": 0.8425192832946777, + "step": 713 + }, + { + "epoch": 2.2452830188679247, + "grad_norm": 0.1643352061510086, + "learning_rate": 3.654936176389548e-05, + "loss": 0.3937, + "mean_token_accuracy": 0.8406062722206116, + "step": 714 + }, + { + "epoch": 2.248427672955975, + "grad_norm": 0.1485619693994522, + "learning_rate": 3.653855837640208e-05, + "loss": 0.3949, + "mean_token_accuracy": 0.8385857939720154, + "step": 715 + }, + { + "epoch": 2.251572327044025, + "grad_norm": 0.17640046775341034, + "learning_rate": 3.65277399022761e-05, + "loss": 0.3931, + "mean_token_accuracy": 0.84043288230896, + "step": 716 + }, + { + "epoch": 2.2547169811320753, + "grad_norm": 0.13140366971492767, + "learning_rate": 3.651690635274385e-05, + "loss": 0.382, + "mean_token_accuracy": 0.8446291089057922, + "step": 717 + }, + { + "epoch": 2.257861635220126, + "grad_norm": 0.1530969738960266, + "learning_rate": 3.650605773904728e-05, + "loss": 0.373, + "mean_token_accuracy": 0.8450080156326294, + "step": 718 + }, + { + "epoch": 2.261006289308176, + "grad_norm": 0.13690687716007233, + "learning_rate": 3.649519407244397e-05, + "loss": 0.3972, + "mean_token_accuracy": 0.8407341241836548, + "step": 719 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 0.1574098914861679, + "learning_rate": 3.648431536420713e-05, + "loss": 0.3798, + "mean_token_accuracy": 0.8445830345153809, + "step": 720 + }, + { + "epoch": 2.2672955974842766, + "grad_norm": 0.1320675164461136, + "learning_rate": 3.6473421625625575e-05, + "loss": 0.3939, + "mean_token_accuracy": 0.8414525389671326, + "step": 721 + }, + { + "epoch": 2.270440251572327, + "grad_norm": 0.16731448471546173, + "learning_rate": 3.646251286800371e-05, + "loss": 0.3884, + "mean_token_accuracy": 0.8414167165756226, + "step": 722 + }, + { + "epoch": 2.2735849056603774, + "grad_norm": 0.15549425780773163, + "learning_rate": 3.645158910266154e-05, + "loss": 0.3905, + "mean_token_accuracy": 0.8417775630950928, + "step": 723 + }, + { + "epoch": 2.2767295597484276, + "grad_norm": 0.14840513467788696, + "learning_rate": 3.6440650340934625e-05, + "loss": 0.4033, + "mean_token_accuracy": 0.8384340405464172, + "step": 724 + }, + { + "epoch": 2.279874213836478, + "grad_norm": 0.1481187343597412, + "learning_rate": 3.64296965941741e-05, + "loss": 0.3943, + "mean_token_accuracy": 0.8407248854637146, + "step": 725 + }, + { + "epoch": 2.2830188679245285, + "grad_norm": 0.16942381858825684, + "learning_rate": 3.641872787374664e-05, + "loss": 0.3954, + "mean_token_accuracy": 0.841485857963562, + "step": 726 + }, + { + "epoch": 2.2861635220125787, + "grad_norm": 0.1288381665945053, + "learning_rate": 3.640774419103448e-05, + "loss": 0.3898, + "mean_token_accuracy": 0.8435741662979126, + "step": 727 + }, + { + "epoch": 2.289308176100629, + "grad_norm": 0.1845909059047699, + "learning_rate": 3.6396745557435344e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.839992105960846, + "step": 728 + }, + { + "epoch": 2.292452830188679, + "grad_norm": 0.13671965897083282, + "learning_rate": 3.6385731984362505e-05, + "loss": 0.3952, + "mean_token_accuracy": 0.8399907946586609, + "step": 729 + }, + { + "epoch": 2.2955974842767297, + "grad_norm": 0.18472205102443695, + "learning_rate": 3.637470348324473e-05, + "loss": 0.3913, + "mean_token_accuracy": 0.8435697555541992, + "step": 730 + }, + { + "epoch": 2.29874213836478, + "grad_norm": 0.136347234249115, + "learning_rate": 3.6363660065526255e-05, + "loss": 0.3984, + "mean_token_accuracy": 0.8397014737129211, + "step": 731 + }, + { + "epoch": 2.30188679245283, + "grad_norm": 0.17823082208633423, + "learning_rate": 3.635260174266682e-05, + "loss": 0.3909, + "mean_token_accuracy": 0.8430346846580505, + "step": 732 + }, + { + "epoch": 2.3050314465408803, + "grad_norm": 0.143354594707489, + "learning_rate": 3.634152852614163e-05, + "loss": 0.3936, + "mean_token_accuracy": 0.8386049270629883, + "step": 733 + }, + { + "epoch": 2.308176100628931, + "grad_norm": 0.14964474737644196, + "learning_rate": 3.633044042744134e-05, + "loss": 0.3879, + "mean_token_accuracy": 0.8436481952667236, + "step": 734 + }, + { + "epoch": 2.311320754716981, + "grad_norm": 0.14824624359607697, + "learning_rate": 3.631933745807204e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.8401293754577637, + "step": 735 + }, + { + "epoch": 2.3144654088050314, + "grad_norm": 0.14450529217720032, + "learning_rate": 3.6308219629555264e-05, + "loss": 0.4031, + "mean_token_accuracy": 0.8403022289276123, + "step": 736 + }, + { + "epoch": 2.3176100628930816, + "grad_norm": 0.16056504845619202, + "learning_rate": 3.629708695342795e-05, + "loss": 0.3936, + "mean_token_accuracy": 0.8434531688690186, + "step": 737 + }, + { + "epoch": 2.3207547169811322, + "grad_norm": 0.1542007476091385, + "learning_rate": 3.628593944124247e-05, + "loss": 0.3994, + "mean_token_accuracy": 0.8412461280822754, + "step": 738 + }, + { + "epoch": 2.3238993710691824, + "grad_norm": 0.15933917462825775, + "learning_rate": 3.627477710456657e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.8403854966163635, + "step": 739 + }, + { + "epoch": 2.3270440251572326, + "grad_norm": 0.15049493312835693, + "learning_rate": 3.626359995498337e-05, + "loss": 0.3845, + "mean_token_accuracy": 0.8435242176055908, + "step": 740 + }, + { + "epoch": 2.330188679245283, + "grad_norm": 0.15058250725269318, + "learning_rate": 3.625240800409139e-05, + "loss": 0.3976, + "mean_token_accuracy": 0.8395102024078369, + "step": 741 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.14439113438129425, + "learning_rate": 3.624120126350449e-05, + "loss": 0.3873, + "mean_token_accuracy": 0.8440976738929749, + "step": 742 + }, + { + "epoch": 2.3364779874213837, + "grad_norm": 0.14690877497196198, + "learning_rate": 3.6229979744851886e-05, + "loss": 0.3769, + "mean_token_accuracy": 0.8444263935089111, + "step": 743 + }, + { + "epoch": 2.339622641509434, + "grad_norm": 0.17405840754508972, + "learning_rate": 3.621874345977813e-05, + "loss": 0.4031, + "mean_token_accuracy": 0.8406171798706055, + "step": 744 + }, + { + "epoch": 2.342767295597484, + "grad_norm": 0.15319286286830902, + "learning_rate": 3.6207492419943065e-05, + "loss": 0.3899, + "mean_token_accuracy": 0.8424206972122192, + "step": 745 + }, + { + "epoch": 2.3459119496855347, + "grad_norm": 0.16251526772975922, + "learning_rate": 3.61962266370219e-05, + "loss": 0.3898, + "mean_token_accuracy": 0.8415105938911438, + "step": 746 + }, + { + "epoch": 2.349056603773585, + "grad_norm": 0.1394706815481186, + "learning_rate": 3.618494612270511e-05, + "loss": 0.3919, + "mean_token_accuracy": 0.8406856656074524, + "step": 747 + }, + { + "epoch": 2.352201257861635, + "grad_norm": 0.15003950893878937, + "learning_rate": 3.6173650888698456e-05, + "loss": 0.3984, + "mean_token_accuracy": 0.8404019474983215, + "step": 748 + }, + { + "epoch": 2.3553459119496853, + "grad_norm": 0.1499018669128418, + "learning_rate": 3.616234094672298e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.8411343693733215, + "step": 749 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 0.15295803546905518, + "learning_rate": 3.615101630851499e-05, + "loss": 0.3919, + "mean_token_accuracy": 0.8424092531204224, + "step": 750 + }, + { + "epoch": 2.361635220125786, + "grad_norm": 0.15068519115447998, + "learning_rate": 3.6139676985826035e-05, + "loss": 0.3825, + "mean_token_accuracy": 0.845098078250885, + "step": 751 + }, + { + "epoch": 2.3647798742138364, + "grad_norm": 0.17812252044677734, + "learning_rate": 3.6128322990422924e-05, + "loss": 0.3781, + "mean_token_accuracy": 0.8452845811843872, + "step": 752 + }, + { + "epoch": 2.3679245283018866, + "grad_norm": 0.1486268937587738, + "learning_rate": 3.6116954334087644e-05, + "loss": 0.3904, + "mean_token_accuracy": 0.8416242003440857, + "step": 753 + }, + { + "epoch": 2.3710691823899372, + "grad_norm": 0.20920655131340027, + "learning_rate": 3.6105571028617445e-05, + "loss": 0.3882, + "mean_token_accuracy": 0.8425738215446472, + "step": 754 + }, + { + "epoch": 2.3742138364779874, + "grad_norm": 0.14427687227725983, + "learning_rate": 3.609417308582477e-05, + "loss": 0.3859, + "mean_token_accuracy": 0.841761589050293, + "step": 755 + }, + { + "epoch": 2.3773584905660377, + "grad_norm": 0.1912706047296524, + "learning_rate": 3.608276051753722e-05, + "loss": 0.3996, + "mean_token_accuracy": 0.8410661816596985, + "step": 756 + }, + { + "epoch": 2.380503144654088, + "grad_norm": 0.15891054272651672, + "learning_rate": 3.607133333559761e-05, + "loss": 0.3871, + "mean_token_accuracy": 0.8439091444015503, + "step": 757 + }, + { + "epoch": 2.3836477987421385, + "grad_norm": 0.18664152920246124, + "learning_rate": 3.605989155186389e-05, + "loss": 0.3946, + "mean_token_accuracy": 0.8415076732635498, + "step": 758 + }, + { + "epoch": 2.3867924528301887, + "grad_norm": 0.19155050814151764, + "learning_rate": 3.6048435178209194e-05, + "loss": 0.3951, + "mean_token_accuracy": 0.8402332067489624, + "step": 759 + }, + { + "epoch": 2.389937106918239, + "grad_norm": 0.14341680705547333, + "learning_rate": 3.603696422652176e-05, + "loss": 0.3858, + "mean_token_accuracy": 0.8424145579338074, + "step": 760 + }, + { + "epoch": 2.3930817610062896, + "grad_norm": 0.1860472857952118, + "learning_rate": 3.602547870870498e-05, + "loss": 0.3829, + "mean_token_accuracy": 0.8446792960166931, + "step": 761 + }, + { + "epoch": 2.3962264150943398, + "grad_norm": 0.16511857509613037, + "learning_rate": 3.6013978636677354e-05, + "loss": 0.3883, + "mean_token_accuracy": 0.8434211611747742, + "step": 762 + }, + { + "epoch": 2.39937106918239, + "grad_norm": 0.1752832978963852, + "learning_rate": 3.600246402237248e-05, + "loss": 0.3923, + "mean_token_accuracy": 0.840692400932312, + "step": 763 + }, + { + "epoch": 2.40251572327044, + "grad_norm": 0.1690099984407425, + "learning_rate": 3.5990934877739045e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8391318917274475, + "step": 764 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 0.13980698585510254, + "learning_rate": 3.597939121474082e-05, + "loss": 0.3978, + "mean_token_accuracy": 0.8390938639640808, + "step": 765 + }, + { + "epoch": 2.408805031446541, + "grad_norm": 0.1642495095729828, + "learning_rate": 3.5967833045356664e-05, + "loss": 0.3863, + "mean_token_accuracy": 0.8447932600975037, + "step": 766 + }, + { + "epoch": 2.411949685534591, + "grad_norm": 0.1437988579273224, + "learning_rate": 3.595626038158043e-05, + "loss": 0.4029, + "mean_token_accuracy": 0.8407385945320129, + "step": 767 + }, + { + "epoch": 2.4150943396226414, + "grad_norm": 0.1538083404302597, + "learning_rate": 3.594467323542107e-05, + "loss": 0.3918, + "mean_token_accuracy": 0.8416243195533752, + "step": 768 + }, + { + "epoch": 2.418238993710692, + "grad_norm": 0.16916736960411072, + "learning_rate": 3.593307161890254e-05, + "loss": 0.3855, + "mean_token_accuracy": 0.8438270092010498, + "step": 769 + }, + { + "epoch": 2.4213836477987423, + "grad_norm": 0.15521636605262756, + "learning_rate": 3.592145554406381e-05, + "loss": 0.4015, + "mean_token_accuracy": 0.8377931118011475, + "step": 770 + }, + { + "epoch": 2.4245283018867925, + "grad_norm": 0.15567384660243988, + "learning_rate": 3.5909825022958857e-05, + "loss": 0.3918, + "mean_token_accuracy": 0.841164767742157, + "step": 771 + }, + { + "epoch": 2.4276729559748427, + "grad_norm": 0.15030360221862793, + "learning_rate": 3.5898180067656655e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.8410426378250122, + "step": 772 + }, + { + "epoch": 2.430817610062893, + "grad_norm": 0.14496302604675293, + "learning_rate": 3.5886520690241136e-05, + "loss": 0.3939, + "mean_token_accuracy": 0.8415027260780334, + "step": 773 + }, + { + "epoch": 2.4339622641509435, + "grad_norm": 0.16270099580287933, + "learning_rate": 3.587484690281123e-05, + "loss": 0.392, + "mean_token_accuracy": 0.8409454822540283, + "step": 774 + }, + { + "epoch": 2.4371069182389937, + "grad_norm": 0.1545172780752182, + "learning_rate": 3.5863158717480794e-05, + "loss": 0.4027, + "mean_token_accuracy": 0.8391482830047607, + "step": 775 + }, + { + "epoch": 2.440251572327044, + "grad_norm": 0.17777201533317566, + "learning_rate": 3.585145614637864e-05, + "loss": 0.3875, + "mean_token_accuracy": 0.845243513584137, + "step": 776 + }, + { + "epoch": 2.4433962264150946, + "grad_norm": 0.13722971081733704, + "learning_rate": 3.583973920164849e-05, + "loss": 0.3907, + "mean_token_accuracy": 0.8422651290893555, + "step": 777 + }, + { + "epoch": 2.4465408805031448, + "grad_norm": 0.16192865371704102, + "learning_rate": 3.5828007895449e-05, + "loss": 0.3844, + "mean_token_accuracy": 0.8450747728347778, + "step": 778 + }, + { + "epoch": 2.449685534591195, + "grad_norm": 0.15951241552829742, + "learning_rate": 3.581626223995372e-05, + "loss": 0.3847, + "mean_token_accuracy": 0.8426473736763, + "step": 779 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 0.1600176841020584, + "learning_rate": 3.580450224735111e-05, + "loss": 0.3868, + "mean_token_accuracy": 0.8432039618492126, + "step": 780 + }, + { + "epoch": 2.4559748427672954, + "grad_norm": 0.1509629637002945, + "learning_rate": 3.579272792984447e-05, + "loss": 0.3837, + "mean_token_accuracy": 0.843054473400116, + "step": 781 + }, + { + "epoch": 2.459119496855346, + "grad_norm": 0.15704569220542908, + "learning_rate": 3.5780939299651995e-05, + "loss": 0.4006, + "mean_token_accuracy": 0.8408331274986267, + "step": 782 + }, + { + "epoch": 2.4622641509433962, + "grad_norm": 0.1845240294933319, + "learning_rate": 3.576913636900672e-05, + "loss": 0.4115, + "mean_token_accuracy": 0.8372202515602112, + "step": 783 + }, + { + "epoch": 2.4654088050314464, + "grad_norm": 0.13821645081043243, + "learning_rate": 3.5757319150156515e-05, + "loss": 0.3848, + "mean_token_accuracy": 0.840967059135437, + "step": 784 + }, + { + "epoch": 2.468553459119497, + "grad_norm": 0.18777751922607422, + "learning_rate": 3.57454876553641e-05, + "loss": 0.3991, + "mean_token_accuracy": 0.8414704203605652, + "step": 785 + }, + { + "epoch": 2.4716981132075473, + "grad_norm": 0.14253944158554077, + "learning_rate": 3.573364189690699e-05, + "loss": 0.401, + "mean_token_accuracy": 0.8404497504234314, + "step": 786 + }, + { + "epoch": 2.4748427672955975, + "grad_norm": 0.17456071078777313, + "learning_rate": 3.572178188707749e-05, + "loss": 0.3815, + "mean_token_accuracy": 0.8429886102676392, + "step": 787 + }, + { + "epoch": 2.4779874213836477, + "grad_norm": 0.14869949221611023, + "learning_rate": 3.5709907638182725e-05, + "loss": 0.4001, + "mean_token_accuracy": 0.8371372818946838, + "step": 788 + }, + { + "epoch": 2.481132075471698, + "grad_norm": 0.1542949378490448, + "learning_rate": 3.569801916254457e-05, + "loss": 0.3738, + "mean_token_accuracy": 0.8442022800445557, + "step": 789 + }, + { + "epoch": 2.4842767295597485, + "grad_norm": 0.1838357299566269, + "learning_rate": 3.5686116472499665e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8391475677490234, + "step": 790 + }, + { + "epoch": 2.4874213836477987, + "grad_norm": 0.1430555135011673, + "learning_rate": 3.5674199580399425e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8409162163734436, + "step": 791 + }, + { + "epoch": 2.490566037735849, + "grad_norm": 0.17979463934898376, + "learning_rate": 3.566226849860997e-05, + "loss": 0.3928, + "mean_token_accuracy": 0.8403165936470032, + "step": 792 + }, + { + "epoch": 2.4937106918238996, + "grad_norm": 0.137798473238945, + "learning_rate": 3.5650323239512175e-05, + "loss": 0.4061, + "mean_token_accuracy": 0.8378630876541138, + "step": 793 + }, + { + "epoch": 2.49685534591195, + "grad_norm": 0.1714894026517868, + "learning_rate": 3.563836381550159e-05, + "loss": 0.394, + "mean_token_accuracy": 0.8412221074104309, + "step": 794 + }, + { + "epoch": 2.5, + "grad_norm": 0.15192954242229462, + "learning_rate": 3.5626390238988504e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8397027254104614, + "step": 795 + }, + { + "epoch": 2.50314465408805, + "grad_norm": 0.16279102861881256, + "learning_rate": 3.561440252239787e-05, + "loss": 0.3799, + "mean_token_accuracy": 0.8437523245811462, + "step": 796 + }, + { + "epoch": 2.5062893081761004, + "grad_norm": 0.15632732212543488, + "learning_rate": 3.56024006781693e-05, + "loss": 0.3938, + "mean_token_accuracy": 0.8418835997581482, + "step": 797 + }, + { + "epoch": 2.509433962264151, + "grad_norm": 0.1601596623659134, + "learning_rate": 3.559038471875711e-05, + "loss": 0.3757, + "mean_token_accuracy": 0.8448925614356995, + "step": 798 + }, + { + "epoch": 2.5125786163522013, + "grad_norm": 0.15498454868793488, + "learning_rate": 3.557835465663021e-05, + "loss": 0.3966, + "mean_token_accuracy": 0.83966064453125, + "step": 799 + }, + { + "epoch": 2.5157232704402515, + "grad_norm": 0.15797755122184753, + "learning_rate": 3.5566310504272194e-05, + "loss": 0.3916, + "mean_token_accuracy": 0.8428415656089783, + "step": 800 + }, + { + "epoch": 2.518867924528302, + "grad_norm": 0.17878195643424988, + "learning_rate": 3.5554252274181246e-05, + "loss": 0.3922, + "mean_token_accuracy": 0.8414608240127563, + "step": 801 + }, + { + "epoch": 2.5220125786163523, + "grad_norm": 0.1354987770318985, + "learning_rate": 3.554217997887016e-05, + "loss": 0.3874, + "mean_token_accuracy": 0.8432824611663818, + "step": 802 + }, + { + "epoch": 2.5251572327044025, + "grad_norm": 0.17350763082504272, + "learning_rate": 3.553009363086634e-05, + "loss": 0.399, + "mean_token_accuracy": 0.838498055934906, + "step": 803 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.1341233104467392, + "learning_rate": 3.551799324271176e-05, + "loss": 0.4094, + "mean_token_accuracy": 0.8359872698783875, + "step": 804 + }, + { + "epoch": 2.531446540880503, + "grad_norm": 0.18266914784908295, + "learning_rate": 3.550587882696297e-05, + "loss": 0.3996, + "mean_token_accuracy": 0.8383893966674805, + "step": 805 + }, + { + "epoch": 2.5345911949685536, + "grad_norm": 0.14570754766464233, + "learning_rate": 3.549375039619109e-05, + "loss": 0.393, + "mean_token_accuracy": 0.8403996229171753, + "step": 806 + }, + { + "epoch": 2.5377358490566038, + "grad_norm": 0.1772826611995697, + "learning_rate": 3.5481607962981744e-05, + "loss": 0.4042, + "mean_token_accuracy": 0.8374120593070984, + "step": 807 + }, + { + "epoch": 2.540880503144654, + "grad_norm": 0.13402216136455536, + "learning_rate": 3.546945153993512e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8391050100326538, + "step": 808 + }, + { + "epoch": 2.5440251572327046, + "grad_norm": 0.15814465284347534, + "learning_rate": 3.5457281139665906e-05, + "loss": 0.3778, + "mean_token_accuracy": 0.8457577228546143, + "step": 809 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 0.13784445822238922, + "learning_rate": 3.544509677480332e-05, + "loss": 0.4, + "mean_token_accuracy": 0.8372910022735596, + "step": 810 + }, + { + "epoch": 2.550314465408805, + "grad_norm": 0.15171018242835999, + "learning_rate": 3.543289845799104e-05, + "loss": 0.3939, + "mean_token_accuracy": 0.8409681916236877, + "step": 811 + }, + { + "epoch": 2.5534591194968552, + "grad_norm": 0.14729034900665283, + "learning_rate": 3.542068620188723e-05, + "loss": 0.3979, + "mean_token_accuracy": 0.8405581116676331, + "step": 812 + }, + { + "epoch": 2.5566037735849054, + "grad_norm": 0.16989730298519135, + "learning_rate": 3.540846001916454e-05, + "loss": 0.3912, + "mean_token_accuracy": 0.8416076898574829, + "step": 813 + }, + { + "epoch": 2.559748427672956, + "grad_norm": 0.16093933582305908, + "learning_rate": 3.5396219922510044e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.8399268984794617, + "step": 814 + }, + { + "epoch": 2.5628930817610063, + "grad_norm": 0.1573500782251358, + "learning_rate": 3.538396592462527e-05, + "loss": 0.3914, + "mean_token_accuracy": 0.8417745232582092, + "step": 815 + }, + { + "epoch": 2.5660377358490565, + "grad_norm": 0.1340954303741455, + "learning_rate": 3.5371698038226156e-05, + "loss": 0.397, + "mean_token_accuracy": 0.8433490991592407, + "step": 816 + }, + { + "epoch": 2.569182389937107, + "grad_norm": 0.14694054424762726, + "learning_rate": 3.535941627604307e-05, + "loss": 0.3905, + "mean_token_accuracy": 0.8411130309104919, + "step": 817 + }, + { + "epoch": 2.5723270440251573, + "grad_norm": 0.1424230933189392, + "learning_rate": 3.534712065082078e-05, + "loss": 0.3997, + "mean_token_accuracy": 0.8407908082008362, + "step": 818 + }, + { + "epoch": 2.5754716981132075, + "grad_norm": 0.14012737572193146, + "learning_rate": 3.533481117531842e-05, + "loss": 0.3798, + "mean_token_accuracy": 0.8445196151733398, + "step": 819 + }, + { + "epoch": 2.5786163522012577, + "grad_norm": 0.14719845354557037, + "learning_rate": 3.5322487862309516e-05, + "loss": 0.4068, + "mean_token_accuracy": 0.8381505608558655, + "step": 820 + }, + { + "epoch": 2.581761006289308, + "grad_norm": 0.14293119311332703, + "learning_rate": 3.531015072458193e-05, + "loss": 0.387, + "mean_token_accuracy": 0.8414595723152161, + "step": 821 + }, + { + "epoch": 2.5849056603773586, + "grad_norm": 0.15876775979995728, + "learning_rate": 3.5297799774937904e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.8399572968482971, + "step": 822 + }, + { + "epoch": 2.588050314465409, + "grad_norm": 0.1583719551563263, + "learning_rate": 3.528543502619398e-05, + "loss": 0.3837, + "mean_token_accuracy": 0.8397992253303528, + "step": 823 + }, + { + "epoch": 2.591194968553459, + "grad_norm": 0.16118523478507996, + "learning_rate": 3.527305649118104e-05, + "loss": 0.3875, + "mean_token_accuracy": 0.8445963263511658, + "step": 824 + }, + { + "epoch": 2.5943396226415096, + "grad_norm": 0.1711784452199936, + "learning_rate": 3.526066418274427e-05, + "loss": 0.4029, + "mean_token_accuracy": 0.8400377631187439, + "step": 825 + }, + { + "epoch": 2.59748427672956, + "grad_norm": 0.14684435725212097, + "learning_rate": 3.5248258113743126e-05, + "loss": 0.3746, + "mean_token_accuracy": 0.8455667495727539, + "step": 826 + }, + { + "epoch": 2.60062893081761, + "grad_norm": 0.16698506474494934, + "learning_rate": 3.523583829705137e-05, + "loss": 0.3929, + "mean_token_accuracy": 0.8404534459114075, + "step": 827 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.15083381533622742, + "learning_rate": 3.5223404745557014e-05, + "loss": 0.3964, + "mean_token_accuracy": 0.8417296409606934, + "step": 828 + }, + { + "epoch": 2.6069182389937104, + "grad_norm": 0.15367716550827026, + "learning_rate": 3.521095747216235e-05, + "loss": 0.3921, + "mean_token_accuracy": 0.839659571647644, + "step": 829 + }, + { + "epoch": 2.610062893081761, + "grad_norm": 0.16619056463241577, + "learning_rate": 3.519849648978387e-05, + "loss": 0.3885, + "mean_token_accuracy": 0.840570330619812, + "step": 830 + }, + { + "epoch": 2.6132075471698113, + "grad_norm": 0.13816650211811066, + "learning_rate": 3.51860218113523e-05, + "loss": 0.4104, + "mean_token_accuracy": 0.8387232422828674, + "step": 831 + }, + { + "epoch": 2.6163522012578615, + "grad_norm": 0.15900948643684387, + "learning_rate": 3.5173533449812627e-05, + "loss": 0.3917, + "mean_token_accuracy": 0.8438796401023865, + "step": 832 + }, + { + "epoch": 2.619496855345912, + "grad_norm": 0.14279142022132874, + "learning_rate": 3.516103141812396e-05, + "loss": 0.3886, + "mean_token_accuracy": 0.8426015973091125, + "step": 833 + }, + { + "epoch": 2.6226415094339623, + "grad_norm": 0.1456068903207779, + "learning_rate": 3.5148515729259655e-05, + "loss": 0.4118, + "mean_token_accuracy": 0.8380022644996643, + "step": 834 + }, + { + "epoch": 2.6257861635220126, + "grad_norm": 0.17821946740150452, + "learning_rate": 3.51359863962072e-05, + "loss": 0.3919, + "mean_token_accuracy": 0.8446857929229736, + "step": 835 + }, + { + "epoch": 2.6289308176100628, + "grad_norm": 0.13951314985752106, + "learning_rate": 3.512344343196827e-05, + "loss": 0.3948, + "mean_token_accuracy": 0.841708779335022, + "step": 836 + }, + { + "epoch": 2.632075471698113, + "grad_norm": 0.1878071278333664, + "learning_rate": 3.511088684955868e-05, + "loss": 0.3835, + "mean_token_accuracy": 0.843018114566803, + "step": 837 + }, + { + "epoch": 2.6352201257861636, + "grad_norm": 0.14683803915977478, + "learning_rate": 3.509831666200835e-05, + "loss": 0.3926, + "mean_token_accuracy": 0.8428792357444763, + "step": 838 + }, + { + "epoch": 2.638364779874214, + "grad_norm": 0.19433486461639404, + "learning_rate": 3.508573288236135e-05, + "loss": 0.3948, + "mean_token_accuracy": 0.8401256799697876, + "step": 839 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 0.12940803170204163, + "learning_rate": 3.5073135523675853e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8402566313743591, + "step": 840 + }, + { + "epoch": 2.6446540880503147, + "grad_norm": 0.16293668746948242, + "learning_rate": 3.506052459902409e-05, + "loss": 0.3869, + "mean_token_accuracy": 0.841245174407959, + "step": 841 + }, + { + "epoch": 2.647798742138365, + "grad_norm": 0.153008833527565, + "learning_rate": 3.504790012149241e-05, + "loss": 0.3911, + "mean_token_accuracy": 0.840588390827179, + "step": 842 + }, + { + "epoch": 2.650943396226415, + "grad_norm": 0.14682938158512115, + "learning_rate": 3.503526210418119e-05, + "loss": 0.3967, + "mean_token_accuracy": 0.8388072848320007, + "step": 843 + }, + { + "epoch": 2.6540880503144653, + "grad_norm": 0.15707483887672424, + "learning_rate": 3.50226105602049e-05, + "loss": 0.3982, + "mean_token_accuracy": 0.8398618102073669, + "step": 844 + }, + { + "epoch": 2.6572327044025155, + "grad_norm": 0.15881039202213287, + "learning_rate": 3.5009945502692e-05, + "loss": 0.3933, + "mean_token_accuracy": 0.841876745223999, + "step": 845 + }, + { + "epoch": 2.660377358490566, + "grad_norm": 0.1584368199110031, + "learning_rate": 3.4997266944785e-05, + "loss": 0.4016, + "mean_token_accuracy": 0.8401806950569153, + "step": 846 + }, + { + "epoch": 2.6635220125786163, + "grad_norm": 0.16472209990024567, + "learning_rate": 3.4984574899640415e-05, + "loss": 0.4028, + "mean_token_accuracy": 0.8396847248077393, + "step": 847 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.15707650780677795, + "learning_rate": 3.497186938042876e-05, + "loss": 0.4055, + "mean_token_accuracy": 0.8379293084144592, + "step": 848 + }, + { + "epoch": 2.669811320754717, + "grad_norm": 0.1491553783416748, + "learning_rate": 3.4959150400334516e-05, + "loss": 0.3789, + "mean_token_accuracy": 0.8430595397949219, + "step": 849 + }, + { + "epoch": 2.6729559748427674, + "grad_norm": 0.1792389154434204, + "learning_rate": 3.494641797255616e-05, + "loss": 0.389, + "mean_token_accuracy": 0.8386589884757996, + "step": 850 + }, + { + "epoch": 2.6761006289308176, + "grad_norm": 0.15633267164230347, + "learning_rate": 3.493367211030609e-05, + "loss": 0.3971, + "mean_token_accuracy": 0.8409385681152344, + "step": 851 + }, + { + "epoch": 2.6792452830188678, + "grad_norm": 0.18488091230392456, + "learning_rate": 3.492091282681066e-05, + "loss": 0.3912, + "mean_token_accuracy": 0.8436446785926819, + "step": 852 + }, + { + "epoch": 2.682389937106918, + "grad_norm": 0.17898158729076385, + "learning_rate": 3.490814013531017e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8414387702941895, + "step": 853 + }, + { + "epoch": 2.6855345911949686, + "grad_norm": 0.1576376110315323, + "learning_rate": 3.489535404905879e-05, + "loss": 0.4077, + "mean_token_accuracy": 0.8369157314300537, + "step": 854 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 0.19907242059707642, + "learning_rate": 3.4882554581324646e-05, + "loss": 0.3961, + "mean_token_accuracy": 0.8421619534492493, + "step": 855 + }, + { + "epoch": 2.691823899371069, + "grad_norm": 0.14955250918865204, + "learning_rate": 3.48697417453897e-05, + "loss": 0.4018, + "mean_token_accuracy": 0.8398721814155579, + "step": 856 + }, + { + "epoch": 2.6949685534591197, + "grad_norm": 0.1245059221982956, + "learning_rate": 3.485691555454982e-05, + "loss": 0.4034, + "mean_token_accuracy": 0.8386282324790955, + "step": 857 + }, + { + "epoch": 2.69811320754717, + "grad_norm": 0.16106462478637695, + "learning_rate": 3.48440760221147e-05, + "loss": 0.3885, + "mean_token_accuracy": 0.8441129922866821, + "step": 858 + }, + { + "epoch": 2.70125786163522, + "grad_norm": 0.1262756884098053, + "learning_rate": 3.4831223161407904e-05, + "loss": 0.4013, + "mean_token_accuracy": 0.8384125828742981, + "step": 859 + }, + { + "epoch": 2.7044025157232703, + "grad_norm": 0.16710178554058075, + "learning_rate": 3.4818356985766825e-05, + "loss": 0.3906, + "mean_token_accuracy": 0.8418945670127869, + "step": 860 + }, + { + "epoch": 2.7075471698113205, + "grad_norm": 0.14853724837303162, + "learning_rate": 3.4805477508542665e-05, + "loss": 0.3895, + "mean_token_accuracy": 0.8417914509773254, + "step": 861 + }, + { + "epoch": 2.710691823899371, + "grad_norm": 0.1702132523059845, + "learning_rate": 3.4792584743100425e-05, + "loss": 0.4086, + "mean_token_accuracy": 0.8393948674201965, + "step": 862 + }, + { + "epoch": 2.7138364779874213, + "grad_norm": 0.15771856904029846, + "learning_rate": 3.4779678702818915e-05, + "loss": 0.403, + "mean_token_accuracy": 0.8413280844688416, + "step": 863 + }, + { + "epoch": 2.7169811320754715, + "grad_norm": 0.1526855230331421, + "learning_rate": 3.4766759401090695e-05, + "loss": 0.395, + "mean_token_accuracy": 0.8401949405670166, + "step": 864 + }, + { + "epoch": 2.720125786163522, + "grad_norm": 0.15966515243053436, + "learning_rate": 3.47538268513221e-05, + "loss": 0.3947, + "mean_token_accuracy": 0.8407111763954163, + "step": 865 + }, + { + "epoch": 2.7232704402515724, + "grad_norm": 0.15073886513710022, + "learning_rate": 3.474088106693323e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8384842276573181, + "step": 866 + }, + { + "epoch": 2.7264150943396226, + "grad_norm": 0.15170443058013916, + "learning_rate": 3.472792206135786e-05, + "loss": 0.3938, + "mean_token_accuracy": 0.8373732566833496, + "step": 867 + }, + { + "epoch": 2.729559748427673, + "grad_norm": 0.1322529911994934, + "learning_rate": 3.471494984804355e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8394856452941895, + "step": 868 + }, + { + "epoch": 2.732704402515723, + "grad_norm": 0.14727897942066193, + "learning_rate": 3.4701964440451545e-05, + "loss": 0.3904, + "mean_token_accuracy": 0.8422083258628845, + "step": 869 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 0.14118586480617523, + "learning_rate": 3.468896585205676e-05, + "loss": 0.3974, + "mean_token_accuracy": 0.8416357040405273, + "step": 870 + }, + { + "epoch": 2.738993710691824, + "grad_norm": 0.1370769441127777, + "learning_rate": 3.467595409634781e-05, + "loss": 0.3972, + "mean_token_accuracy": 0.8423879146575928, + "step": 871 + }, + { + "epoch": 2.742138364779874, + "grad_norm": 0.12729352712631226, + "learning_rate": 3.466292918682696e-05, + "loss": 0.3891, + "mean_token_accuracy": 0.8427572250366211, + "step": 872 + }, + { + "epoch": 2.7452830188679247, + "grad_norm": 0.1347734034061432, + "learning_rate": 3.464989113701016e-05, + "loss": 0.3784, + "mean_token_accuracy": 0.8443403244018555, + "step": 873 + }, + { + "epoch": 2.748427672955975, + "grad_norm": 0.1276370882987976, + "learning_rate": 3.4636839960426935e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8401352763175964, + "step": 874 + }, + { + "epoch": 2.751572327044025, + "grad_norm": 0.12423428148031235, + "learning_rate": 3.462377567062048e-05, + "loss": 0.3887, + "mean_token_accuracy": 0.8414220213890076, + "step": 875 + }, + { + "epoch": 2.7547169811320753, + "grad_norm": 0.15068255364894867, + "learning_rate": 3.4610698281147574e-05, + "loss": 0.4004, + "mean_token_accuracy": 0.8396066427230835, + "step": 876 + }, + { + "epoch": 2.757861635220126, + "grad_norm": 0.13070818781852722, + "learning_rate": 3.4597607805578614e-05, + "loss": 0.3903, + "mean_token_accuracy": 0.8402306437492371, + "step": 877 + }, + { + "epoch": 2.761006289308176, + "grad_norm": 0.14346636831760406, + "learning_rate": 3.458450425749754e-05, + "loss": 0.3924, + "mean_token_accuracy": 0.8397766351699829, + "step": 878 + }, + { + "epoch": 2.7641509433962264, + "grad_norm": 0.14210395514965057, + "learning_rate": 3.4571387650501886e-05, + "loss": 0.3989, + "mean_token_accuracy": 0.8378952145576477, + "step": 879 + }, + { + "epoch": 2.767295597484277, + "grad_norm": 0.1414806991815567, + "learning_rate": 3.455825799820274e-05, + "loss": 0.3868, + "mean_token_accuracy": 0.8398988246917725, + "step": 880 + }, + { + "epoch": 2.770440251572327, + "grad_norm": 0.1717570275068283, + "learning_rate": 3.45451153142247e-05, + "loss": 0.4038, + "mean_token_accuracy": 0.8383566737174988, + "step": 881 + }, + { + "epoch": 2.7735849056603774, + "grad_norm": 0.13245093822479248, + "learning_rate": 3.45319596122059e-05, + "loss": 0.3847, + "mean_token_accuracy": 0.842934787273407, + "step": 882 + }, + { + "epoch": 2.7767295597484276, + "grad_norm": 0.1757514327764511, + "learning_rate": 3.4518790905798004e-05, + "loss": 0.3936, + "mean_token_accuracy": 0.8390152454376221, + "step": 883 + }, + { + "epoch": 2.779874213836478, + "grad_norm": 0.12597371637821198, + "learning_rate": 3.450560920866613e-05, + "loss": 0.4083, + "mean_token_accuracy": 0.8398998379707336, + "step": 884 + }, + { + "epoch": 2.7830188679245285, + "grad_norm": 0.18164481222629547, + "learning_rate": 3.4492414534488917e-05, + "loss": 0.3949, + "mean_token_accuracy": 0.8426663875579834, + "step": 885 + }, + { + "epoch": 2.7861635220125787, + "grad_norm": 0.12726567685604095, + "learning_rate": 3.4479206896958434e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8387595415115356, + "step": 886 + }, + { + "epoch": 2.789308176100629, + "grad_norm": 0.1545555740594864, + "learning_rate": 3.446598630978024e-05, + "loss": 0.3938, + "mean_token_accuracy": 0.8420332670211792, + "step": 887 + }, + { + "epoch": 2.7924528301886795, + "grad_norm": 0.12999430298805237, + "learning_rate": 3.445275278667329e-05, + "loss": 0.3945, + "mean_token_accuracy": 0.8392206430435181, + "step": 888 + }, + { + "epoch": 2.7955974842767297, + "grad_norm": 0.1740381270647049, + "learning_rate": 3.443950634137e-05, + "loss": 0.3868, + "mean_token_accuracy": 0.841921865940094, + "step": 889 + }, + { + "epoch": 2.79874213836478, + "grad_norm": 0.1445263922214508, + "learning_rate": 3.4426246987616176e-05, + "loss": 0.3815, + "mean_token_accuracy": 0.844272792339325, + "step": 890 + }, + { + "epoch": 2.80188679245283, + "grad_norm": 0.16784286499023438, + "learning_rate": 3.4412974739171026e-05, + "loss": 0.393, + "mean_token_accuracy": 0.841655969619751, + "step": 891 + }, + { + "epoch": 2.8050314465408803, + "grad_norm": 0.16633418202400208, + "learning_rate": 3.439968960980712e-05, + "loss": 0.3883, + "mean_token_accuracy": 0.8415431380271912, + "step": 892 + }, + { + "epoch": 2.808176100628931, + "grad_norm": 0.14095844328403473, + "learning_rate": 3.438639161331042e-05, + "loss": 0.3899, + "mean_token_accuracy": 0.8424274325370789, + "step": 893 + }, + { + "epoch": 2.811320754716981, + "grad_norm": 0.1560608446598053, + "learning_rate": 3.437308076348024e-05, + "loss": 0.4066, + "mean_token_accuracy": 0.839648962020874, + "step": 894 + }, + { + "epoch": 2.8144654088050314, + "grad_norm": 0.13541994988918304, + "learning_rate": 3.4359757074129205e-05, + "loss": 0.3911, + "mean_token_accuracy": 0.842206597328186, + "step": 895 + }, + { + "epoch": 2.817610062893082, + "grad_norm": 0.1437758207321167, + "learning_rate": 3.4346420559083285e-05, + "loss": 0.4054, + "mean_token_accuracy": 0.8397225141525269, + "step": 896 + }, + { + "epoch": 2.8207547169811322, + "grad_norm": 0.11642755568027496, + "learning_rate": 3.433307123218176e-05, + "loss": 0.3999, + "mean_token_accuracy": 0.8443455696105957, + "step": 897 + }, + { + "epoch": 2.8238993710691824, + "grad_norm": 0.13762716948986053, + "learning_rate": 3.43197091072772e-05, + "loss": 0.3982, + "mean_token_accuracy": 0.8394805192947388, + "step": 898 + }, + { + "epoch": 2.8270440251572326, + "grad_norm": 0.1345590353012085, + "learning_rate": 3.430633419823545e-05, + "loss": 0.4009, + "mean_token_accuracy": 0.8399420976638794, + "step": 899 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 0.13669990003108978, + "learning_rate": 3.429294651893563e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8423181176185608, + "step": 900 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.14826296269893646, + "learning_rate": 3.4279546083270097e-05, + "loss": 0.3985, + "mean_token_accuracy": 0.8397724628448486, + "step": 901 + }, + { + "epoch": 2.8364779874213837, + "grad_norm": 0.14717832207679749, + "learning_rate": 3.426613290514447e-05, + "loss": 0.3775, + "mean_token_accuracy": 0.8437859416007996, + "step": 902 + }, + { + "epoch": 2.839622641509434, + "grad_norm": 0.1572881042957306, + "learning_rate": 3.4252706998477574e-05, + "loss": 0.3908, + "mean_token_accuracy": 0.8429072499275208, + "step": 903 + }, + { + "epoch": 2.8427672955974845, + "grad_norm": 0.1492244154214859, + "learning_rate": 3.423926837720144e-05, + "loss": 0.4069, + "mean_token_accuracy": 0.8388838768005371, + "step": 904 + }, + { + "epoch": 2.8459119496855347, + "grad_norm": 0.15537552535533905, + "learning_rate": 3.42258170552613e-05, + "loss": 0.3888, + "mean_token_accuracy": 0.8405810594558716, + "step": 905 + }, + { + "epoch": 2.849056603773585, + "grad_norm": 0.1573428362607956, + "learning_rate": 3.421235304661555e-05, + "loss": 0.3889, + "mean_token_accuracy": 0.843213677406311, + "step": 906 + }, + { + "epoch": 2.852201257861635, + "grad_norm": 0.15431787073612213, + "learning_rate": 3.41988763652358e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8401937484741211, + "step": 907 + }, + { + "epoch": 2.8553459119496853, + "grad_norm": 0.16958792507648468, + "learning_rate": 3.4185387025106745e-05, + "loss": 0.3966, + "mean_token_accuracy": 0.8413075804710388, + "step": 908 + }, + { + "epoch": 2.858490566037736, + "grad_norm": 0.15407972037792206, + "learning_rate": 3.417188504022625e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8398054242134094, + "step": 909 + }, + { + "epoch": 2.861635220125786, + "grad_norm": 0.16856738924980164, + "learning_rate": 3.415837042460531e-05, + "loss": 0.3862, + "mean_token_accuracy": 0.8428761959075928, + "step": 910 + }, + { + "epoch": 2.8647798742138364, + "grad_norm": 0.16327671706676483, + "learning_rate": 3.414484319226799e-05, + "loss": 0.4078, + "mean_token_accuracy": 0.8394198417663574, + "step": 911 + }, + { + "epoch": 2.867924528301887, + "grad_norm": 0.13859853148460388, + "learning_rate": 3.4131303357251505e-05, + "loss": 0.3979, + "mean_token_accuracy": 0.839618980884552, + "step": 912 + }, + { + "epoch": 2.8710691823899372, + "grad_norm": 0.16593888401985168, + "learning_rate": 3.411775093360609e-05, + "loss": 0.3959, + "mean_token_accuracy": 0.8423987030982971, + "step": 913 + }, + { + "epoch": 2.8742138364779874, + "grad_norm": 0.12062633782625198, + "learning_rate": 3.4104185935395075e-05, + "loss": 0.391, + "mean_token_accuracy": 0.8417162895202637, + "step": 914 + }, + { + "epoch": 2.8773584905660377, + "grad_norm": 0.14128541946411133, + "learning_rate": 3.409060837669483e-05, + "loss": 0.4021, + "mean_token_accuracy": 0.840218186378479, + "step": 915 + }, + { + "epoch": 2.880503144654088, + "grad_norm": 0.12717147171497345, + "learning_rate": 3.407701827159476e-05, + "loss": 0.4044, + "mean_token_accuracy": 0.8357147574424744, + "step": 916 + }, + { + "epoch": 2.8836477987421385, + "grad_norm": 0.14301805198192596, + "learning_rate": 3.406341563419729e-05, + "loss": 0.3944, + "mean_token_accuracy": 0.8393138647079468, + "step": 917 + }, + { + "epoch": 2.8867924528301887, + "grad_norm": 0.12959828972816467, + "learning_rate": 3.4049800478617844e-05, + "loss": 0.3893, + "mean_token_accuracy": 0.8411592841148376, + "step": 918 + }, + { + "epoch": 2.889937106918239, + "grad_norm": 0.1419060230255127, + "learning_rate": 3.4036172818984855e-05, + "loss": 0.3923, + "mean_token_accuracy": 0.8420356512069702, + "step": 919 + }, + { + "epoch": 2.8930817610062896, + "grad_norm": 0.14367401599884033, + "learning_rate": 3.4022532669439706e-05, + "loss": 0.406, + "mean_token_accuracy": 0.8380276560783386, + "step": 920 + }, + { + "epoch": 2.8962264150943398, + "grad_norm": 0.13751500844955444, + "learning_rate": 3.4008880044136763e-05, + "loss": 0.3834, + "mean_token_accuracy": 0.8410583734512329, + "step": 921 + }, + { + "epoch": 2.89937106918239, + "grad_norm": 0.13970538973808289, + "learning_rate": 3.399521495724332e-05, + "loss": 0.3923, + "mean_token_accuracy": 0.842393696308136, + "step": 922 + }, + { + "epoch": 2.90251572327044, + "grad_norm": 0.1298050582408905, + "learning_rate": 3.39815374229396e-05, + "loss": 0.3951, + "mean_token_accuracy": 0.8414964079856873, + "step": 923 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.12179820239543915, + "learning_rate": 3.396784745541877e-05, + "loss": 0.3891, + "mean_token_accuracy": 0.8448134064674377, + "step": 924 + }, + { + "epoch": 2.908805031446541, + "grad_norm": 0.12799564003944397, + "learning_rate": 3.3954145068886876e-05, + "loss": 0.4021, + "mean_token_accuracy": 0.8391764163970947, + "step": 925 + }, + { + "epoch": 2.911949685534591, + "grad_norm": 0.1219654306769371, + "learning_rate": 3.394043027756286e-05, + "loss": 0.4057, + "mean_token_accuracy": 0.8397082686424255, + "step": 926 + }, + { + "epoch": 2.9150943396226414, + "grad_norm": 0.1398463398218155, + "learning_rate": 3.392670309567852e-05, + "loss": 0.3973, + "mean_token_accuracy": 0.8409194350242615, + "step": 927 + }, + { + "epoch": 2.918238993710692, + "grad_norm": 0.13440991938114166, + "learning_rate": 3.391296353747854e-05, + "loss": 0.3866, + "mean_token_accuracy": 0.8404433131217957, + "step": 928 + }, + { + "epoch": 2.9213836477987423, + "grad_norm": 0.13937310874462128, + "learning_rate": 3.389921161722042e-05, + "loss": 0.3968, + "mean_token_accuracy": 0.8404849171638489, + "step": 929 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 0.15991294384002686, + "learning_rate": 3.3885447349174514e-05, + "loss": 0.3983, + "mean_token_accuracy": 0.83951735496521, + "step": 930 + }, + { + "epoch": 2.9276729559748427, + "grad_norm": 0.14287906885147095, + "learning_rate": 3.387167074762396e-05, + "loss": 0.4209, + "mean_token_accuracy": 0.8352724313735962, + "step": 931 + }, + { + "epoch": 2.930817610062893, + "grad_norm": 0.1415240466594696, + "learning_rate": 3.3857881826864715e-05, + "loss": 0.3867, + "mean_token_accuracy": 0.8398596048355103, + "step": 932 + }, + { + "epoch": 2.9339622641509435, + "grad_norm": 0.14125128090381622, + "learning_rate": 3.3844080601205516e-05, + "loss": 0.4074, + "mean_token_accuracy": 0.8381497263908386, + "step": 933 + }, + { + "epoch": 2.9371069182389937, + "grad_norm": 0.13332779705524445, + "learning_rate": 3.383026708496787e-05, + "loss": 0.4023, + "mean_token_accuracy": 0.8399627804756165, + "step": 934 + }, + { + "epoch": 2.940251572327044, + "grad_norm": 0.13416799902915955, + "learning_rate": 3.381644129248602e-05, + "loss": 0.3902, + "mean_token_accuracy": 0.8443140983581543, + "step": 935 + }, + { + "epoch": 2.9433962264150946, + "grad_norm": 0.14479246735572815, + "learning_rate": 3.380260323810698e-05, + "loss": 0.3884, + "mean_token_accuracy": 0.8421072959899902, + "step": 936 + }, + { + "epoch": 2.9465408805031448, + "grad_norm": 0.1284419447183609, + "learning_rate": 3.378875293619047e-05, + "loss": 0.392, + "mean_token_accuracy": 0.8417386412620544, + "step": 937 + }, + { + "epoch": 2.949685534591195, + "grad_norm": 0.13157133758068085, + "learning_rate": 3.3774890401108915e-05, + "loss": 0.4076, + "mean_token_accuracy": 0.8378016352653503, + "step": 938 + }, + { + "epoch": 2.952830188679245, + "grad_norm": 0.13864070177078247, + "learning_rate": 3.376101564724743e-05, + "loss": 0.3901, + "mean_token_accuracy": 0.8419355750083923, + "step": 939 + }, + { + "epoch": 2.9559748427672954, + "grad_norm": 0.14290863275527954, + "learning_rate": 3.374712868900384e-05, + "loss": 0.3963, + "mean_token_accuracy": 0.8409423828125, + "step": 940 + }, + { + "epoch": 2.959119496855346, + "grad_norm": 0.12841610610485077, + "learning_rate": 3.373322954078859e-05, + "loss": 0.3969, + "mean_token_accuracy": 0.8391101956367493, + "step": 941 + }, + { + "epoch": 2.9622641509433962, + "grad_norm": 0.14718987047672272, + "learning_rate": 3.371931821702481e-05, + "loss": 0.4037, + "mean_token_accuracy": 0.8362467288970947, + "step": 942 + }, + { + "epoch": 2.9654088050314464, + "grad_norm": 0.1477939933538437, + "learning_rate": 3.3705394732148264e-05, + "loss": 0.4049, + "mean_token_accuracy": 0.8374539613723755, + "step": 943 + }, + { + "epoch": 2.968553459119497, + "grad_norm": 0.13969185948371887, + "learning_rate": 3.369145910060731e-05, + "loss": 0.3864, + "mean_token_accuracy": 0.8428540825843811, + "step": 944 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 0.15239839255809784, + "learning_rate": 3.3677511336862924e-05, + "loss": 0.4012, + "mean_token_accuracy": 0.8415625095367432, + "step": 945 + }, + { + "epoch": 2.9748427672955975, + "grad_norm": 0.1359701305627823, + "learning_rate": 3.366355145538867e-05, + "loss": 0.4019, + "mean_token_accuracy": 0.8378898501396179, + "step": 946 + }, + { + "epoch": 2.9779874213836477, + "grad_norm": 0.17559880018234253, + "learning_rate": 3.3649579470670696e-05, + "loss": 0.3981, + "mean_token_accuracy": 0.8389723300933838, + "step": 947 + }, + { + "epoch": 2.981132075471698, + "grad_norm": 0.12837964296340942, + "learning_rate": 3.3635595397207705e-05, + "loss": 0.4067, + "mean_token_accuracy": 0.8383262157440186, + "step": 948 + }, + { + "epoch": 2.9842767295597485, + "grad_norm": 0.16035127639770508, + "learning_rate": 3.3621599249510936e-05, + "loss": 0.4018, + "mean_token_accuracy": 0.8396176695823669, + "step": 949 + }, + { + "epoch": 2.9874213836477987, + "grad_norm": 0.13061131536960602, + "learning_rate": 3.360759104210417e-05, + "loss": 0.4034, + "mean_token_accuracy": 0.839031994342804, + "step": 950 + }, + { + "epoch": 2.990566037735849, + "grad_norm": 0.15194053947925568, + "learning_rate": 3.359357078952368e-05, + "loss": 0.385, + "mean_token_accuracy": 0.842530369758606, + "step": 951 + }, + { + "epoch": 2.9937106918238996, + "grad_norm": 0.1333847939968109, + "learning_rate": 3.3579538506318264e-05, + "loss": 0.3998, + "mean_token_accuracy": 0.8397628664970398, + "step": 952 + }, + { + "epoch": 2.99685534591195, + "grad_norm": 0.1507115513086319, + "learning_rate": 3.3565494207049194e-05, + "loss": 0.4102, + "mean_token_accuracy": 0.8390610218048096, + "step": 953 + }, + { + "epoch": 3.0, + "grad_norm": 0.12280502915382385, + "learning_rate": 3.3551437906290207e-05, + "loss": 0.3837, + "mean_token_accuracy": 0.8393963575363159, + "step": 954 + }, + { + "epoch": 3.00314465408805, + "grad_norm": 0.2119593769311905, + "learning_rate": 3.353736961862751e-05, + "loss": 0.3276, + "mean_token_accuracy": 0.8589532375335693, + "step": 955 + }, + { + "epoch": 3.006289308176101, + "grad_norm": 0.5749790668487549, + "learning_rate": 3.352328935865972e-05, + "loss": 0.3309, + "mean_token_accuracy": 0.8566039800643921, + "step": 956 + }, + { + "epoch": 3.009433962264151, + "grad_norm": 0.30272579193115234, + "learning_rate": 3.350919714099791e-05, + "loss": 0.3372, + "mean_token_accuracy": 0.8549745082855225, + "step": 957 + }, + { + "epoch": 3.0125786163522013, + "grad_norm": 0.46458861231803894, + "learning_rate": 3.3495092980265526e-05, + "loss": 0.323, + "mean_token_accuracy": 0.8595535755157471, + "step": 958 + }, + { + "epoch": 3.0157232704402515, + "grad_norm": 0.29047465324401855, + "learning_rate": 3.348097689109844e-05, + "loss": 0.329, + "mean_token_accuracy": 0.859404444694519, + "step": 959 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 0.1861647367477417, + "learning_rate": 3.346684888814489e-05, + "loss": 0.3285, + "mean_token_accuracy": 0.8579146862030029, + "step": 960 + }, + { + "epoch": 3.0220125786163523, + "grad_norm": 0.23538491129875183, + "learning_rate": 3.345270898606548e-05, + "loss": 0.3214, + "mean_token_accuracy": 0.8577497005462646, + "step": 961 + }, + { + "epoch": 3.0251572327044025, + "grad_norm": 0.24326147139072418, + "learning_rate": 3.3438557199533143e-05, + "loss": 0.3172, + "mean_token_accuracy": 0.8618540167808533, + "step": 962 + }, + { + "epoch": 3.0283018867924527, + "grad_norm": 0.19439604878425598, + "learning_rate": 3.342439354323317e-05, + "loss": 0.3343, + "mean_token_accuracy": 0.8569464087486267, + "step": 963 + }, + { + "epoch": 3.0314465408805034, + "grad_norm": 0.2556777596473694, + "learning_rate": 3.341021803186316e-05, + "loss": 0.3229, + "mean_token_accuracy": 0.859006404876709, + "step": 964 + }, + { + "epoch": 3.0345911949685536, + "grad_norm": 0.20313474535942078, + "learning_rate": 3.3396030680133e-05, + "loss": 0.319, + "mean_token_accuracy": 0.8617485165596008, + "step": 965 + }, + { + "epoch": 3.0377358490566038, + "grad_norm": 0.251817524433136, + "learning_rate": 3.3381831502764894e-05, + "loss": 0.3193, + "mean_token_accuracy": 0.863706111907959, + "step": 966 + }, + { + "epoch": 3.040880503144654, + "grad_norm": 0.20017312467098236, + "learning_rate": 3.3367620514493284e-05, + "loss": 0.3238, + "mean_token_accuracy": 0.8608281016349792, + "step": 967 + }, + { + "epoch": 3.0440251572327046, + "grad_norm": 0.27038446068763733, + "learning_rate": 3.335339773006489e-05, + "loss": 0.3151, + "mean_token_accuracy": 0.8628440499305725, + "step": 968 + }, + { + "epoch": 3.047169811320755, + "grad_norm": 0.2303992211818695, + "learning_rate": 3.3339163164238665e-05, + "loss": 0.3103, + "mean_token_accuracy": 0.8667078018188477, + "step": 969 + }, + { + "epoch": 3.050314465408805, + "grad_norm": 0.23521584272384644, + "learning_rate": 3.33249168317858e-05, + "loss": 0.3129, + "mean_token_accuracy": 0.8642385005950928, + "step": 970 + }, + { + "epoch": 3.0534591194968552, + "grad_norm": 0.20210982859134674, + "learning_rate": 3.331065874748967e-05, + "loss": 0.3125, + "mean_token_accuracy": 0.8629567623138428, + "step": 971 + }, + { + "epoch": 3.056603773584906, + "grad_norm": 0.22222645580768585, + "learning_rate": 3.329638892614587e-05, + "loss": 0.3041, + "mean_token_accuracy": 0.8669219017028809, + "step": 972 + }, + { + "epoch": 3.059748427672956, + "grad_norm": 0.1965298056602478, + "learning_rate": 3.328210738256215e-05, + "loss": 0.3232, + "mean_token_accuracy": 0.8628347516059875, + "step": 973 + }, + { + "epoch": 3.0628930817610063, + "grad_norm": 0.19745439291000366, + "learning_rate": 3.326781413155846e-05, + "loss": 0.3271, + "mean_token_accuracy": 0.8626418113708496, + "step": 974 + }, + { + "epoch": 3.0660377358490565, + "grad_norm": 0.16124515235424042, + "learning_rate": 3.325350918796686e-05, + "loss": 0.3434, + "mean_token_accuracy": 0.8579691648483276, + "step": 975 + }, + { + "epoch": 3.069182389937107, + "grad_norm": 0.1931421458721161, + "learning_rate": 3.323919256663157e-05, + "loss": 0.3245, + "mean_token_accuracy": 0.8611186146736145, + "step": 976 + }, + { + "epoch": 3.0723270440251573, + "grad_norm": 0.18141953647136688, + "learning_rate": 3.3224864282408925e-05, + "loss": 0.3094, + "mean_token_accuracy": 0.8668796420097351, + "step": 977 + }, + { + "epoch": 3.0754716981132075, + "grad_norm": 0.17107489705085754, + "learning_rate": 3.3210524350167346e-05, + "loss": 0.3047, + "mean_token_accuracy": 0.8638178110122681, + "step": 978 + }, + { + "epoch": 3.0786163522012577, + "grad_norm": 0.18757247924804688, + "learning_rate": 3.319617278478735e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.8593670725822449, + "step": 979 + }, + { + "epoch": 3.0817610062893084, + "grad_norm": 0.15939109027385712, + "learning_rate": 3.3181809601161535e-05, + "loss": 0.3341, + "mean_token_accuracy": 0.8586524128913879, + "step": 980 + }, + { + "epoch": 3.0849056603773586, + "grad_norm": 0.17841064929962158, + "learning_rate": 3.316743481419454e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.8618712425231934, + "step": 981 + }, + { + "epoch": 3.088050314465409, + "grad_norm": 0.15791453421115875, + "learning_rate": 3.3153048438803064e-05, + "loss": 0.3183, + "mean_token_accuracy": 0.8625402450561523, + "step": 982 + }, + { + "epoch": 3.091194968553459, + "grad_norm": 0.16996054351329803, + "learning_rate": 3.3138650489915805e-05, + "loss": 0.3459, + "mean_token_accuracy": 0.8569095134735107, + "step": 983 + }, + { + "epoch": 3.0943396226415096, + "grad_norm": 0.14582346379756927, + "learning_rate": 3.3124240982473495e-05, + "loss": 0.3199, + "mean_token_accuracy": 0.8625748157501221, + "step": 984 + }, + { + "epoch": 3.09748427672956, + "grad_norm": 0.16500362753868103, + "learning_rate": 3.3109819931428855e-05, + "loss": 0.3309, + "mean_token_accuracy": 0.8585406541824341, + "step": 985 + }, + { + "epoch": 3.10062893081761, + "grad_norm": 0.1490785777568817, + "learning_rate": 3.309538735174657e-05, + "loss": 0.3192, + "mean_token_accuracy": 0.8605947494506836, + "step": 986 + }, + { + "epoch": 3.1037735849056602, + "grad_norm": 0.1524612009525299, + "learning_rate": 3.30809432584033e-05, + "loss": 0.3223, + "mean_token_accuracy": 0.8592230677604675, + "step": 987 + }, + { + "epoch": 3.106918238993711, + "grad_norm": 0.1406700164079666, + "learning_rate": 3.306648766638765e-05, + "loss": 0.3349, + "mean_token_accuracy": 0.8577442169189453, + "step": 988 + }, + { + "epoch": 3.110062893081761, + "grad_norm": 0.15463581681251526, + "learning_rate": 3.3052020590700174e-05, + "loss": 0.3305, + "mean_token_accuracy": 0.8602052927017212, + "step": 989 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 0.1442749798297882, + "learning_rate": 3.30375420463533e-05, + "loss": 0.3205, + "mean_token_accuracy": 0.8626676201820374, + "step": 990 + }, + { + "epoch": 3.1163522012578615, + "grad_norm": 0.14940737187862396, + "learning_rate": 3.302305204837141e-05, + "loss": 0.3072, + "mean_token_accuracy": 0.8642657995223999, + "step": 991 + }, + { + "epoch": 3.119496855345912, + "grad_norm": 0.1608681082725525, + "learning_rate": 3.300855061179074e-05, + "loss": 0.3054, + "mean_token_accuracy": 0.8632607460021973, + "step": 992 + }, + { + "epoch": 3.1226415094339623, + "grad_norm": 0.1494629681110382, + "learning_rate": 3.2994037751659386e-05, + "loss": 0.3224, + "mean_token_accuracy": 0.8599094152450562, + "step": 993 + }, + { + "epoch": 3.1257861635220126, + "grad_norm": 0.1440831571817398, + "learning_rate": 3.297951348303733e-05, + "loss": 0.3369, + "mean_token_accuracy": 0.8589987754821777, + "step": 994 + }, + { + "epoch": 3.1289308176100628, + "grad_norm": 0.16431039571762085, + "learning_rate": 3.296497782099637e-05, + "loss": 0.3264, + "mean_token_accuracy": 0.8605884313583374, + "step": 995 + }, + { + "epoch": 3.1320754716981134, + "grad_norm": 0.14332625269889832, + "learning_rate": 3.2950430780620134e-05, + "loss": 0.3078, + "mean_token_accuracy": 0.8648683428764343, + "step": 996 + }, + { + "epoch": 3.1352201257861636, + "grad_norm": 0.1832369714975357, + "learning_rate": 3.293587237700405e-05, + "loss": 0.3178, + "mean_token_accuracy": 0.8615559339523315, + "step": 997 + }, + { + "epoch": 3.138364779874214, + "grad_norm": 0.14069117605686188, + "learning_rate": 3.292130262525536e-05, + "loss": 0.3123, + "mean_token_accuracy": 0.8634822368621826, + "step": 998 + }, + { + "epoch": 3.141509433962264, + "grad_norm": 0.16523166000843048, + "learning_rate": 3.290672154049306e-05, + "loss": 0.3357, + "mean_token_accuracy": 0.8579123616218567, + "step": 999 + }, + { + "epoch": 3.1446540880503147, + "grad_norm": 0.14075875282287598, + "learning_rate": 3.2892129137847886e-05, + "loss": 0.3249, + "mean_token_accuracy": 0.8633124828338623, + "step": 1000 + }, + { + "epoch": 3.147798742138365, + "grad_norm": 0.14860154688358307, + "learning_rate": 3.287752543246238e-05, + "loss": 0.3179, + "mean_token_accuracy": 0.8615362048149109, + "step": 1001 + }, + { + "epoch": 3.150943396226415, + "grad_norm": 0.15627436339855194, + "learning_rate": 3.2862910439490764e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.8606275320053101, + "step": 1002 + }, + { + "epoch": 3.1540880503144653, + "grad_norm": 0.13712136447429657, + "learning_rate": 3.284828417409899e-05, + "loss": 0.3133, + "mean_token_accuracy": 0.8660383820533752, + "step": 1003 + }, + { + "epoch": 3.157232704402516, + "grad_norm": 0.16311588883399963, + "learning_rate": 3.283364665146469e-05, + "loss": 0.3095, + "mean_token_accuracy": 0.8647195100784302, + "step": 1004 + }, + { + "epoch": 3.160377358490566, + "grad_norm": 0.1587604582309723, + "learning_rate": 3.2818997886777214e-05, + "loss": 0.3333, + "mean_token_accuracy": 0.8577799201011658, + "step": 1005 + }, + { + "epoch": 3.1635220125786163, + "grad_norm": 0.14171631634235382, + "learning_rate": 3.280433789523753e-05, + "loss": 0.3386, + "mean_token_accuracy": 0.8587863445281982, + "step": 1006 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.14268644154071808, + "learning_rate": 3.2789666692058304e-05, + "loss": 0.3312, + "mean_token_accuracy": 0.8618650436401367, + "step": 1007 + }, + { + "epoch": 3.169811320754717, + "grad_norm": 0.1492924988269806, + "learning_rate": 3.277498429246381e-05, + "loss": 0.3347, + "mean_token_accuracy": 0.8577045798301697, + "step": 1008 + }, + { + "epoch": 3.1729559748427674, + "grad_norm": 0.13884948194026947, + "learning_rate": 3.276029071168993e-05, + "loss": 0.3257, + "mean_token_accuracy": 0.8589639067649841, + "step": 1009 + }, + { + "epoch": 3.1761006289308176, + "grad_norm": 0.17117413878440857, + "learning_rate": 3.2745585964984175e-05, + "loss": 0.3281, + "mean_token_accuracy": 0.8598911166191101, + "step": 1010 + }, + { + "epoch": 3.1792452830188678, + "grad_norm": 0.15213529765605927, + "learning_rate": 3.273087006760563e-05, + "loss": 0.3272, + "mean_token_accuracy": 0.8612444996833801, + "step": 1011 + }, + { + "epoch": 3.1823899371069184, + "grad_norm": 0.13560143113136292, + "learning_rate": 3.271614303482494e-05, + "loss": 0.3371, + "mean_token_accuracy": 0.8586222529411316, + "step": 1012 + }, + { + "epoch": 3.1855345911949686, + "grad_norm": 0.15628263354301453, + "learning_rate": 3.270140488192434e-05, + "loss": 0.3292, + "mean_token_accuracy": 0.8595730662345886, + "step": 1013 + }, + { + "epoch": 3.188679245283019, + "grad_norm": 0.14781375229358673, + "learning_rate": 3.268665562419756e-05, + "loss": 0.3286, + "mean_token_accuracy": 0.8589670658111572, + "step": 1014 + }, + { + "epoch": 3.191823899371069, + "grad_norm": 0.15790531039237976, + "learning_rate": 3.267189527694989e-05, + "loss": 0.3274, + "mean_token_accuracy": 0.8619564175605774, + "step": 1015 + }, + { + "epoch": 3.1949685534591197, + "grad_norm": 0.14986778795719147, + "learning_rate": 3.26571238554981e-05, + "loss": 0.3204, + "mean_token_accuracy": 0.8628863096237183, + "step": 1016 + }, + { + "epoch": 3.19811320754717, + "grad_norm": 0.16138505935668945, + "learning_rate": 3.264234137517047e-05, + "loss": 0.3368, + "mean_token_accuracy": 0.8585647344589233, + "step": 1017 + }, + { + "epoch": 3.20125786163522, + "grad_norm": 0.1323172003030777, + "learning_rate": 3.262754785130676e-05, + "loss": 0.3126, + "mean_token_accuracy": 0.8637034893035889, + "step": 1018 + }, + { + "epoch": 3.2044025157232703, + "grad_norm": 0.15144748985767365, + "learning_rate": 3.261274329925817e-05, + "loss": 0.3142, + "mean_token_accuracy": 0.863036036491394, + "step": 1019 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 0.15019363164901733, + "learning_rate": 3.259792773438734e-05, + "loss": 0.3355, + "mean_token_accuracy": 0.8558260202407837, + "step": 1020 + }, + { + "epoch": 3.210691823899371, + "grad_norm": 0.14695622026920319, + "learning_rate": 3.2583101172068366e-05, + "loss": 0.3308, + "mean_token_accuracy": 0.8598349690437317, + "step": 1021 + }, + { + "epoch": 3.2138364779874213, + "grad_norm": 0.15687057375907898, + "learning_rate": 3.256826362768675e-05, + "loss": 0.3112, + "mean_token_accuracy": 0.8649905323982239, + "step": 1022 + }, + { + "epoch": 3.2169811320754715, + "grad_norm": 0.15487264096736908, + "learning_rate": 3.255341511663937e-05, + "loss": 0.3235, + "mean_token_accuracy": 0.8581811785697937, + "step": 1023 + }, + { + "epoch": 3.220125786163522, + "grad_norm": 0.15453389286994934, + "learning_rate": 3.253855565433449e-05, + "loss": 0.3157, + "mean_token_accuracy": 0.860891580581665, + "step": 1024 + }, + { + "epoch": 3.2232704402515724, + "grad_norm": 0.1621147245168686, + "learning_rate": 3.2523685256191756e-05, + "loss": 0.3167, + "mean_token_accuracy": 0.8601864576339722, + "step": 1025 + }, + { + "epoch": 3.2264150943396226, + "grad_norm": 0.14345519244670868, + "learning_rate": 3.250880393764215e-05, + "loss": 0.314, + "mean_token_accuracy": 0.861411988735199, + "step": 1026 + }, + { + "epoch": 3.229559748427673, + "grad_norm": 0.1536128968000412, + "learning_rate": 3.249391171412798e-05, + "loss": 0.3488, + "mean_token_accuracy": 0.8550587296485901, + "step": 1027 + }, + { + "epoch": 3.2327044025157234, + "grad_norm": 0.15043888986110687, + "learning_rate": 3.247900860110288e-05, + "loss": 0.3356, + "mean_token_accuracy": 0.8564497232437134, + "step": 1028 + }, + { + "epoch": 3.2358490566037736, + "grad_norm": 0.1707436740398407, + "learning_rate": 3.246409461403178e-05, + "loss": 0.3344, + "mean_token_accuracy": 0.8586674928665161, + "step": 1029 + }, + { + "epoch": 3.238993710691824, + "grad_norm": 0.152404323220253, + "learning_rate": 3.244916976839089e-05, + "loss": 0.3287, + "mean_token_accuracy": 0.8581715822219849, + "step": 1030 + }, + { + "epoch": 3.242138364779874, + "grad_norm": 0.1631406843662262, + "learning_rate": 3.243423407966769e-05, + "loss": 0.3234, + "mean_token_accuracy": 0.8623092770576477, + "step": 1031 + }, + { + "epoch": 3.2452830188679247, + "grad_norm": 0.15967321395874023, + "learning_rate": 3.241928756336093e-05, + "loss": 0.3226, + "mean_token_accuracy": 0.8575816750526428, + "step": 1032 + }, + { + "epoch": 3.248427672955975, + "grad_norm": 0.14411385357379913, + "learning_rate": 3.240433023498056e-05, + "loss": 0.3309, + "mean_token_accuracy": 0.8565492630004883, + "step": 1033 + }, + { + "epoch": 3.251572327044025, + "grad_norm": 0.16305826604366302, + "learning_rate": 3.238936211004779e-05, + "loss": 0.3238, + "mean_token_accuracy": 0.8587154150009155, + "step": 1034 + }, + { + "epoch": 3.2547169811320753, + "grad_norm": 0.14919938147068024, + "learning_rate": 3.237438320409499e-05, + "loss": 0.3415, + "mean_token_accuracy": 0.8566460609436035, + "step": 1035 + }, + { + "epoch": 3.257861635220126, + "grad_norm": 0.15982939302921295, + "learning_rate": 3.2359393532665744e-05, + "loss": 0.3393, + "mean_token_accuracy": 0.8567012548446655, + "step": 1036 + }, + { + "epoch": 3.261006289308176, + "grad_norm": 0.15051038563251495, + "learning_rate": 3.234439311131483e-05, + "loss": 0.3311, + "mean_token_accuracy": 0.8606399297714233, + "step": 1037 + }, + { + "epoch": 3.2641509433962264, + "grad_norm": 0.15478473901748657, + "learning_rate": 3.232938195560812e-05, + "loss": 0.3029, + "mean_token_accuracy": 0.8636201620101929, + "step": 1038 + }, + { + "epoch": 3.2672955974842766, + "grad_norm": 0.16665510833263397, + "learning_rate": 3.231436008112268e-05, + "loss": 0.3252, + "mean_token_accuracy": 0.8577843904495239, + "step": 1039 + }, + { + "epoch": 3.270440251572327, + "grad_norm": 0.14956386387348175, + "learning_rate": 3.2299327503446675e-05, + "loss": 0.3244, + "mean_token_accuracy": 0.8613575100898743, + "step": 1040 + }, + { + "epoch": 3.2735849056603774, + "grad_norm": 0.15692093968391418, + "learning_rate": 3.2284284238179385e-05, + "loss": 0.3207, + "mean_token_accuracy": 0.8585643172264099, + "step": 1041 + }, + { + "epoch": 3.2767295597484276, + "grad_norm": 0.14993791282176971, + "learning_rate": 3.226923030093117e-05, + "loss": 0.3244, + "mean_token_accuracy": 0.8617245554924011, + "step": 1042 + }, + { + "epoch": 3.279874213836478, + "grad_norm": 0.14964789152145386, + "learning_rate": 3.225416570732346e-05, + "loss": 0.3282, + "mean_token_accuracy": 0.8592199087142944, + "step": 1043 + }, + { + "epoch": 3.2830188679245285, + "grad_norm": 0.1455584019422531, + "learning_rate": 3.2239090472988775e-05, + "loss": 0.323, + "mean_token_accuracy": 0.8582662343978882, + "step": 1044 + }, + { + "epoch": 3.2861635220125787, + "grad_norm": 0.14278201758861542, + "learning_rate": 3.222400461357064e-05, + "loss": 0.3016, + "mean_token_accuracy": 0.8661797046661377, + "step": 1045 + }, + { + "epoch": 3.289308176100629, + "grad_norm": 0.13362888991832733, + "learning_rate": 3.220890814472363e-05, + "loss": 0.3325, + "mean_token_accuracy": 0.8578916788101196, + "step": 1046 + }, + { + "epoch": 3.292452830188679, + "grad_norm": 0.14459533989429474, + "learning_rate": 3.219380108211331e-05, + "loss": 0.3237, + "mean_token_accuracy": 0.8623485565185547, + "step": 1047 + }, + { + "epoch": 3.2955974842767297, + "grad_norm": 0.13775929808616638, + "learning_rate": 3.217868344141625e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.8596442341804504, + "step": 1048 + }, + { + "epoch": 3.29874213836478, + "grad_norm": 0.12983611226081848, + "learning_rate": 3.2163555238320006e-05, + "loss": 0.3241, + "mean_token_accuracy": 0.8623194694519043, + "step": 1049 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 0.13327816128730774, + "learning_rate": 3.214841648852308e-05, + "loss": 0.3283, + "mean_token_accuracy": 0.8586874604225159, + "step": 1050 + }, + { + "epoch": 3.3050314465408803, + "grad_norm": 0.14740483462810516, + "learning_rate": 3.21332672077349e-05, + "loss": 0.3289, + "mean_token_accuracy": 0.8617115616798401, + "step": 1051 + }, + { + "epoch": 3.308176100628931, + "grad_norm": 0.1283697932958603, + "learning_rate": 3.211810741167588e-05, + "loss": 0.3175, + "mean_token_accuracy": 0.8588584065437317, + "step": 1052 + }, + { + "epoch": 3.311320754716981, + "grad_norm": 0.15197114646434784, + "learning_rate": 3.210293711607729e-05, + "loss": 0.3347, + "mean_token_accuracy": 0.8559669852256775, + "step": 1053 + }, + { + "epoch": 3.3144654088050314, + "grad_norm": 0.13700467348098755, + "learning_rate": 3.2087756336681306e-05, + "loss": 0.325, + "mean_token_accuracy": 0.8605079054832458, + "step": 1054 + }, + { + "epoch": 3.3176100628930816, + "grad_norm": 0.13200706243515015, + "learning_rate": 3.207256508924101e-05, + "loss": 0.3317, + "mean_token_accuracy": 0.8585512638092041, + "step": 1055 + }, + { + "epoch": 3.3207547169811322, + "grad_norm": 0.1488047093153, + "learning_rate": 3.2057363389520326e-05, + "loss": 0.3381, + "mean_token_accuracy": 0.8565067052841187, + "step": 1056 + }, + { + "epoch": 3.3238993710691824, + "grad_norm": 0.1520201414823532, + "learning_rate": 3.204215125329402e-05, + "loss": 0.3342, + "mean_token_accuracy": 0.8572053909301758, + "step": 1057 + }, + { + "epoch": 3.3270440251572326, + "grad_norm": 0.17158624529838562, + "learning_rate": 3.20269286963477e-05, + "loss": 0.3316, + "mean_token_accuracy": 0.858790934085846, + "step": 1058 + }, + { + "epoch": 3.330188679245283, + "grad_norm": 0.14773574471473694, + "learning_rate": 3.2011695734477776e-05, + "loss": 0.3207, + "mean_token_accuracy": 0.8638202548027039, + "step": 1059 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.17465537786483765, + "learning_rate": 3.199645238349146e-05, + "loss": 0.321, + "mean_token_accuracy": 0.8613391518592834, + "step": 1060 + }, + { + "epoch": 3.3364779874213837, + "grad_norm": 0.13760405778884888, + "learning_rate": 3.198119865920677e-05, + "loss": 0.3217, + "mean_token_accuracy": 0.8583642244338989, + "step": 1061 + }, + { + "epoch": 3.339622641509434, + "grad_norm": 0.15301382541656494, + "learning_rate": 3.196593457745243e-05, + "loss": 0.3197, + "mean_token_accuracy": 0.8593172430992126, + "step": 1062 + }, + { + "epoch": 3.342767295597484, + "grad_norm": 0.1372508555650711, + "learning_rate": 3.195066015406797e-05, + "loss": 0.3318, + "mean_token_accuracy": 0.8575015664100647, + "step": 1063 + }, + { + "epoch": 3.3459119496855347, + "grad_norm": 0.14857687056064606, + "learning_rate": 3.193537540490363e-05, + "loss": 0.3316, + "mean_token_accuracy": 0.8603220582008362, + "step": 1064 + }, + { + "epoch": 3.349056603773585, + "grad_norm": 0.12988995015621185, + "learning_rate": 3.192008034582034e-05, + "loss": 0.3467, + "mean_token_accuracy": 0.8531395792961121, + "step": 1065 + }, + { + "epoch": 3.352201257861635, + "grad_norm": 0.15077495574951172, + "learning_rate": 3.190477499268978e-05, + "loss": 0.3352, + "mean_token_accuracy": 0.8567934632301331, + "step": 1066 + }, + { + "epoch": 3.3553459119496853, + "grad_norm": 0.14799143373966217, + "learning_rate": 3.1889459361394266e-05, + "loss": 0.2899, + "mean_token_accuracy": 0.8670781850814819, + "step": 1067 + }, + { + "epoch": 3.358490566037736, + "grad_norm": 0.1631443202495575, + "learning_rate": 3.1874133467826804e-05, + "loss": 0.3139, + "mean_token_accuracy": 0.8630610108375549, + "step": 1068 + }, + { + "epoch": 3.361635220125786, + "grad_norm": 0.1495402604341507, + "learning_rate": 3.1858797327891034e-05, + "loss": 0.3071, + "mean_token_accuracy": 0.8628964424133301, + "step": 1069 + }, + { + "epoch": 3.3647798742138364, + "grad_norm": 0.1613866537809372, + "learning_rate": 3.1843450957501254e-05, + "loss": 0.3187, + "mean_token_accuracy": 0.8615692257881165, + "step": 1070 + }, + { + "epoch": 3.3679245283018866, + "grad_norm": 0.15920579433441162, + "learning_rate": 3.182809437258235e-05, + "loss": 0.3149, + "mean_token_accuracy": 0.862976610660553, + "step": 1071 + }, + { + "epoch": 3.3710691823899372, + "grad_norm": 0.15265892446041107, + "learning_rate": 3.181272758906982e-05, + "loss": 0.3171, + "mean_token_accuracy": 0.8599149584770203, + "step": 1072 + }, + { + "epoch": 3.3742138364779874, + "grad_norm": 0.15581786632537842, + "learning_rate": 3.179735062290974e-05, + "loss": 0.3255, + "mean_token_accuracy": 0.8590546250343323, + "step": 1073 + }, + { + "epoch": 3.3773584905660377, + "grad_norm": 0.13360513746738434, + "learning_rate": 3.178196349005877e-05, + "loss": 0.3146, + "mean_token_accuracy": 0.8623788952827454, + "step": 1074 + }, + { + "epoch": 3.380503144654088, + "grad_norm": 0.1339465230703354, + "learning_rate": 3.17665662064841e-05, + "loss": 0.3545, + "mean_token_accuracy": 0.852672278881073, + "step": 1075 + }, + { + "epoch": 3.3836477987421385, + "grad_norm": 0.1387154757976532, + "learning_rate": 3.175115878816346e-05, + "loss": 0.3286, + "mean_token_accuracy": 0.8580860495567322, + "step": 1076 + }, + { + "epoch": 3.3867924528301887, + "grad_norm": 0.14403823018074036, + "learning_rate": 3.173574125108508e-05, + "loss": 0.3441, + "mean_token_accuracy": 0.8584855794906616, + "step": 1077 + }, + { + "epoch": 3.389937106918239, + "grad_norm": 0.15723450481891632, + "learning_rate": 3.172031361124774e-05, + "loss": 0.3252, + "mean_token_accuracy": 0.8592239022254944, + "step": 1078 + }, + { + "epoch": 3.3930817610062896, + "grad_norm": 0.13823583722114563, + "learning_rate": 3.1704875884660645e-05, + "loss": 0.3227, + "mean_token_accuracy": 0.860748291015625, + "step": 1079 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 0.17623189091682434, + "learning_rate": 3.16894280873435e-05, + "loss": 0.3334, + "mean_token_accuracy": 0.8578466773033142, + "step": 1080 + }, + { + "epoch": 3.39937106918239, + "grad_norm": 0.15371590852737427, + "learning_rate": 3.1673970235326454e-05, + "loss": 0.3171, + "mean_token_accuracy": 0.8577912449836731, + "step": 1081 + }, + { + "epoch": 3.40251572327044, + "grad_norm": 0.16374284029006958, + "learning_rate": 3.165850234465009e-05, + "loss": 0.3333, + "mean_token_accuracy": 0.8590589761734009, + "step": 1082 + }, + { + "epoch": 3.4056603773584904, + "grad_norm": 0.1451641023159027, + "learning_rate": 3.16430244313654e-05, + "loss": 0.3259, + "mean_token_accuracy": 0.8558355569839478, + "step": 1083 + }, + { + "epoch": 3.408805031446541, + "grad_norm": 0.1551799774169922, + "learning_rate": 3.1627536511533795e-05, + "loss": 0.3339, + "mean_token_accuracy": 0.8557202219963074, + "step": 1084 + }, + { + "epoch": 3.411949685534591, + "grad_norm": 0.12902113795280457, + "learning_rate": 3.1612038601227054e-05, + "loss": 0.3279, + "mean_token_accuracy": 0.8582240343093872, + "step": 1085 + }, + { + "epoch": 3.4150943396226414, + "grad_norm": 0.15876270830631256, + "learning_rate": 3.159653071652732e-05, + "loss": 0.3307, + "mean_token_accuracy": 0.8565781712532043, + "step": 1086 + }, + { + "epoch": 3.418238993710692, + "grad_norm": 0.12267336994409561, + "learning_rate": 3.1581012873527095e-05, + "loss": 0.327, + "mean_token_accuracy": 0.8610835671424866, + "step": 1087 + }, + { + "epoch": 3.4213836477987423, + "grad_norm": 0.15954187512397766, + "learning_rate": 3.156548508832922e-05, + "loss": 0.3304, + "mean_token_accuracy": 0.8586665391921997, + "step": 1088 + }, + { + "epoch": 3.4245283018867925, + "grad_norm": 0.1352037936449051, + "learning_rate": 3.154994737704684e-05, + "loss": 0.3317, + "mean_token_accuracy": 0.8586193919181824, + "step": 1089 + }, + { + "epoch": 3.4276729559748427, + "grad_norm": 0.15149874985218048, + "learning_rate": 3.153439975580341e-05, + "loss": 0.3375, + "mean_token_accuracy": 0.854519248008728, + "step": 1090 + }, + { + "epoch": 3.430817610062893, + "grad_norm": 0.13168424367904663, + "learning_rate": 3.151884224073267e-05, + "loss": 0.332, + "mean_token_accuracy": 0.8577566742897034, + "step": 1091 + }, + { + "epoch": 3.4339622641509435, + "grad_norm": 0.13555015623569489, + "learning_rate": 3.150327484797861e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.8604238033294678, + "step": 1092 + }, + { + "epoch": 3.4371069182389937, + "grad_norm": 0.132098987698555, + "learning_rate": 3.1487697593695495e-05, + "loss": 0.3232, + "mean_token_accuracy": 0.8571335077285767, + "step": 1093 + }, + { + "epoch": 3.440251572327044, + "grad_norm": 0.12883129715919495, + "learning_rate": 3.147211049404779e-05, + "loss": 0.3271, + "mean_token_accuracy": 0.8585851192474365, + "step": 1094 + }, + { + "epoch": 3.4433962264150946, + "grad_norm": 0.12580397725105286, + "learning_rate": 3.145651356521022e-05, + "loss": 0.3376, + "mean_token_accuracy": 0.8587803840637207, + "step": 1095 + }, + { + "epoch": 3.4465408805031448, + "grad_norm": 0.13099335134029388, + "learning_rate": 3.1440906823367676e-05, + "loss": 0.3191, + "mean_token_accuracy": 0.8609188795089722, + "step": 1096 + }, + { + "epoch": 3.449685534591195, + "grad_norm": 0.1349143236875534, + "learning_rate": 3.142529028471525e-05, + "loss": 0.3198, + "mean_token_accuracy": 0.8595296144485474, + "step": 1097 + }, + { + "epoch": 3.452830188679245, + "grad_norm": 0.14235574007034302, + "learning_rate": 3.140966396545817e-05, + "loss": 0.3402, + "mean_token_accuracy": 0.854846179485321, + "step": 1098 + }, + { + "epoch": 3.4559748427672954, + "grad_norm": 0.12889491021633148, + "learning_rate": 3.139402788181186e-05, + "loss": 0.3474, + "mean_token_accuracy": 0.8538563847541809, + "step": 1099 + }, + { + "epoch": 3.459119496855346, + "grad_norm": 0.15164689719676971, + "learning_rate": 3.137838205000185e-05, + "loss": 0.3369, + "mean_token_accuracy": 0.8573994040489197, + "step": 1100 + }, + { + "epoch": 3.4622641509433962, + "grad_norm": 0.1371922492980957, + "learning_rate": 3.136272648626377e-05, + "loss": 0.3334, + "mean_token_accuracy": 0.8574711084365845, + "step": 1101 + }, + { + "epoch": 3.4654088050314464, + "grad_norm": 0.1531357765197754, + "learning_rate": 3.1347061206843376e-05, + "loss": 0.329, + "mean_token_accuracy": 0.8560079336166382, + "step": 1102 + }, + { + "epoch": 3.468553459119497, + "grad_norm": 0.15599913895130157, + "learning_rate": 3.133138622799651e-05, + "loss": 0.3245, + "mean_token_accuracy": 0.8592775464057922, + "step": 1103 + }, + { + "epoch": 3.4716981132075473, + "grad_norm": 0.13518376648426056, + "learning_rate": 3.131570156598905e-05, + "loss": 0.325, + "mean_token_accuracy": 0.8589106798171997, + "step": 1104 + }, + { + "epoch": 3.4748427672955975, + "grad_norm": 0.15266206860542297, + "learning_rate": 3.1300007237096944e-05, + "loss": 0.3346, + "mean_token_accuracy": 0.857874870300293, + "step": 1105 + }, + { + "epoch": 3.4779874213836477, + "grad_norm": 0.1420784741640091, + "learning_rate": 3.128430325760616e-05, + "loss": 0.3296, + "mean_token_accuracy": 0.8598121404647827, + "step": 1106 + }, + { + "epoch": 3.481132075471698, + "grad_norm": 0.15022684633731842, + "learning_rate": 3.126858964381269e-05, + "loss": 0.3211, + "mean_token_accuracy": 0.8595227003097534, + "step": 1107 + }, + { + "epoch": 3.4842767295597485, + "grad_norm": 0.15471023321151733, + "learning_rate": 3.125286641202252e-05, + "loss": 0.3259, + "mean_token_accuracy": 0.8603160381317139, + "step": 1108 + }, + { + "epoch": 3.4874213836477987, + "grad_norm": 0.1372515708208084, + "learning_rate": 3.123713357855163e-05, + "loss": 0.3256, + "mean_token_accuracy": 0.8592529296875, + "step": 1109 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 0.14333589375019073, + "learning_rate": 3.122139115972591e-05, + "loss": 0.3216, + "mean_token_accuracy": 0.8592953681945801, + "step": 1110 + }, + { + "epoch": 3.4937106918238996, + "grad_norm": 0.13496816158294678, + "learning_rate": 3.120563917188127e-05, + "loss": 0.3319, + "mean_token_accuracy": 0.8571884632110596, + "step": 1111 + }, + { + "epoch": 3.49685534591195, + "grad_norm": 0.13933950662612915, + "learning_rate": 3.1189877631363514e-05, + "loss": 0.3262, + "mean_token_accuracy": 0.8581831455230713, + "step": 1112 + }, + { + "epoch": 3.5, + "grad_norm": 0.13966146111488342, + "learning_rate": 3.117410655452835e-05, + "loss": 0.3428, + "mean_token_accuracy": 0.8562673926353455, + "step": 1113 + }, + { + "epoch": 3.50314465408805, + "grad_norm": 0.13303886353969574, + "learning_rate": 3.115832595774139e-05, + "loss": 0.3212, + "mean_token_accuracy": 0.8587968349456787, + "step": 1114 + }, + { + "epoch": 3.5062893081761004, + "grad_norm": 0.14135101437568665, + "learning_rate": 3.114253585737813e-05, + "loss": 0.3278, + "mean_token_accuracy": 0.8600554466247559, + "step": 1115 + }, + { + "epoch": 3.509433962264151, + "grad_norm": 0.1329166740179062, + "learning_rate": 3.112673626982394e-05, + "loss": 0.3426, + "mean_token_accuracy": 0.853922963142395, + "step": 1116 + }, + { + "epoch": 3.5125786163522013, + "grad_norm": 0.1364695280790329, + "learning_rate": 3.1110927211474e-05, + "loss": 0.343, + "mean_token_accuracy": 0.8539784550666809, + "step": 1117 + }, + { + "epoch": 3.5157232704402515, + "grad_norm": 0.1294035166501999, + "learning_rate": 3.109510869873335e-05, + "loss": 0.3456, + "mean_token_accuracy": 0.8523107767105103, + "step": 1118 + }, + { + "epoch": 3.518867924528302, + "grad_norm": 0.1287209391593933, + "learning_rate": 3.107928074801682e-05, + "loss": 0.3331, + "mean_token_accuracy": 0.8596871495246887, + "step": 1119 + }, + { + "epoch": 3.5220125786163523, + "grad_norm": 0.12785553932189941, + "learning_rate": 3.106344337574904e-05, + "loss": 0.3308, + "mean_token_accuracy": 0.8588385581970215, + "step": 1120 + }, + { + "epoch": 3.5251572327044025, + "grad_norm": 0.14281029999256134, + "learning_rate": 3.1047596598364436e-05, + "loss": 0.3362, + "mean_token_accuracy": 0.8574225902557373, + "step": 1121 + }, + { + "epoch": 3.5283018867924527, + "grad_norm": 0.13183027505874634, + "learning_rate": 3.1031740432307164e-05, + "loss": 0.3127, + "mean_token_accuracy": 0.8617753386497498, + "step": 1122 + }, + { + "epoch": 3.531446540880503, + "grad_norm": 0.13874752819538116, + "learning_rate": 3.1015874894031144e-05, + "loss": 0.3334, + "mean_token_accuracy": 0.8580701351165771, + "step": 1123 + }, + { + "epoch": 3.5345911949685536, + "grad_norm": 0.12882445752620697, + "learning_rate": 3.1e-05, + "loss": 0.3373, + "mean_token_accuracy": 0.8580555319786072, + "step": 1124 + }, + { + "epoch": 3.5377358490566038, + "grad_norm": 0.14017626643180847, + "learning_rate": 3.0984115766687096e-05, + "loss": 0.3346, + "mean_token_accuracy": 0.8570591807365417, + "step": 1125 + }, + { + "epoch": 3.540880503144654, + "grad_norm": 0.12685616314411163, + "learning_rate": 3.0968222210575446e-05, + "loss": 0.3221, + "mean_token_accuracy": 0.8617547154426575, + "step": 1126 + }, + { + "epoch": 3.5440251572327046, + "grad_norm": 0.1380213052034378, + "learning_rate": 3.095231934815777e-05, + "loss": 0.3239, + "mean_token_accuracy": 0.8619950413703918, + "step": 1127 + }, + { + "epoch": 3.547169811320755, + "grad_norm": 0.136666402220726, + "learning_rate": 3.0936407195936444e-05, + "loss": 0.329, + "mean_token_accuracy": 0.8563064336776733, + "step": 1128 + }, + { + "epoch": 3.550314465408805, + "grad_norm": 0.1525544971227646, + "learning_rate": 3.092048577042347e-05, + "loss": 0.3268, + "mean_token_accuracy": 0.8579536080360413, + "step": 1129 + }, + { + "epoch": 3.5534591194968552, + "grad_norm": 0.14374618232250214, + "learning_rate": 3.090455508814047e-05, + "loss": 0.3191, + "mean_token_accuracy": 0.861701250076294, + "step": 1130 + }, + { + "epoch": 3.5566037735849054, + "grad_norm": 0.1475425511598587, + "learning_rate": 3.088861516561871e-05, + "loss": 0.3168, + "mean_token_accuracy": 0.8611730933189392, + "step": 1131 + }, + { + "epoch": 3.559748427672956, + "grad_norm": 0.1540241688489914, + "learning_rate": 3.087266601939898e-05, + "loss": 0.3125, + "mean_token_accuracy": 0.8643029928207397, + "step": 1132 + }, + { + "epoch": 3.5628930817610063, + "grad_norm": 0.14442327618598938, + "learning_rate": 3.0856707666031694e-05, + "loss": 0.3327, + "mean_token_accuracy": 0.8581700921058655, + "step": 1133 + }, + { + "epoch": 3.5660377358490565, + "grad_norm": 0.15339228510856628, + "learning_rate": 3.084074012207681e-05, + "loss": 0.3034, + "mean_token_accuracy": 0.8633737564086914, + "step": 1134 + }, + { + "epoch": 3.569182389937107, + "grad_norm": 0.14664144814014435, + "learning_rate": 3.0824763404103795e-05, + "loss": 0.3392, + "mean_token_accuracy": 0.8558675646781921, + "step": 1135 + }, + { + "epoch": 3.5723270440251573, + "grad_norm": 0.16255716979503632, + "learning_rate": 3.080877752869168e-05, + "loss": 0.3353, + "mean_token_accuracy": 0.8562329411506653, + "step": 1136 + }, + { + "epoch": 3.5754716981132075, + "grad_norm": 0.1473674178123474, + "learning_rate": 3.079278251242896e-05, + "loss": 0.336, + "mean_token_accuracy": 0.8567706942558289, + "step": 1137 + }, + { + "epoch": 3.5786163522012577, + "grad_norm": 0.1516093611717224, + "learning_rate": 3.0776778371913634e-05, + "loss": 0.305, + "mean_token_accuracy": 0.8626529574394226, + "step": 1138 + }, + { + "epoch": 3.581761006289308, + "grad_norm": 0.1458694338798523, + "learning_rate": 3.076076512375317e-05, + "loss": 0.3435, + "mean_token_accuracy": 0.8568443059921265, + "step": 1139 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 0.14711114764213562, + "learning_rate": 3.0744742784564476e-05, + "loss": 0.3441, + "mean_token_accuracy": 0.8580071926116943, + "step": 1140 + }, + { + "epoch": 3.588050314465409, + "grad_norm": 0.1530224233865738, + "learning_rate": 3.0728711370973915e-05, + "loss": 0.3328, + "mean_token_accuracy": 0.8571670055389404, + "step": 1141 + }, + { + "epoch": 3.591194968553459, + "grad_norm": 0.1374947428703308, + "learning_rate": 3.071267089961724e-05, + "loss": 0.3153, + "mean_token_accuracy": 0.8622951507568359, + "step": 1142 + }, + { + "epoch": 3.5943396226415096, + "grad_norm": 0.15225058794021606, + "learning_rate": 3.069662138713962e-05, + "loss": 0.3218, + "mean_token_accuracy": 0.8582719564437866, + "step": 1143 + }, + { + "epoch": 3.59748427672956, + "grad_norm": 0.13568712770938873, + "learning_rate": 3.0680562850195594e-05, + "loss": 0.3311, + "mean_token_accuracy": 0.8576697707176208, + "step": 1144 + }, + { + "epoch": 3.60062893081761, + "grad_norm": 0.16287925839424133, + "learning_rate": 3.0664495305449084e-05, + "loss": 0.3225, + "mean_token_accuracy": 0.8594067692756653, + "step": 1145 + }, + { + "epoch": 3.6037735849056602, + "grad_norm": 0.14078298211097717, + "learning_rate": 3.064841876957335e-05, + "loss": 0.3139, + "mean_token_accuracy": 0.8635094165802002, + "step": 1146 + }, + { + "epoch": 3.6069182389937104, + "grad_norm": 0.1636054962873459, + "learning_rate": 3.0632333259250966e-05, + "loss": 0.3261, + "mean_token_accuracy": 0.8589130640029907, + "step": 1147 + }, + { + "epoch": 3.610062893081761, + "grad_norm": 0.1595996916294098, + "learning_rate": 3.0616238791173846e-05, + "loss": 0.34, + "mean_token_accuracy": 0.8539798855781555, + "step": 1148 + }, + { + "epoch": 3.6132075471698113, + "grad_norm": 0.13862352073192596, + "learning_rate": 3.060013538204319e-05, + "loss": 0.3185, + "mean_token_accuracy": 0.8629903197288513, + "step": 1149 + }, + { + "epoch": 3.6163522012578615, + "grad_norm": 0.14423629641532898, + "learning_rate": 3.058402304856946e-05, + "loss": 0.332, + "mean_token_accuracy": 0.8597278594970703, + "step": 1150 + }, + { + "epoch": 3.619496855345912, + "grad_norm": 0.1510828286409378, + "learning_rate": 3.0567901807472394e-05, + "loss": 0.3137, + "mean_token_accuracy": 0.8636487126350403, + "step": 1151 + }, + { + "epoch": 3.6226415094339623, + "grad_norm": 0.14903636276721954, + "learning_rate": 3.055177167548098e-05, + "loss": 0.3312, + "mean_token_accuracy": 0.8585656881332397, + "step": 1152 + }, + { + "epoch": 3.6257861635220126, + "grad_norm": 0.1466004103422165, + "learning_rate": 3.053563266933343e-05, + "loss": 0.3295, + "mean_token_accuracy": 0.8588059544563293, + "step": 1153 + }, + { + "epoch": 3.6289308176100628, + "grad_norm": 0.14094513654708862, + "learning_rate": 3.051948480577714e-05, + "loss": 0.3323, + "mean_token_accuracy": 0.8572471737861633, + "step": 1154 + }, + { + "epoch": 3.632075471698113, + "grad_norm": 0.13098470866680145, + "learning_rate": 3.0503328101568713e-05, + "loss": 0.336, + "mean_token_accuracy": 0.8568713665008545, + "step": 1155 + }, + { + "epoch": 3.6352201257861636, + "grad_norm": 0.1380501538515091, + "learning_rate": 3.0487162573473944e-05, + "loss": 0.3424, + "mean_token_accuracy": 0.8545725345611572, + "step": 1156 + }, + { + "epoch": 3.638364779874214, + "grad_norm": 0.13335196673870087, + "learning_rate": 3.047098823826776e-05, + "loss": 0.3258, + "mean_token_accuracy": 0.8585770130157471, + "step": 1157 + }, + { + "epoch": 3.641509433962264, + "grad_norm": 0.14133410155773163, + "learning_rate": 3.0454805112734243e-05, + "loss": 0.3433, + "mean_token_accuracy": 0.8555521965026855, + "step": 1158 + }, + { + "epoch": 3.6446540880503147, + "grad_norm": 0.1298840194940567, + "learning_rate": 3.043861321366658e-05, + "loss": 0.3353, + "mean_token_accuracy": 0.859320342540741, + "step": 1159 + }, + { + "epoch": 3.647798742138365, + "grad_norm": 0.1415969878435135, + "learning_rate": 3.042241255786708e-05, + "loss": 0.3397, + "mean_token_accuracy": 0.8569203615188599, + "step": 1160 + }, + { + "epoch": 3.650943396226415, + "grad_norm": 0.1243518739938736, + "learning_rate": 3.040620316214713e-05, + "loss": 0.3227, + "mean_token_accuracy": 0.86089026927948, + "step": 1161 + }, + { + "epoch": 3.6540880503144653, + "grad_norm": 0.16862568259239197, + "learning_rate": 3.0389985043327183e-05, + "loss": 0.35, + "mean_token_accuracy": 0.8550198674201965, + "step": 1162 + }, + { + "epoch": 3.6572327044025155, + "grad_norm": 0.1371821016073227, + "learning_rate": 3.0373758218236747e-05, + "loss": 0.3112, + "mean_token_accuracy": 0.8630196452140808, + "step": 1163 + }, + { + "epoch": 3.660377358490566, + "grad_norm": 0.14441686868667603, + "learning_rate": 3.0357522703714365e-05, + "loss": 0.3322, + "mean_token_accuracy": 0.8588864207267761, + "step": 1164 + }, + { + "epoch": 3.6635220125786163, + "grad_norm": 0.14246846735477448, + "learning_rate": 3.03412785166076e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.8609263300895691, + "step": 1165 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.14187541604042053, + "learning_rate": 3.032502567377302e-05, + "loss": 0.3287, + "mean_token_accuracy": 0.8563394546508789, + "step": 1166 + }, + { + "epoch": 3.669811320754717, + "grad_norm": 0.1399676501750946, + "learning_rate": 3.030876419207616e-05, + "loss": 0.3193, + "mean_token_accuracy": 0.8611453771591187, + "step": 1167 + }, + { + "epoch": 3.6729559748427674, + "grad_norm": 0.15003567934036255, + "learning_rate": 3.0292494088391522e-05, + "loss": 0.3436, + "mean_token_accuracy": 0.8553731441497803, + "step": 1168 + }, + { + "epoch": 3.6761006289308176, + "grad_norm": 0.1415679007768631, + "learning_rate": 3.027621537960257e-05, + "loss": 0.3343, + "mean_token_accuracy": 0.8586793541908264, + "step": 1169 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 0.13909806311130524, + "learning_rate": 3.0259928082601675e-05, + "loss": 0.3342, + "mean_token_accuracy": 0.8584513664245605, + "step": 1170 + }, + { + "epoch": 3.682389937106918, + "grad_norm": 0.1442382037639618, + "learning_rate": 3.0243632214290137e-05, + "loss": 0.3401, + "mean_token_accuracy": 0.854636549949646, + "step": 1171 + }, + { + "epoch": 3.6855345911949686, + "grad_norm": 0.1362895667552948, + "learning_rate": 3.022732779157815e-05, + "loss": 0.3486, + "mean_token_accuracy": 0.8539217710494995, + "step": 1172 + }, + { + "epoch": 3.688679245283019, + "grad_norm": 0.14030933380126953, + "learning_rate": 3.0211014831384773e-05, + "loss": 0.3305, + "mean_token_accuracy": 0.8588977456092834, + "step": 1173 + }, + { + "epoch": 3.691823899371069, + "grad_norm": 0.14149655401706696, + "learning_rate": 3.0194693350637943e-05, + "loss": 0.331, + "mean_token_accuracy": 0.8578828573226929, + "step": 1174 + }, + { + "epoch": 3.6949685534591197, + "grad_norm": 0.1377638727426529, + "learning_rate": 3.017836336627441e-05, + "loss": 0.3356, + "mean_token_accuracy": 0.854766845703125, + "step": 1175 + }, + { + "epoch": 3.69811320754717, + "grad_norm": 0.1417425572872162, + "learning_rate": 3.0162024895239787e-05, + "loss": 0.3372, + "mean_token_accuracy": 0.8555588126182556, + "step": 1176 + }, + { + "epoch": 3.70125786163522, + "grad_norm": 0.14174726605415344, + "learning_rate": 3.0145677954488457e-05, + "loss": 0.3338, + "mean_token_accuracy": 0.856699526309967, + "step": 1177 + }, + { + "epoch": 3.7044025157232703, + "grad_norm": 0.13998551666736603, + "learning_rate": 3.0129322560983616e-05, + "loss": 0.326, + "mean_token_accuracy": 0.8607649803161621, + "step": 1178 + }, + { + "epoch": 3.7075471698113205, + "grad_norm": 0.1254308521747589, + "learning_rate": 3.0112958731697214e-05, + "loss": 0.3512, + "mean_token_accuracy": 0.8531991243362427, + "step": 1179 + }, + { + "epoch": 3.710691823899371, + "grad_norm": 0.1437944918870926, + "learning_rate": 3.0096586483609966e-05, + "loss": 0.3354, + "mean_token_accuracy": 0.8561474680900574, + "step": 1180 + }, + { + "epoch": 3.7138364779874213, + "grad_norm": 0.1321929395198822, + "learning_rate": 3.0080205833711325e-05, + "loss": 0.3284, + "mean_token_accuracy": 0.8593660593032837, + "step": 1181 + }, + { + "epoch": 3.7169811320754715, + "grad_norm": 0.15401658415794373, + "learning_rate": 3.0063816798999456e-05, + "loss": 0.3314, + "mean_token_accuracy": 0.8591458201408386, + "step": 1182 + }, + { + "epoch": 3.720125786163522, + "grad_norm": 0.14057710766792297, + "learning_rate": 3.0047419396481227e-05, + "loss": 0.3475, + "mean_token_accuracy": 0.8535549640655518, + "step": 1183 + }, + { + "epoch": 3.7232704402515724, + "grad_norm": 0.13213631510734558, + "learning_rate": 3.0031013643172185e-05, + "loss": 0.3159, + "mean_token_accuracy": 0.8608952164649963, + "step": 1184 + }, + { + "epoch": 3.7264150943396226, + "grad_norm": 0.1514212042093277, + "learning_rate": 3.0014599556096557e-05, + "loss": 0.3384, + "mean_token_accuracy": 0.8577061295509338, + "step": 1185 + }, + { + "epoch": 3.729559748427673, + "grad_norm": 0.14160504937171936, + "learning_rate": 2.9998177152287204e-05, + "loss": 0.3297, + "mean_token_accuracy": 0.8586458563804626, + "step": 1186 + }, + { + "epoch": 3.732704402515723, + "grad_norm": 0.14224717020988464, + "learning_rate": 2.9981746448785615e-05, + "loss": 0.3203, + "mean_token_accuracy": 0.8611958622932434, + "step": 1187 + }, + { + "epoch": 3.7358490566037736, + "grad_norm": 0.12940874695777893, + "learning_rate": 2.9965307462641906e-05, + "loss": 0.3384, + "mean_token_accuracy": 0.8546769618988037, + "step": 1188 + }, + { + "epoch": 3.738993710691824, + "grad_norm": 0.1318395733833313, + "learning_rate": 2.9948860210914773e-05, + "loss": 0.3339, + "mean_token_accuracy": 0.8579802513122559, + "step": 1189 + }, + { + "epoch": 3.742138364779874, + "grad_norm": 0.14659515023231506, + "learning_rate": 2.9932404710671504e-05, + "loss": 0.3371, + "mean_token_accuracy": 0.8568556308746338, + "step": 1190 + }, + { + "epoch": 3.7452830188679247, + "grad_norm": 0.1241716668009758, + "learning_rate": 2.9915940978987926e-05, + "loss": 0.3199, + "mean_token_accuracy": 0.8621662855148315, + "step": 1191 + }, + { + "epoch": 3.748427672955975, + "grad_norm": 0.14975996315479279, + "learning_rate": 2.989946903294843e-05, + "loss": 0.3447, + "mean_token_accuracy": 0.8569864630699158, + "step": 1192 + }, + { + "epoch": 3.751572327044025, + "grad_norm": 0.12985654175281525, + "learning_rate": 2.9882988889645913e-05, + "loss": 0.3268, + "mean_token_accuracy": 0.8582627773284912, + "step": 1193 + }, + { + "epoch": 3.7547169811320753, + "grad_norm": 0.14152680337429047, + "learning_rate": 2.9866500566181802e-05, + "loss": 0.3121, + "mean_token_accuracy": 0.8632294535636902, + "step": 1194 + }, + { + "epoch": 3.757861635220126, + "grad_norm": 0.14638929069042206, + "learning_rate": 2.985000407966598e-05, + "loss": 0.3317, + "mean_token_accuracy": 0.8586151003837585, + "step": 1195 + }, + { + "epoch": 3.761006289308176, + "grad_norm": 0.1430143564939499, + "learning_rate": 2.9833499447216824e-05, + "loss": 0.3387, + "mean_token_accuracy": 0.8545314073562622, + "step": 1196 + }, + { + "epoch": 3.7641509433962264, + "grad_norm": 0.14943763613700867, + "learning_rate": 2.9816986685961156e-05, + "loss": 0.3322, + "mean_token_accuracy": 0.858390748500824, + "step": 1197 + }, + { + "epoch": 3.767295597484277, + "grad_norm": 0.14059577882289886, + "learning_rate": 2.9800465813034242e-05, + "loss": 0.3294, + "mean_token_accuracy": 0.8604702949523926, + "step": 1198 + }, + { + "epoch": 3.770440251572327, + "grad_norm": 0.15087758004665375, + "learning_rate": 2.9783936845579747e-05, + "loss": 0.3381, + "mean_token_accuracy": 0.8561389446258545, + "step": 1199 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 0.14814791083335876, + "learning_rate": 2.9767399800749755e-05, + "loss": 0.3311, + "mean_token_accuracy": 0.8588146567344666, + "step": 1200 + }, + { + "epoch": 3.7767295597484276, + "grad_norm": 0.13886821269989014, + "learning_rate": 2.9750854695704715e-05, + "loss": 0.332, + "mean_token_accuracy": 0.8596969842910767, + "step": 1201 + }, + { + "epoch": 3.779874213836478, + "grad_norm": 0.15242241322994232, + "learning_rate": 2.9734301547613465e-05, + "loss": 0.3277, + "mean_token_accuracy": 0.8580199480056763, + "step": 1202 + }, + { + "epoch": 3.7830188679245285, + "grad_norm": 0.13634704053401947, + "learning_rate": 2.971774037365316e-05, + "loss": 0.3357, + "mean_token_accuracy": 0.8593721985816956, + "step": 1203 + }, + { + "epoch": 3.7861635220125787, + "grad_norm": 0.14197935163974762, + "learning_rate": 2.9701171191009304e-05, + "loss": 0.3378, + "mean_token_accuracy": 0.8571075797080994, + "step": 1204 + }, + { + "epoch": 3.789308176100629, + "grad_norm": 0.1357307732105255, + "learning_rate": 2.96845940168757e-05, + "loss": 0.3152, + "mean_token_accuracy": 0.8626152276992798, + "step": 1205 + }, + { + "epoch": 3.7924528301886795, + "grad_norm": 0.1317468136548996, + "learning_rate": 2.966800886845445e-05, + "loss": 0.3392, + "mean_token_accuracy": 0.8555001616477966, + "step": 1206 + }, + { + "epoch": 3.7955974842767297, + "grad_norm": 0.14835762977600098, + "learning_rate": 2.9651415762955925e-05, + "loss": 0.344, + "mean_token_accuracy": 0.8575615882873535, + "step": 1207 + }, + { + "epoch": 3.79874213836478, + "grad_norm": 0.14677360653877258, + "learning_rate": 2.9634814717598762e-05, + "loss": 0.3227, + "mean_token_accuracy": 0.8616207838058472, + "step": 1208 + }, + { + "epoch": 3.80188679245283, + "grad_norm": 0.13944582641124725, + "learning_rate": 2.961820574960982e-05, + "loss": 0.3467, + "mean_token_accuracy": 0.8558523058891296, + "step": 1209 + }, + { + "epoch": 3.8050314465408803, + "grad_norm": 0.1433691531419754, + "learning_rate": 2.9601588876224215e-05, + "loss": 0.3417, + "mean_token_accuracy": 0.8568807244300842, + "step": 1210 + }, + { + "epoch": 3.808176100628931, + "grad_norm": 0.14389808475971222, + "learning_rate": 2.958496411468522e-05, + "loss": 0.3118, + "mean_token_accuracy": 0.8629230260848999, + "step": 1211 + }, + { + "epoch": 3.811320754716981, + "grad_norm": 0.14503751695156097, + "learning_rate": 2.956833148224433e-05, + "loss": 0.3369, + "mean_token_accuracy": 0.8592314720153809, + "step": 1212 + }, + { + "epoch": 3.8144654088050314, + "grad_norm": 0.14210133254528046, + "learning_rate": 2.955169099616119e-05, + "loss": 0.3301, + "mean_token_accuracy": 0.8592085242271423, + "step": 1213 + }, + { + "epoch": 3.817610062893082, + "grad_norm": 0.12544848024845123, + "learning_rate": 2.95350426737036e-05, + "loss": 0.3211, + "mean_token_accuracy": 0.8602308034896851, + "step": 1214 + }, + { + "epoch": 3.8207547169811322, + "grad_norm": 0.15146970748901367, + "learning_rate": 2.9518386532147487e-05, + "loss": 0.3319, + "mean_token_accuracy": 0.857859194278717, + "step": 1215 + }, + { + "epoch": 3.8238993710691824, + "grad_norm": 0.13156718015670776, + "learning_rate": 2.95017225887769e-05, + "loss": 0.3218, + "mean_token_accuracy": 0.8625040650367737, + "step": 1216 + }, + { + "epoch": 3.8270440251572326, + "grad_norm": 0.14363285899162292, + "learning_rate": 2.948505086088397e-05, + "loss": 0.3457, + "mean_token_accuracy": 0.8564530611038208, + "step": 1217 + }, + { + "epoch": 3.830188679245283, + "grad_norm": 0.12839969992637634, + "learning_rate": 2.9468371365768926e-05, + "loss": 0.3099, + "mean_token_accuracy": 0.8643601536750793, + "step": 1218 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.12974990904331207, + "learning_rate": 2.9451684120740038e-05, + "loss": 0.3274, + "mean_token_accuracy": 0.8582882285118103, + "step": 1219 + }, + { + "epoch": 3.8364779874213837, + "grad_norm": 0.14015363156795502, + "learning_rate": 2.943498914311364e-05, + "loss": 0.3368, + "mean_token_accuracy": 0.8556147217750549, + "step": 1220 + }, + { + "epoch": 3.839622641509434, + "grad_norm": 0.13860003650188446, + "learning_rate": 2.941828645021406e-05, + "loss": 0.3375, + "mean_token_accuracy": 0.8543303608894348, + "step": 1221 + }, + { + "epoch": 3.8427672955974845, + "grad_norm": 0.14258892834186554, + "learning_rate": 2.9401576059373656e-05, + "loss": 0.3259, + "mean_token_accuracy": 0.8594518303871155, + "step": 1222 + }, + { + "epoch": 3.8459119496855347, + "grad_norm": 0.13030867278575897, + "learning_rate": 2.9384857987932768e-05, + "loss": 0.3162, + "mean_token_accuracy": 0.8623016476631165, + "step": 1223 + }, + { + "epoch": 3.849056603773585, + "grad_norm": 0.13517674803733826, + "learning_rate": 2.9368132253239702e-05, + "loss": 0.3254, + "mean_token_accuracy": 0.8589973449707031, + "step": 1224 + }, + { + "epoch": 3.852201257861635, + "grad_norm": 0.12886177003383636, + "learning_rate": 2.935139887265072e-05, + "loss": 0.3354, + "mean_token_accuracy": 0.8530575037002563, + "step": 1225 + }, + { + "epoch": 3.8553459119496853, + "grad_norm": 0.14796893298625946, + "learning_rate": 2.9334657863530016e-05, + "loss": 0.3325, + "mean_token_accuracy": 0.8576770424842834, + "step": 1226 + }, + { + "epoch": 3.858490566037736, + "grad_norm": 0.1297404170036316, + "learning_rate": 2.9317909243249706e-05, + "loss": 0.3436, + "mean_token_accuracy": 0.8564417958259583, + "step": 1227 + }, + { + "epoch": 3.861635220125786, + "grad_norm": 0.1457653045654297, + "learning_rate": 2.9301153029189794e-05, + "loss": 0.3387, + "mean_token_accuracy": 0.8541455268859863, + "step": 1228 + }, + { + "epoch": 3.8647798742138364, + "grad_norm": 0.135748028755188, + "learning_rate": 2.928438923873816e-05, + "loss": 0.3194, + "mean_token_accuracy": 0.8600180745124817, + "step": 1229 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 0.14161410927772522, + "learning_rate": 2.926761788929058e-05, + "loss": 0.3369, + "mean_token_accuracy": 0.8556387424468994, + "step": 1230 + }, + { + "epoch": 3.8710691823899372, + "grad_norm": 0.13314643502235413, + "learning_rate": 2.9250838998250638e-05, + "loss": 0.3317, + "mean_token_accuracy": 0.8579115867614746, + "step": 1231 + }, + { + "epoch": 3.8742138364779874, + "grad_norm": 0.14150525629520416, + "learning_rate": 2.9234052583029746e-05, + "loss": 0.3222, + "mean_token_accuracy": 0.8584760427474976, + "step": 1232 + }, + { + "epoch": 3.8773584905660377, + "grad_norm": 0.13233336806297302, + "learning_rate": 2.9217258661047142e-05, + "loss": 0.3517, + "mean_token_accuracy": 0.852990448474884, + "step": 1233 + }, + { + "epoch": 3.880503144654088, + "grad_norm": 0.15107811987400055, + "learning_rate": 2.920045724972985e-05, + "loss": 0.3305, + "mean_token_accuracy": 0.8559724688529968, + "step": 1234 + }, + { + "epoch": 3.8836477987421385, + "grad_norm": 0.16093167662620544, + "learning_rate": 2.9183648366512648e-05, + "loss": 0.3315, + "mean_token_accuracy": 0.8561708927154541, + "step": 1235 + }, + { + "epoch": 3.8867924528301887, + "grad_norm": 0.14102764427661896, + "learning_rate": 2.9166832028838085e-05, + "loss": 0.332, + "mean_token_accuracy": 0.856883704662323, + "step": 1236 + }, + { + "epoch": 3.889937106918239, + "grad_norm": 0.15067261457443237, + "learning_rate": 2.915000825415644e-05, + "loss": 0.3338, + "mean_token_accuracy": 0.858379065990448, + "step": 1237 + }, + { + "epoch": 3.8930817610062896, + "grad_norm": 0.1495073437690735, + "learning_rate": 2.9133177059925715e-05, + "loss": 0.3218, + "mean_token_accuracy": 0.8599880933761597, + "step": 1238 + }, + { + "epoch": 3.8962264150943398, + "grad_norm": 0.14388205111026764, + "learning_rate": 2.9116338463611596e-05, + "loss": 0.348, + "mean_token_accuracy": 0.8538846373558044, + "step": 1239 + }, + { + "epoch": 3.89937106918239, + "grad_norm": 0.1329399049282074, + "learning_rate": 2.9099492482687478e-05, + "loss": 0.3267, + "mean_token_accuracy": 0.8591561317443848, + "step": 1240 + }, + { + "epoch": 3.90251572327044, + "grad_norm": 0.14387187361717224, + "learning_rate": 2.9082639134634378e-05, + "loss": 0.3285, + "mean_token_accuracy": 0.8577826023101807, + "step": 1241 + }, + { + "epoch": 3.9056603773584904, + "grad_norm": 0.1352883279323578, + "learning_rate": 2.9065778436941003e-05, + "loss": 0.3314, + "mean_token_accuracy": 0.8583676218986511, + "step": 1242 + }, + { + "epoch": 3.908805031446541, + "grad_norm": 0.1337868720293045, + "learning_rate": 2.904891040710365e-05, + "loss": 0.3267, + "mean_token_accuracy": 0.857197105884552, + "step": 1243 + }, + { + "epoch": 3.911949685534591, + "grad_norm": 0.13829989731311798, + "learning_rate": 2.903203506262624e-05, + "loss": 0.3357, + "mean_token_accuracy": 0.85539710521698, + "step": 1244 + }, + { + "epoch": 3.9150943396226414, + "grad_norm": 0.1476396769285202, + "learning_rate": 2.9015152421020296e-05, + "loss": 0.3344, + "mean_token_accuracy": 0.857477068901062, + "step": 1245 + }, + { + "epoch": 3.918238993710692, + "grad_norm": 0.14620055258274078, + "learning_rate": 2.899826249980489e-05, + "loss": 0.3423, + "mean_token_accuracy": 0.8560461401939392, + "step": 1246 + }, + { + "epoch": 3.9213836477987423, + "grad_norm": 0.12422426789999008, + "learning_rate": 2.898136531650666e-05, + "loss": 0.3546, + "mean_token_accuracy": 0.8542535901069641, + "step": 1247 + }, + { + "epoch": 3.9245283018867925, + "grad_norm": 0.13504689931869507, + "learning_rate": 2.8964460888659786e-05, + "loss": 0.3391, + "mean_token_accuracy": 0.858195424079895, + "step": 1248 + }, + { + "epoch": 3.9276729559748427, + "grad_norm": 0.13244280219078064, + "learning_rate": 2.8947549233805953e-05, + "loss": 0.3121, + "mean_token_accuracy": 0.8605561852455139, + "step": 1249 + }, + { + "epoch": 3.930817610062893, + "grad_norm": 0.13584861159324646, + "learning_rate": 2.893063036949435e-05, + "loss": 0.3304, + "mean_token_accuracy": 0.8578067421913147, + "step": 1250 + }, + { + "epoch": 3.9339622641509435, + "grad_norm": 0.14337222278118134, + "learning_rate": 2.891370431328165e-05, + "loss": 0.3347, + "mean_token_accuracy": 0.8570818305015564, + "step": 1251 + }, + { + "epoch": 3.9371069182389937, + "grad_norm": 0.1409681886434555, + "learning_rate": 2.8896771082731986e-05, + "loss": 0.3374, + "mean_token_accuracy": 0.8561692833900452, + "step": 1252 + }, + { + "epoch": 3.940251572327044, + "grad_norm": 0.13311010599136353, + "learning_rate": 2.8879830695416933e-05, + "loss": 0.3386, + "mean_token_accuracy": 0.8557576537132263, + "step": 1253 + }, + { + "epoch": 3.9433962264150946, + "grad_norm": 0.13869957625865936, + "learning_rate": 2.8862883168915508e-05, + "loss": 0.3236, + "mean_token_accuracy": 0.8604661822319031, + "step": 1254 + }, + { + "epoch": 3.9465408805031448, + "grad_norm": 0.15289968252182007, + "learning_rate": 2.884592852081412e-05, + "loss": 0.3256, + "mean_token_accuracy": 0.8590101003646851, + "step": 1255 + }, + { + "epoch": 3.949685534591195, + "grad_norm": 0.12691056728363037, + "learning_rate": 2.882896676870657e-05, + "loss": 0.327, + "mean_token_accuracy": 0.8594552874565125, + "step": 1256 + }, + { + "epoch": 3.952830188679245, + "grad_norm": 0.13348102569580078, + "learning_rate": 2.8811997930194032e-05, + "loss": 0.3335, + "mean_token_accuracy": 0.8576066493988037, + "step": 1257 + }, + { + "epoch": 3.9559748427672954, + "grad_norm": 0.13760992884635925, + "learning_rate": 2.8795022022885043e-05, + "loss": 0.3247, + "mean_token_accuracy": 0.8583594560623169, + "step": 1258 + }, + { + "epoch": 3.959119496855346, + "grad_norm": 0.12815621495246887, + "learning_rate": 2.8778039064395464e-05, + "loss": 0.332, + "mean_token_accuracy": 0.8571073412895203, + "step": 1259 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 0.14194735884666443, + "learning_rate": 2.8761049072348478e-05, + "loss": 0.3321, + "mean_token_accuracy": 0.8575592637062073, + "step": 1260 + }, + { + "epoch": 3.9654088050314464, + "grad_norm": 0.12652727961540222, + "learning_rate": 2.874405206437455e-05, + "loss": 0.3126, + "mean_token_accuracy": 0.8608962297439575, + "step": 1261 + }, + { + "epoch": 3.968553459119497, + "grad_norm": 0.13028152287006378, + "learning_rate": 2.8727048058111467e-05, + "loss": 0.3395, + "mean_token_accuracy": 0.8562440872192383, + "step": 1262 + }, + { + "epoch": 3.9716981132075473, + "grad_norm": 0.1258905678987503, + "learning_rate": 2.8710037071204235e-05, + "loss": 0.3402, + "mean_token_accuracy": 0.8553774356842041, + "step": 1263 + }, + { + "epoch": 3.9748427672955975, + "grad_norm": 0.1402544230222702, + "learning_rate": 2.8693019121305123e-05, + "loss": 0.3403, + "mean_token_accuracy": 0.8573205471038818, + "step": 1264 + }, + { + "epoch": 3.9779874213836477, + "grad_norm": 0.13947366178035736, + "learning_rate": 2.867599422607363e-05, + "loss": 0.3292, + "mean_token_accuracy": 0.8590210676193237, + "step": 1265 + }, + { + "epoch": 3.981132075471698, + "grad_norm": 0.13215205073356628, + "learning_rate": 2.865896240317645e-05, + "loss": 0.3266, + "mean_token_accuracy": 0.8595618605613708, + "step": 1266 + }, + { + "epoch": 3.9842767295597485, + "grad_norm": 0.1323062777519226, + "learning_rate": 2.8641923670287465e-05, + "loss": 0.3276, + "mean_token_accuracy": 0.8571467995643616, + "step": 1267 + }, + { + "epoch": 3.9874213836477987, + "grad_norm": 0.12528589367866516, + "learning_rate": 2.8624878045087744e-05, + "loss": 0.3238, + "mean_token_accuracy": 0.8595758080482483, + "step": 1268 + }, + { + "epoch": 3.990566037735849, + "grad_norm": 0.14454223215579987, + "learning_rate": 2.8607825545265492e-05, + "loss": 0.3193, + "mean_token_accuracy": 0.8578279614448547, + "step": 1269 + }, + { + "epoch": 3.9937106918238996, + "grad_norm": 0.13367754220962524, + "learning_rate": 2.8590766188516047e-05, + "loss": 0.324, + "mean_token_accuracy": 0.8594267964363098, + "step": 1270 + }, + { + "epoch": 3.99685534591195, + "grad_norm": 0.14187419414520264, + "learning_rate": 2.8573699992541892e-05, + "loss": 0.3318, + "mean_token_accuracy": 0.8586729168891907, + "step": 1271 + }, + { + "epoch": 4.0, + "grad_norm": 0.1359594166278839, + "learning_rate": 2.8556626975052563e-05, + "loss": 0.3187, + "mean_token_accuracy": 0.8581359386444092, + "step": 1272 + }, + { + "epoch": 4.00314465408805, + "grad_norm": 0.21873971819877625, + "learning_rate": 2.85395471537647e-05, + "loss": 0.249, + "mean_token_accuracy": 0.883579432964325, + "step": 1273 + }, + { + "epoch": 4.0062893081761, + "grad_norm": 0.35204988718032837, + "learning_rate": 2.8522460546402002e-05, + "loss": 0.2546, + "mean_token_accuracy": 0.8850920796394348, + "step": 1274 + }, + { + "epoch": 4.009433962264151, + "grad_norm": 0.26803094148635864, + "learning_rate": 2.8505367170695207e-05, + "loss": 0.2628, + "mean_token_accuracy": 0.8832299709320068, + "step": 1275 + }, + { + "epoch": 4.012578616352202, + "grad_norm": 0.3539370894432068, + "learning_rate": 2.8488267044382073e-05, + "loss": 0.2587, + "mean_token_accuracy": 0.886466920375824, + "step": 1276 + }, + { + "epoch": 4.015723270440252, + "grad_norm": 0.2860783636569977, + "learning_rate": 2.847116018520737e-05, + "loss": 0.2652, + "mean_token_accuracy": 0.8802006840705872, + "step": 1277 + }, + { + "epoch": 4.018867924528302, + "grad_norm": 0.19191104173660278, + "learning_rate": 2.8454046610922847e-05, + "loss": 0.2587, + "mean_token_accuracy": 0.884539008140564, + "step": 1278 + }, + { + "epoch": 4.022012578616352, + "grad_norm": 0.2441309690475464, + "learning_rate": 2.843692633928723e-05, + "loss": 0.2266, + "mean_token_accuracy": 0.8915911912918091, + "step": 1279 + }, + { + "epoch": 4.0251572327044025, + "grad_norm": 0.17421981692314148, + "learning_rate": 2.8419799388066182e-05, + "loss": 0.2455, + "mean_token_accuracy": 0.8853268623352051, + "step": 1280 + }, + { + "epoch": 4.028301886792453, + "grad_norm": 0.20515094697475433, + "learning_rate": 2.840266577503232e-05, + "loss": 0.2521, + "mean_token_accuracy": 0.886737585067749, + "step": 1281 + }, + { + "epoch": 4.031446540880503, + "grad_norm": 0.1697227954864502, + "learning_rate": 2.8385525517965143e-05, + "loss": 0.2502, + "mean_token_accuracy": 0.8833692669868469, + "step": 1282 + }, + { + "epoch": 4.034591194968553, + "grad_norm": 0.19101938605308533, + "learning_rate": 2.836837863465107e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.8790598511695862, + "step": 1283 + }, + { + "epoch": 4.037735849056604, + "grad_norm": 0.16805845499038696, + "learning_rate": 2.8351225142883395e-05, + "loss": 0.2596, + "mean_token_accuracy": 0.8821934461593628, + "step": 1284 + }, + { + "epoch": 4.040880503144654, + "grad_norm": 0.18171757459640503, + "learning_rate": 2.8334065060462246e-05, + "loss": 0.2338, + "mean_token_accuracy": 0.8911201357841492, + "step": 1285 + }, + { + "epoch": 4.044025157232705, + "grad_norm": 0.17769482731819153, + "learning_rate": 2.8316898405194617e-05, + "loss": 0.2514, + "mean_token_accuracy": 0.8844525814056396, + "step": 1286 + }, + { + "epoch": 4.047169811320755, + "grad_norm": 0.16998951137065887, + "learning_rate": 2.829972519489431e-05, + "loss": 0.24, + "mean_token_accuracy": 0.8891401290893555, + "step": 1287 + }, + { + "epoch": 4.050314465408805, + "grad_norm": 0.1702141910791397, + "learning_rate": 2.828254544738192e-05, + "loss": 0.2489, + "mean_token_accuracy": 0.8864291310310364, + "step": 1288 + }, + { + "epoch": 4.053459119496855, + "grad_norm": 0.16874520480632782, + "learning_rate": 2.826535918048484e-05, + "loss": 0.2406, + "mean_token_accuracy": 0.8878934383392334, + "step": 1289 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 0.15455903112888336, + "learning_rate": 2.8248166412037238e-05, + "loss": 0.2479, + "mean_token_accuracy": 0.8852613568305969, + "step": 1290 + }, + { + "epoch": 4.059748427672956, + "grad_norm": 0.16695955395698547, + "learning_rate": 2.8230967159879997e-05, + "loss": 0.23, + "mean_token_accuracy": 0.8919079303741455, + "step": 1291 + }, + { + "epoch": 4.062893081761007, + "grad_norm": 0.16394896805286407, + "learning_rate": 2.8213761441860765e-05, + "loss": 0.2713, + "mean_token_accuracy": 0.8791160583496094, + "step": 1292 + }, + { + "epoch": 4.066037735849057, + "grad_norm": 0.15975283086299896, + "learning_rate": 2.819654927583386e-05, + "loss": 0.2384, + "mean_token_accuracy": 0.8890205025672913, + "step": 1293 + }, + { + "epoch": 4.069182389937107, + "grad_norm": 0.17008937895298004, + "learning_rate": 2.817933067966033e-05, + "loss": 0.2517, + "mean_token_accuracy": 0.8825728893280029, + "step": 1294 + }, + { + "epoch": 4.072327044025157, + "grad_norm": 0.1542217880487442, + "learning_rate": 2.816210567120787e-05, + "loss": 0.2735, + "mean_token_accuracy": 0.879520833492279, + "step": 1295 + }, + { + "epoch": 4.0754716981132075, + "grad_norm": 0.17298361659049988, + "learning_rate": 2.8144874268350827e-05, + "loss": 0.2556, + "mean_token_accuracy": 0.8827891945838928, + "step": 1296 + }, + { + "epoch": 4.078616352201258, + "grad_norm": 0.15569692850112915, + "learning_rate": 2.8127636488970203e-05, + "loss": 0.2269, + "mean_token_accuracy": 0.8923097252845764, + "step": 1297 + }, + { + "epoch": 4.081761006289308, + "grad_norm": 0.1688852161169052, + "learning_rate": 2.8110392350953606e-05, + "loss": 0.2302, + "mean_token_accuracy": 0.8880395293235779, + "step": 1298 + }, + { + "epoch": 4.084905660377358, + "grad_norm": 0.1484537571668625, + "learning_rate": 2.8093141872195246e-05, + "loss": 0.2613, + "mean_token_accuracy": 0.8827516436576843, + "step": 1299 + }, + { + "epoch": 4.088050314465409, + "grad_norm": 0.16927222907543182, + "learning_rate": 2.8075885070595906e-05, + "loss": 0.2536, + "mean_token_accuracy": 0.8848833441734314, + "step": 1300 + }, + { + "epoch": 4.091194968553459, + "grad_norm": 0.14585481584072113, + "learning_rate": 2.805862196406293e-05, + "loss": 0.2578, + "mean_token_accuracy": 0.8819864988327026, + "step": 1301 + }, + { + "epoch": 4.09433962264151, + "grad_norm": 0.1680736243724823, + "learning_rate": 2.8041352570510216e-05, + "loss": 0.2529, + "mean_token_accuracy": 0.8855777382850647, + "step": 1302 + }, + { + "epoch": 4.09748427672956, + "grad_norm": 0.14265228807926178, + "learning_rate": 2.802407690785818e-05, + "loss": 0.2368, + "mean_token_accuracy": 0.8872923851013184, + "step": 1303 + }, + { + "epoch": 4.10062893081761, + "grad_norm": 0.15464213490486145, + "learning_rate": 2.8006794994033728e-05, + "loss": 0.2616, + "mean_token_accuracy": 0.8812229037284851, + "step": 1304 + }, + { + "epoch": 4.10377358490566, + "grad_norm": 0.14933174848556519, + "learning_rate": 2.7989506846970284e-05, + "loss": 0.2466, + "mean_token_accuracy": 0.8838765621185303, + "step": 1305 + }, + { + "epoch": 4.1069182389937104, + "grad_norm": 0.1553312987089157, + "learning_rate": 2.7972212484607708e-05, + "loss": 0.2505, + "mean_token_accuracy": 0.8844928741455078, + "step": 1306 + }, + { + "epoch": 4.110062893081761, + "grad_norm": 0.15248145163059235, + "learning_rate": 2.7954911924892344e-05, + "loss": 0.2452, + "mean_token_accuracy": 0.8878949880599976, + "step": 1307 + }, + { + "epoch": 4.113207547169812, + "grad_norm": 0.14316493272781372, + "learning_rate": 2.793760518577693e-05, + "loss": 0.2567, + "mean_token_accuracy": 0.8830364942550659, + "step": 1308 + }, + { + "epoch": 4.116352201257862, + "grad_norm": 0.15453389286994934, + "learning_rate": 2.792029228522064e-05, + "loss": 0.2372, + "mean_token_accuracy": 0.8906042575836182, + "step": 1309 + }, + { + "epoch": 4.119496855345912, + "grad_norm": 0.14568613469600677, + "learning_rate": 2.7902973241189037e-05, + "loss": 0.254, + "mean_token_accuracy": 0.8863977789878845, + "step": 1310 + }, + { + "epoch": 4.122641509433962, + "grad_norm": 0.14949272572994232, + "learning_rate": 2.788564807165406e-05, + "loss": 0.2367, + "mean_token_accuracy": 0.8867618441581726, + "step": 1311 + }, + { + "epoch": 4.1257861635220126, + "grad_norm": 0.15481707453727722, + "learning_rate": 2.7868316794594e-05, + "loss": 0.2428, + "mean_token_accuracy": 0.8883107900619507, + "step": 1312 + }, + { + "epoch": 4.128930817610063, + "grad_norm": 0.145402193069458, + "learning_rate": 2.7850979427993484e-05, + "loss": 0.2408, + "mean_token_accuracy": 0.887713611125946, + "step": 1313 + }, + { + "epoch": 4.132075471698113, + "grad_norm": 0.14378874003887177, + "learning_rate": 2.7833635989843474e-05, + "loss": 0.2601, + "mean_token_accuracy": 0.8857198357582092, + "step": 1314 + }, + { + "epoch": 4.135220125786163, + "grad_norm": 0.15104271471500397, + "learning_rate": 2.78162864981412e-05, + "loss": 0.252, + "mean_token_accuracy": 0.8861721754074097, + "step": 1315 + }, + { + "epoch": 4.138364779874214, + "grad_norm": 0.13934184610843658, + "learning_rate": 2.7798930970890216e-05, + "loss": 0.2509, + "mean_token_accuracy": 0.8866226077079773, + "step": 1316 + }, + { + "epoch": 4.1415094339622645, + "grad_norm": 0.14720742404460907, + "learning_rate": 2.7781569426100304e-05, + "loss": 0.2464, + "mean_token_accuracy": 0.8861123919487, + "step": 1317 + }, + { + "epoch": 4.144654088050315, + "grad_norm": 0.1393972486257553, + "learning_rate": 2.7764201881787512e-05, + "loss": 0.2655, + "mean_token_accuracy": 0.8817521929740906, + "step": 1318 + }, + { + "epoch": 4.147798742138365, + "grad_norm": 0.1448722630739212, + "learning_rate": 2.7746828355974104e-05, + "loss": 0.2364, + "mean_token_accuracy": 0.8872301578521729, + "step": 1319 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 0.13423529267311096, + "learning_rate": 2.7729448866688544e-05, + "loss": 0.2527, + "mean_token_accuracy": 0.8843175768852234, + "step": 1320 + }, + { + "epoch": 4.154088050314465, + "grad_norm": 0.13578058779239655, + "learning_rate": 2.771206343196551e-05, + "loss": 0.2603, + "mean_token_accuracy": 0.8812825083732605, + "step": 1321 + }, + { + "epoch": 4.1572327044025155, + "grad_norm": 0.1407482773065567, + "learning_rate": 2.7694672069845813e-05, + "loss": 0.2423, + "mean_token_accuracy": 0.8843265771865845, + "step": 1322 + }, + { + "epoch": 4.160377358490566, + "grad_norm": 0.14086057245731354, + "learning_rate": 2.767727479837644e-05, + "loss": 0.2347, + "mean_token_accuracy": 0.8889259099960327, + "step": 1323 + }, + { + "epoch": 4.163522012578617, + "grad_norm": 0.14522001147270203, + "learning_rate": 2.7659871635610515e-05, + "loss": 0.2498, + "mean_token_accuracy": 0.8857369422912598, + "step": 1324 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.14944982528686523, + "learning_rate": 2.7642462599607252e-05, + "loss": 0.2333, + "mean_token_accuracy": 0.8869674205780029, + "step": 1325 + }, + { + "epoch": 4.169811320754717, + "grad_norm": 0.1487564593553543, + "learning_rate": 2.7625047708431976e-05, + "loss": 0.2352, + "mean_token_accuracy": 0.8852526545524597, + "step": 1326 + }, + { + "epoch": 4.172955974842767, + "grad_norm": 0.1437779664993286, + "learning_rate": 2.7607626980156092e-05, + "loss": 0.262, + "mean_token_accuracy": 0.8827253580093384, + "step": 1327 + }, + { + "epoch": 4.176100628930818, + "grad_norm": 0.1432497501373291, + "learning_rate": 2.7590200432857047e-05, + "loss": 0.2567, + "mean_token_accuracy": 0.8826262354850769, + "step": 1328 + }, + { + "epoch": 4.179245283018868, + "grad_norm": 0.14890795946121216, + "learning_rate": 2.7572768084618334e-05, + "loss": 0.2688, + "mean_token_accuracy": 0.8793274760246277, + "step": 1329 + }, + { + "epoch": 4.182389937106918, + "grad_norm": 0.14550916850566864, + "learning_rate": 2.755532995352947e-05, + "loss": 0.233, + "mean_token_accuracy": 0.8903444409370422, + "step": 1330 + }, + { + "epoch": 4.185534591194968, + "grad_norm": 0.18591845035552979, + "learning_rate": 2.753788605768596e-05, + "loss": 0.2596, + "mean_token_accuracy": 0.8813364505767822, + "step": 1331 + }, + { + "epoch": 4.188679245283019, + "grad_norm": 0.13862967491149902, + "learning_rate": 2.7520436415189306e-05, + "loss": 0.2622, + "mean_token_accuracy": 0.8814187049865723, + "step": 1332 + }, + { + "epoch": 4.1918238993710695, + "grad_norm": 0.16407757997512817, + "learning_rate": 2.7502981044146963e-05, + "loss": 0.2579, + "mean_token_accuracy": 0.8837925791740417, + "step": 1333 + }, + { + "epoch": 4.19496855345912, + "grad_norm": 0.14519527554512024, + "learning_rate": 2.7485519962672337e-05, + "loss": 0.2695, + "mean_token_accuracy": 0.8818954825401306, + "step": 1334 + }, + { + "epoch": 4.19811320754717, + "grad_norm": 0.14345420897006989, + "learning_rate": 2.746805318888476e-05, + "loss": 0.2494, + "mean_token_accuracy": 0.8830834627151489, + "step": 1335 + }, + { + "epoch": 4.20125786163522, + "grad_norm": 0.1446036845445633, + "learning_rate": 2.7450580740909463e-05, + "loss": 0.2498, + "mean_token_accuracy": 0.8848795890808105, + "step": 1336 + }, + { + "epoch": 4.20440251572327, + "grad_norm": 0.145370215177536, + "learning_rate": 2.743310263687757e-05, + "loss": 0.268, + "mean_token_accuracy": 0.8806397914886475, + "step": 1337 + }, + { + "epoch": 4.2075471698113205, + "grad_norm": 0.13711602985858917, + "learning_rate": 2.7415618894926072e-05, + "loss": 0.23, + "mean_token_accuracy": 0.8897081613540649, + "step": 1338 + }, + { + "epoch": 4.210691823899371, + "grad_norm": 0.15641474723815918, + "learning_rate": 2.739812953319782e-05, + "loss": 0.2406, + "mean_token_accuracy": 0.8866468071937561, + "step": 1339 + }, + { + "epoch": 4.213836477987422, + "grad_norm": 0.1482914835214615, + "learning_rate": 2.738063456984148e-05, + "loss": 0.2472, + "mean_token_accuracy": 0.885341227054596, + "step": 1340 + }, + { + "epoch": 4.216981132075472, + "grad_norm": 0.14941436052322388, + "learning_rate": 2.736313402301155e-05, + "loss": 0.2558, + "mean_token_accuracy": 0.8828262090682983, + "step": 1341 + }, + { + "epoch": 4.220125786163522, + "grad_norm": 0.14204715192317963, + "learning_rate": 2.73456279108683e-05, + "loss": 0.2591, + "mean_token_accuracy": 0.8831762671470642, + "step": 1342 + }, + { + "epoch": 4.223270440251572, + "grad_norm": 0.14757885038852692, + "learning_rate": 2.73281162515778e-05, + "loss": 0.2582, + "mean_token_accuracy": 0.8833774328231812, + "step": 1343 + }, + { + "epoch": 4.226415094339623, + "grad_norm": 0.14690524339675903, + "learning_rate": 2.7310599063311857e-05, + "loss": 0.258, + "mean_token_accuracy": 0.8827973008155823, + "step": 1344 + }, + { + "epoch": 4.229559748427673, + "grad_norm": 0.14617246389389038, + "learning_rate": 2.7293076364248015e-05, + "loss": 0.2412, + "mean_token_accuracy": 0.886563241481781, + "step": 1345 + }, + { + "epoch": 4.232704402515723, + "grad_norm": 0.13870076835155487, + "learning_rate": 2.7275548172569557e-05, + "loss": 0.2321, + "mean_token_accuracy": 0.8864620923995972, + "step": 1346 + }, + { + "epoch": 4.235849056603773, + "grad_norm": 0.15120095014572144, + "learning_rate": 2.7258014506465442e-05, + "loss": 0.2448, + "mean_token_accuracy": 0.8861029744148254, + "step": 1347 + }, + { + "epoch": 4.238993710691824, + "grad_norm": 0.15668298304080963, + "learning_rate": 2.724047538413033e-05, + "loss": 0.2577, + "mean_token_accuracy": 0.8828524947166443, + "step": 1348 + }, + { + "epoch": 4.2421383647798745, + "grad_norm": 0.13791413605213165, + "learning_rate": 2.7222930823764517e-05, + "loss": 0.2473, + "mean_token_accuracy": 0.8867437243461609, + "step": 1349 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 0.15054844319820404, + "learning_rate": 2.7205380843573967e-05, + "loss": 0.2549, + "mean_token_accuracy": 0.8830904364585876, + "step": 1350 + }, + { + "epoch": 4.248427672955975, + "grad_norm": 0.14027497172355652, + "learning_rate": 2.718782546177026e-05, + "loss": 0.2551, + "mean_token_accuracy": 0.8864468336105347, + "step": 1351 + }, + { + "epoch": 4.251572327044025, + "grad_norm": 0.15007853507995605, + "learning_rate": 2.7170264696570586e-05, + "loss": 0.2437, + "mean_token_accuracy": 0.8871759176254272, + "step": 1352 + }, + { + "epoch": 4.254716981132075, + "grad_norm": 0.1608761101961136, + "learning_rate": 2.7152698566197703e-05, + "loss": 0.2557, + "mean_token_accuracy": 0.8846498131752014, + "step": 1353 + }, + { + "epoch": 4.2578616352201255, + "grad_norm": 0.13730347156524658, + "learning_rate": 2.7135127088879947e-05, + "loss": 0.2541, + "mean_token_accuracy": 0.8821074366569519, + "step": 1354 + }, + { + "epoch": 4.261006289308176, + "grad_norm": 0.14933264255523682, + "learning_rate": 2.7117550282851222e-05, + "loss": 0.2569, + "mean_token_accuracy": 0.8827639818191528, + "step": 1355 + }, + { + "epoch": 4.264150943396227, + "grad_norm": 0.15967227518558502, + "learning_rate": 2.709996816635093e-05, + "loss": 0.2457, + "mean_token_accuracy": 0.8853583335876465, + "step": 1356 + }, + { + "epoch": 4.267295597484277, + "grad_norm": 0.130757138133049, + "learning_rate": 2.7082380757623997e-05, + "loss": 0.2624, + "mean_token_accuracy": 0.8816043734550476, + "step": 1357 + }, + { + "epoch": 4.270440251572327, + "grad_norm": 0.17749525606632233, + "learning_rate": 2.7064788074920853e-05, + "loss": 0.2297, + "mean_token_accuracy": 0.8903690576553345, + "step": 1358 + }, + { + "epoch": 4.273584905660377, + "grad_norm": 0.14349985122680664, + "learning_rate": 2.7047190136497374e-05, + "loss": 0.2644, + "mean_token_accuracy": 0.8817373514175415, + "step": 1359 + }, + { + "epoch": 4.276729559748428, + "grad_norm": 0.15383297204971313, + "learning_rate": 2.702958696061492e-05, + "loss": 0.2536, + "mean_token_accuracy": 0.8856379985809326, + "step": 1360 + }, + { + "epoch": 4.279874213836478, + "grad_norm": 0.1427260786294937, + "learning_rate": 2.7011978565540258e-05, + "loss": 0.2431, + "mean_token_accuracy": 0.8847337365150452, + "step": 1361 + }, + { + "epoch": 4.283018867924528, + "grad_norm": 0.15544083714485168, + "learning_rate": 2.6994364969545596e-05, + "loss": 0.2447, + "mean_token_accuracy": 0.8849520087242126, + "step": 1362 + }, + { + "epoch": 4.286163522012578, + "grad_norm": 0.15651412308216095, + "learning_rate": 2.697674619090852e-05, + "loss": 0.2366, + "mean_token_accuracy": 0.8874123692512512, + "step": 1363 + }, + { + "epoch": 4.289308176100629, + "grad_norm": 0.15618132054805756, + "learning_rate": 2.6959122247911996e-05, + "loss": 0.255, + "mean_token_accuracy": 0.8844594359397888, + "step": 1364 + }, + { + "epoch": 4.2924528301886795, + "grad_norm": 0.14532870054244995, + "learning_rate": 2.694149315884436e-05, + "loss": 0.2429, + "mean_token_accuracy": 0.8856728076934814, + "step": 1365 + }, + { + "epoch": 4.29559748427673, + "grad_norm": 0.15094998478889465, + "learning_rate": 2.692385894199929e-05, + "loss": 0.257, + "mean_token_accuracy": 0.8823837637901306, + "step": 1366 + }, + { + "epoch": 4.29874213836478, + "grad_norm": 0.1429101526737213, + "learning_rate": 2.6906219615675756e-05, + "loss": 0.246, + "mean_token_accuracy": 0.8854039311408997, + "step": 1367 + }, + { + "epoch": 4.30188679245283, + "grad_norm": 0.1493634283542633, + "learning_rate": 2.6888575198178073e-05, + "loss": 0.2523, + "mean_token_accuracy": 0.8816664814949036, + "step": 1368 + }, + { + "epoch": 4.30503144654088, + "grad_norm": 0.1594749540090561, + "learning_rate": 2.6870925707815807e-05, + "loss": 0.2433, + "mean_token_accuracy": 0.8858900666236877, + "step": 1369 + }, + { + "epoch": 4.3081761006289305, + "grad_norm": 0.16246452927589417, + "learning_rate": 2.6853271162903792e-05, + "loss": 0.28, + "mean_token_accuracy": 0.8772223591804504, + "step": 1370 + }, + { + "epoch": 4.311320754716981, + "grad_norm": 0.15938788652420044, + "learning_rate": 2.6835611581762135e-05, + "loss": 0.2522, + "mean_token_accuracy": 0.8834171295166016, + "step": 1371 + }, + { + "epoch": 4.314465408805032, + "grad_norm": 0.1860852986574173, + "learning_rate": 2.6817946982716125e-05, + "loss": 0.2239, + "mean_token_accuracy": 0.8932704925537109, + "step": 1372 + }, + { + "epoch": 4.317610062893082, + "grad_norm": 0.1534428745508194, + "learning_rate": 2.6800277384096287e-05, + "loss": 0.2503, + "mean_token_accuracy": 0.8848916292190552, + "step": 1373 + }, + { + "epoch": 4.320754716981132, + "grad_norm": 0.19438529014587402, + "learning_rate": 2.6782602804238328e-05, + "loss": 0.2566, + "mean_token_accuracy": 0.8817089200019836, + "step": 1374 + }, + { + "epoch": 4.323899371069182, + "grad_norm": 0.1576819121837616, + "learning_rate": 2.6764923261483135e-05, + "loss": 0.252, + "mean_token_accuracy": 0.8846593499183655, + "step": 1375 + }, + { + "epoch": 4.327044025157233, + "grad_norm": 0.18220695853233337, + "learning_rate": 2.6747238774176717e-05, + "loss": 0.2499, + "mean_token_accuracy": 0.8850845098495483, + "step": 1376 + }, + { + "epoch": 4.330188679245283, + "grad_norm": 0.15748025476932526, + "learning_rate": 2.6729549360670244e-05, + "loss": 0.2523, + "mean_token_accuracy": 0.8865378499031067, + "step": 1377 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.15679998695850372, + "learning_rate": 2.6711855039319972e-05, + "loss": 0.2525, + "mean_token_accuracy": 0.8859933614730835, + "step": 1378 + }, + { + "epoch": 4.336477987421383, + "grad_norm": 0.1435849666595459, + "learning_rate": 2.6694155828487272e-05, + "loss": 0.2553, + "mean_token_accuracy": 0.882286787033081, + "step": 1379 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 0.14357006549835205, + "learning_rate": 2.6676451746538577e-05, + "loss": 0.2519, + "mean_token_accuracy": 0.8833568692207336, + "step": 1380 + }, + { + "epoch": 4.3427672955974845, + "grad_norm": 0.14986519515514374, + "learning_rate": 2.6658742811845377e-05, + "loss": 0.2494, + "mean_token_accuracy": 0.8818812370300293, + "step": 1381 + }, + { + "epoch": 4.345911949685535, + "grad_norm": 0.14738526940345764, + "learning_rate": 2.6641029042784197e-05, + "loss": 0.2504, + "mean_token_accuracy": 0.8836377859115601, + "step": 1382 + }, + { + "epoch": 4.349056603773585, + "grad_norm": 0.15075050294399261, + "learning_rate": 2.6623310457736575e-05, + "loss": 0.2638, + "mean_token_accuracy": 0.8816152811050415, + "step": 1383 + }, + { + "epoch": 4.352201257861635, + "grad_norm": 0.14811988174915314, + "learning_rate": 2.660558707508906e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.8767325282096863, + "step": 1384 + }, + { + "epoch": 4.355345911949685, + "grad_norm": 0.13753147423267365, + "learning_rate": 2.6587858913233168e-05, + "loss": 0.2412, + "mean_token_accuracy": 0.8852753639221191, + "step": 1385 + }, + { + "epoch": 4.3584905660377355, + "grad_norm": 0.1575927436351776, + "learning_rate": 2.657012599056536e-05, + "loss": 0.252, + "mean_token_accuracy": 0.8817105293273926, + "step": 1386 + }, + { + "epoch": 4.361635220125786, + "grad_norm": 0.14534930884838104, + "learning_rate": 2.6552388325487078e-05, + "loss": 0.2562, + "mean_token_accuracy": 0.8825299143791199, + "step": 1387 + }, + { + "epoch": 4.364779874213837, + "grad_norm": 0.1528378576040268, + "learning_rate": 2.6534645936404655e-05, + "loss": 0.2606, + "mean_token_accuracy": 0.8781949877738953, + "step": 1388 + }, + { + "epoch": 4.367924528301887, + "grad_norm": 0.14020651578903198, + "learning_rate": 2.6516898841729323e-05, + "loss": 0.257, + "mean_token_accuracy": 0.8830549716949463, + "step": 1389 + }, + { + "epoch": 4.371069182389937, + "grad_norm": 0.13903018832206726, + "learning_rate": 2.6499147059877213e-05, + "loss": 0.2522, + "mean_token_accuracy": 0.8827590346336365, + "step": 1390 + }, + { + "epoch": 4.3742138364779874, + "grad_norm": 0.14772510528564453, + "learning_rate": 2.6481390609269308e-05, + "loss": 0.2625, + "mean_token_accuracy": 0.8802635073661804, + "step": 1391 + }, + { + "epoch": 4.377358490566038, + "grad_norm": 0.1616673320531845, + "learning_rate": 2.646362950833145e-05, + "loss": 0.2425, + "mean_token_accuracy": 0.8878819942474365, + "step": 1392 + }, + { + "epoch": 4.380503144654088, + "grad_norm": 0.13991199433803558, + "learning_rate": 2.6445863775494282e-05, + "loss": 0.2522, + "mean_token_accuracy": 0.8848397135734558, + "step": 1393 + }, + { + "epoch": 4.383647798742138, + "grad_norm": 0.1424325704574585, + "learning_rate": 2.6428093429193288e-05, + "loss": 0.2689, + "mean_token_accuracy": 0.8799664974212646, + "step": 1394 + }, + { + "epoch": 4.386792452830189, + "grad_norm": 0.1396665871143341, + "learning_rate": 2.6410318487868707e-05, + "loss": 0.2548, + "mean_token_accuracy": 0.8830381631851196, + "step": 1395 + }, + { + "epoch": 4.389937106918239, + "grad_norm": 0.14341147243976593, + "learning_rate": 2.6392538969965565e-05, + "loss": 0.243, + "mean_token_accuracy": 0.8879396319389343, + "step": 1396 + }, + { + "epoch": 4.3930817610062896, + "grad_norm": 0.1465020775794983, + "learning_rate": 2.637475489393363e-05, + "loss": 0.2391, + "mean_token_accuracy": 0.8867096900939941, + "step": 1397 + }, + { + "epoch": 4.39622641509434, + "grad_norm": 0.1537989377975464, + "learning_rate": 2.63569662782274e-05, + "loss": 0.2578, + "mean_token_accuracy": 0.8815436959266663, + "step": 1398 + }, + { + "epoch": 4.39937106918239, + "grad_norm": 0.1487364023923874, + "learning_rate": 2.6339173141306095e-05, + "loss": 0.2679, + "mean_token_accuracy": 0.8777388334274292, + "step": 1399 + }, + { + "epoch": 4.40251572327044, + "grad_norm": 0.15459056198596954, + "learning_rate": 2.6321375501633603e-05, + "loss": 0.2686, + "mean_token_accuracy": 0.8784300088882446, + "step": 1400 + }, + { + "epoch": 4.40566037735849, + "grad_norm": 0.139274001121521, + "learning_rate": 2.6303573377678513e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.8845201134681702, + "step": 1401 + }, + { + "epoch": 4.408805031446541, + "grad_norm": 0.15442243218421936, + "learning_rate": 2.6285766787914034e-05, + "loss": 0.2497, + "mean_token_accuracy": 0.8847933411598206, + "step": 1402 + }, + { + "epoch": 4.411949685534591, + "grad_norm": 0.1377149522304535, + "learning_rate": 2.626795575081804e-05, + "loss": 0.2274, + "mean_token_accuracy": 0.8895265460014343, + "step": 1403 + }, + { + "epoch": 4.415094339622642, + "grad_norm": 0.14790353178977966, + "learning_rate": 2.6250140284873017e-05, + "loss": 0.2685, + "mean_token_accuracy": 0.8800943493843079, + "step": 1404 + }, + { + "epoch": 4.418238993710692, + "grad_norm": 0.13157331943511963, + "learning_rate": 2.6232320408566025e-05, + "loss": 0.2427, + "mean_token_accuracy": 0.8863032460212708, + "step": 1405 + }, + { + "epoch": 4.421383647798742, + "grad_norm": 0.14972056448459625, + "learning_rate": 2.6214496140388718e-05, + "loss": 0.2452, + "mean_token_accuracy": 0.8844586610794067, + "step": 1406 + }, + { + "epoch": 4.4245283018867925, + "grad_norm": 0.14638881385326385, + "learning_rate": 2.6196667498837302e-05, + "loss": 0.2597, + "mean_token_accuracy": 0.880418598651886, + "step": 1407 + }, + { + "epoch": 4.427672955974843, + "grad_norm": 0.1468118280172348, + "learning_rate": 2.617883450241252e-05, + "loss": 0.2518, + "mean_token_accuracy": 0.8848738670349121, + "step": 1408 + }, + { + "epoch": 4.430817610062893, + "grad_norm": 0.14775048196315765, + "learning_rate": 2.616099716961964e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.8803516030311584, + "step": 1409 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 0.14332270622253418, + "learning_rate": 2.6143155518968428e-05, + "loss": 0.2487, + "mean_token_accuracy": 0.8877217173576355, + "step": 1410 + }, + { + "epoch": 4.437106918238994, + "grad_norm": 0.14214986562728882, + "learning_rate": 2.612530956897312e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.8784423470497131, + "step": 1411 + }, + { + "epoch": 4.440251572327044, + "grad_norm": 0.1403186023235321, + "learning_rate": 2.6107459338152427e-05, + "loss": 0.2665, + "mean_token_accuracy": 0.8789308071136475, + "step": 1412 + }, + { + "epoch": 4.443396226415095, + "grad_norm": 0.138310045003891, + "learning_rate": 2.60896048450295e-05, + "loss": 0.2499, + "mean_token_accuracy": 0.8843627572059631, + "step": 1413 + }, + { + "epoch": 4.446540880503145, + "grad_norm": 0.1412084996700287, + "learning_rate": 2.607174610813191e-05, + "loss": 0.2594, + "mean_token_accuracy": 0.8839447498321533, + "step": 1414 + }, + { + "epoch": 4.449685534591195, + "grad_norm": 0.14437635242938995, + "learning_rate": 2.605388314599163e-05, + "loss": 0.241, + "mean_token_accuracy": 0.8860413432121277, + "step": 1415 + }, + { + "epoch": 4.452830188679245, + "grad_norm": 0.14431603252887726, + "learning_rate": 2.6036015977145018e-05, + "loss": 0.256, + "mean_token_accuracy": 0.8797090649604797, + "step": 1416 + }, + { + "epoch": 4.455974842767295, + "grad_norm": 0.1461351066827774, + "learning_rate": 2.6018144620132805e-05, + "loss": 0.2438, + "mean_token_accuracy": 0.8827074766159058, + "step": 1417 + }, + { + "epoch": 4.459119496855346, + "grad_norm": 0.14454837143421173, + "learning_rate": 2.6000269093500055e-05, + "loss": 0.2541, + "mean_token_accuracy": 0.8819749355316162, + "step": 1418 + }, + { + "epoch": 4.462264150943396, + "grad_norm": 0.15199217200279236, + "learning_rate": 2.598238941579617e-05, + "loss": 0.2622, + "mean_token_accuracy": 0.8807674050331116, + "step": 1419 + }, + { + "epoch": 4.465408805031447, + "grad_norm": 0.1487475484609604, + "learning_rate": 2.596450560557485e-05, + "loss": 0.2574, + "mean_token_accuracy": 0.8806383013725281, + "step": 1420 + }, + { + "epoch": 4.468553459119497, + "grad_norm": 0.14922156929969788, + "learning_rate": 2.59466176813941e-05, + "loss": 0.2574, + "mean_token_accuracy": 0.8796212077140808, + "step": 1421 + }, + { + "epoch": 4.471698113207547, + "grad_norm": 0.13734014332294464, + "learning_rate": 2.5928725661816162e-05, + "loss": 0.2412, + "mean_token_accuracy": 0.8878635168075562, + "step": 1422 + }, + { + "epoch": 4.4748427672955975, + "grad_norm": 0.1519346535205841, + "learning_rate": 2.5910829565407573e-05, + "loss": 0.2607, + "mean_token_accuracy": 0.8802406191825867, + "step": 1423 + }, + { + "epoch": 4.477987421383648, + "grad_norm": 0.13957345485687256, + "learning_rate": 2.5892929410739066e-05, + "loss": 0.2437, + "mean_token_accuracy": 0.8885535001754761, + "step": 1424 + }, + { + "epoch": 4.481132075471698, + "grad_norm": 0.14830255508422852, + "learning_rate": 2.587502521638559e-05, + "loss": 0.2575, + "mean_token_accuracy": 0.8828555345535278, + "step": 1425 + }, + { + "epoch": 4.484276729559748, + "grad_norm": 0.13677102327346802, + "learning_rate": 2.5857117000926298e-05, + "loss": 0.2501, + "mean_token_accuracy": 0.8842593431472778, + "step": 1426 + }, + { + "epoch": 4.487421383647799, + "grad_norm": 0.1367623507976532, + "learning_rate": 2.5839204782944506e-05, + "loss": 0.2689, + "mean_token_accuracy": 0.8797082304954529, + "step": 1427 + }, + { + "epoch": 4.490566037735849, + "grad_norm": 0.1288725733757019, + "learning_rate": 2.582128858102769e-05, + "loss": 0.2567, + "mean_token_accuracy": 0.8836309909820557, + "step": 1428 + }, + { + "epoch": 4.4937106918239, + "grad_norm": 0.13104671239852905, + "learning_rate": 2.5803368413767443e-05, + "loss": 0.2503, + "mean_token_accuracy": 0.8835127949714661, + "step": 1429 + }, + { + "epoch": 4.49685534591195, + "grad_norm": 0.1426146924495697, + "learning_rate": 2.5785444299759504e-05, + "loss": 0.2556, + "mean_token_accuracy": 0.8816784620285034, + "step": 1430 + }, + { + "epoch": 4.5, + "grad_norm": 0.13342611491680145, + "learning_rate": 2.576751625760368e-05, + "loss": 0.2584, + "mean_token_accuracy": 0.879668116569519, + "step": 1431 + }, + { + "epoch": 4.50314465408805, + "grad_norm": 0.15487535297870636, + "learning_rate": 2.5749584305903866e-05, + "loss": 0.2569, + "mean_token_accuracy": 0.8806970119476318, + "step": 1432 + }, + { + "epoch": 4.5062893081761, + "grad_norm": 0.14090071618556976, + "learning_rate": 2.5731648463268015e-05, + "loss": 0.2415, + "mean_token_accuracy": 0.8850799798965454, + "step": 1433 + }, + { + "epoch": 4.509433962264151, + "grad_norm": 0.14992307126522064, + "learning_rate": 2.571370874830811e-05, + "loss": 0.2522, + "mean_token_accuracy": 0.882178008556366, + "step": 1434 + }, + { + "epoch": 4.512578616352201, + "grad_norm": 0.13169358670711517, + "learning_rate": 2.569576517964016e-05, + "loss": 0.2544, + "mean_token_accuracy": 0.8801778554916382, + "step": 1435 + }, + { + "epoch": 4.515723270440252, + "grad_norm": 0.16984504461288452, + "learning_rate": 2.567781777588416e-05, + "loss": 0.2474, + "mean_token_accuracy": 0.8849585056304932, + "step": 1436 + }, + { + "epoch": 4.518867924528302, + "grad_norm": 0.1379225105047226, + "learning_rate": 2.565986655566411e-05, + "loss": 0.2451, + "mean_token_accuracy": 0.88438481092453, + "step": 1437 + }, + { + "epoch": 4.522012578616352, + "grad_norm": 0.18996167182922363, + "learning_rate": 2.5641911537607952e-05, + "loss": 0.2607, + "mean_token_accuracy": 0.8825079798698425, + "step": 1438 + }, + { + "epoch": 4.5251572327044025, + "grad_norm": 0.13947978615760803, + "learning_rate": 2.562395274034756e-05, + "loss": 0.2568, + "mean_token_accuracy": 0.8816097378730774, + "step": 1439 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 0.20490330457687378, + "learning_rate": 2.5605990182518752e-05, + "loss": 0.2699, + "mean_token_accuracy": 0.8782700896263123, + "step": 1440 + }, + { + "epoch": 4.531446540880503, + "grad_norm": 0.1351514607667923, + "learning_rate": 2.5588023882761233e-05, + "loss": 0.2508, + "mean_token_accuracy": 0.8827007412910461, + "step": 1441 + }, + { + "epoch": 4.534591194968553, + "grad_norm": 0.18768662214279175, + "learning_rate": 2.5570053859718606e-05, + "loss": 0.2583, + "mean_token_accuracy": 0.8811473846435547, + "step": 1442 + }, + { + "epoch": 4.537735849056604, + "grad_norm": 0.14161916077136993, + "learning_rate": 2.5552080132038312e-05, + "loss": 0.239, + "mean_token_accuracy": 0.8842949867248535, + "step": 1443 + }, + { + "epoch": 4.540880503144654, + "grad_norm": 0.1600307822227478, + "learning_rate": 2.5534102718371664e-05, + "loss": 0.2711, + "mean_token_accuracy": 0.8794844746589661, + "step": 1444 + }, + { + "epoch": 4.544025157232705, + "grad_norm": 0.1379203200340271, + "learning_rate": 2.5516121637373782e-05, + "loss": 0.2464, + "mean_token_accuracy": 0.8868380188941956, + "step": 1445 + }, + { + "epoch": 4.547169811320755, + "grad_norm": 0.15503491461277008, + "learning_rate": 2.5498136907703594e-05, + "loss": 0.2582, + "mean_token_accuracy": 0.8819460868835449, + "step": 1446 + }, + { + "epoch": 4.550314465408805, + "grad_norm": 0.13813835382461548, + "learning_rate": 2.5480148548023823e-05, + "loss": 0.2668, + "mean_token_accuracy": 0.878455638885498, + "step": 1447 + }, + { + "epoch": 4.553459119496855, + "grad_norm": 0.1430351883172989, + "learning_rate": 2.5462156577000952e-05, + "loss": 0.2432, + "mean_token_accuracy": 0.8864967226982117, + "step": 1448 + }, + { + "epoch": 4.556603773584905, + "grad_norm": 0.14724938571453094, + "learning_rate": 2.5444161013305217e-05, + "loss": 0.2618, + "mean_token_accuracy": 0.8809149861335754, + "step": 1449 + }, + { + "epoch": 4.559748427672956, + "grad_norm": 0.132796511054039, + "learning_rate": 2.5426161875610572e-05, + "loss": 0.2667, + "mean_token_accuracy": 0.8805984854698181, + "step": 1450 + }, + { + "epoch": 4.562893081761006, + "grad_norm": 0.1508307307958603, + "learning_rate": 2.540815918259469e-05, + "loss": 0.2494, + "mean_token_accuracy": 0.8844383955001831, + "step": 1451 + }, + { + "epoch": 4.566037735849057, + "grad_norm": 0.13710395991802216, + "learning_rate": 2.539015295293893e-05, + "loss": 0.2698, + "mean_token_accuracy": 0.8794138431549072, + "step": 1452 + }, + { + "epoch": 4.569182389937107, + "grad_norm": 0.13251298666000366, + "learning_rate": 2.5372143205328306e-05, + "loss": 0.2413, + "mean_token_accuracy": 0.8852861523628235, + "step": 1453 + }, + { + "epoch": 4.572327044025157, + "grad_norm": 0.1437736451625824, + "learning_rate": 2.5354129958451513e-05, + "loss": 0.2661, + "mean_token_accuracy": 0.8799407482147217, + "step": 1454 + }, + { + "epoch": 4.5754716981132075, + "grad_norm": 0.1383388191461563, + "learning_rate": 2.5336113231000862e-05, + "loss": 0.2517, + "mean_token_accuracy": 0.8848426938056946, + "step": 1455 + }, + { + "epoch": 4.578616352201258, + "grad_norm": 0.14111852645874023, + "learning_rate": 2.5318093041672257e-05, + "loss": 0.2618, + "mean_token_accuracy": 0.8823397159576416, + "step": 1456 + }, + { + "epoch": 4.581761006289308, + "grad_norm": 0.14448735117912292, + "learning_rate": 2.5300069409165232e-05, + "loss": 0.2371, + "mean_token_accuracy": 0.8877670764923096, + "step": 1457 + }, + { + "epoch": 4.584905660377358, + "grad_norm": 0.13009530305862427, + "learning_rate": 2.5282042352182867e-05, + "loss": 0.2415, + "mean_token_accuracy": 0.8879152536392212, + "step": 1458 + }, + { + "epoch": 4.588050314465409, + "grad_norm": 0.13649174571037292, + "learning_rate": 2.5264011889431795e-05, + "loss": 0.2632, + "mean_token_accuracy": 0.8810548186302185, + "step": 1459 + }, + { + "epoch": 4.591194968553459, + "grad_norm": 0.12930653989315033, + "learning_rate": 2.5245978039622197e-05, + "loss": 0.2498, + "mean_token_accuracy": 0.8841792345046997, + "step": 1460 + }, + { + "epoch": 4.59433962264151, + "grad_norm": 0.14395128190517426, + "learning_rate": 2.522794082146776e-05, + "loss": 0.2493, + "mean_token_accuracy": 0.8833113312721252, + "step": 1461 + }, + { + "epoch": 4.59748427672956, + "grad_norm": 0.13741520047187805, + "learning_rate": 2.5209900253685674e-05, + "loss": 0.2596, + "mean_token_accuracy": 0.8786746859550476, + "step": 1462 + }, + { + "epoch": 4.60062893081761, + "grad_norm": 0.14270655810832977, + "learning_rate": 2.5191856354996595e-05, + "loss": 0.2429, + "mean_token_accuracy": 0.8827742338180542, + "step": 1463 + }, + { + "epoch": 4.60377358490566, + "grad_norm": 0.13880446553230286, + "learning_rate": 2.5173809144124635e-05, + "loss": 0.2521, + "mean_token_accuracy": 0.8830817341804504, + "step": 1464 + }, + { + "epoch": 4.6069182389937104, + "grad_norm": 0.14457201957702637, + "learning_rate": 2.5155758639797357e-05, + "loss": 0.2489, + "mean_token_accuracy": 0.8850651383399963, + "step": 1465 + }, + { + "epoch": 4.610062893081761, + "grad_norm": 0.13805538415908813, + "learning_rate": 2.513770486074574e-05, + "loss": 0.2488, + "mean_token_accuracy": 0.883724570274353, + "step": 1466 + }, + { + "epoch": 4.613207547169811, + "grad_norm": 0.1541612446308136, + "learning_rate": 2.5119647825704134e-05, + "loss": 0.2526, + "mean_token_accuracy": 0.8837642073631287, + "step": 1467 + }, + { + "epoch": 4.616352201257862, + "grad_norm": 0.1382627934217453, + "learning_rate": 2.51015875534103e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.8827504515647888, + "step": 1468 + }, + { + "epoch": 4.619496855345912, + "grad_norm": 0.15054289996623993, + "learning_rate": 2.5083524062605344e-05, + "loss": 0.2509, + "mean_token_accuracy": 0.8836148977279663, + "step": 1469 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 0.14112670719623566, + "learning_rate": 2.506545737203371e-05, + "loss": 0.2523, + "mean_token_accuracy": 0.8837305903434753, + "step": 1470 + }, + { + "epoch": 4.6257861635220126, + "grad_norm": 0.14425958693027496, + "learning_rate": 2.5047387500443177e-05, + "loss": 0.247, + "mean_token_accuracy": 0.8845484852790833, + "step": 1471 + }, + { + "epoch": 4.628930817610063, + "grad_norm": 0.14548322558403015, + "learning_rate": 2.5029314466584798e-05, + "loss": 0.2467, + "mean_token_accuracy": 0.883867084980011, + "step": 1472 + }, + { + "epoch": 4.632075471698113, + "grad_norm": 0.14163750410079956, + "learning_rate": 2.5011238289212948e-05, + "loss": 0.2776, + "mean_token_accuracy": 0.8781344294548035, + "step": 1473 + }, + { + "epoch": 4.635220125786163, + "grad_norm": 0.14586465060710907, + "learning_rate": 2.499315898708521e-05, + "loss": 0.2493, + "mean_token_accuracy": 0.8815778493881226, + "step": 1474 + }, + { + "epoch": 4.638364779874214, + "grad_norm": 0.14209404587745667, + "learning_rate": 2.4975076578962454e-05, + "loss": 0.2525, + "mean_token_accuracy": 0.8831738233566284, + "step": 1475 + }, + { + "epoch": 4.6415094339622645, + "grad_norm": 0.14334115386009216, + "learning_rate": 2.4956991083608766e-05, + "loss": 0.2669, + "mean_token_accuracy": 0.882244348526001, + "step": 1476 + }, + { + "epoch": 4.644654088050315, + "grad_norm": 0.1406649500131607, + "learning_rate": 2.493890251979141e-05, + "loss": 0.2515, + "mean_token_accuracy": 0.8821365833282471, + "step": 1477 + }, + { + "epoch": 4.647798742138365, + "grad_norm": 0.1404079645872116, + "learning_rate": 2.4920810906280873e-05, + "loss": 0.2533, + "mean_token_accuracy": 0.8844720125198364, + "step": 1478 + }, + { + "epoch": 4.650943396226415, + "grad_norm": 0.13785113394260406, + "learning_rate": 2.4902716261850764e-05, + "loss": 0.2638, + "mean_token_accuracy": 0.8819215893745422, + "step": 1479 + }, + { + "epoch": 4.654088050314465, + "grad_norm": 0.15292143821716309, + "learning_rate": 2.4884618605277874e-05, + "loss": 0.2659, + "mean_token_accuracy": 0.8811072111129761, + "step": 1480 + }, + { + "epoch": 4.6572327044025155, + "grad_norm": 0.13847500085830688, + "learning_rate": 2.4866517955342094e-05, + "loss": 0.2564, + "mean_token_accuracy": 0.8817123770713806, + "step": 1481 + }, + { + "epoch": 4.660377358490566, + "grad_norm": 0.1566178798675537, + "learning_rate": 2.4848414330826443e-05, + "loss": 0.2505, + "mean_token_accuracy": 0.8840153813362122, + "step": 1482 + }, + { + "epoch": 4.663522012578616, + "grad_norm": 0.1307821422815323, + "learning_rate": 2.4830307750517017e-05, + "loss": 0.268, + "mean_token_accuracy": 0.8796277046203613, + "step": 1483 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.152281254529953, + "learning_rate": 2.481219823320296e-05, + "loss": 0.254, + "mean_token_accuracy": 0.8816496133804321, + "step": 1484 + }, + { + "epoch": 4.669811320754717, + "grad_norm": 0.14067430794239044, + "learning_rate": 2.479408579767649e-05, + "loss": 0.2512, + "mean_token_accuracy": 0.8842344284057617, + "step": 1485 + }, + { + "epoch": 4.672955974842767, + "grad_norm": 0.13918046653270721, + "learning_rate": 2.4775970462732858e-05, + "loss": 0.2603, + "mean_token_accuracy": 0.8810861110687256, + "step": 1486 + }, + { + "epoch": 4.676100628930818, + "grad_norm": 0.1361314058303833, + "learning_rate": 2.4757852247170293e-05, + "loss": 0.256, + "mean_token_accuracy": 0.8834766149520874, + "step": 1487 + }, + { + "epoch": 4.679245283018868, + "grad_norm": 0.1420655995607376, + "learning_rate": 2.4739731169790036e-05, + "loss": 0.2482, + "mean_token_accuracy": 0.8854100108146667, + "step": 1488 + }, + { + "epoch": 4.682389937106918, + "grad_norm": 0.14817163348197937, + "learning_rate": 2.4721607249396295e-05, + "loss": 0.2581, + "mean_token_accuracy": 0.8823645710945129, + "step": 1489 + }, + { + "epoch": 4.685534591194968, + "grad_norm": 0.14173953235149384, + "learning_rate": 2.4703480504796227e-05, + "loss": 0.2423, + "mean_token_accuracy": 0.8879494071006775, + "step": 1490 + }, + { + "epoch": 4.688679245283019, + "grad_norm": 0.142574280500412, + "learning_rate": 2.4685350954799908e-05, + "loss": 0.2595, + "mean_token_accuracy": 0.8818832635879517, + "step": 1491 + }, + { + "epoch": 4.6918238993710695, + "grad_norm": 0.14576321840286255, + "learning_rate": 2.466721861822034e-05, + "loss": 0.256, + "mean_token_accuracy": 0.8829867243766785, + "step": 1492 + }, + { + "epoch": 4.69496855345912, + "grad_norm": 0.13140003383159637, + "learning_rate": 2.4649083513873423e-05, + "loss": 0.251, + "mean_token_accuracy": 0.8816654086112976, + "step": 1493 + }, + { + "epoch": 4.69811320754717, + "grad_norm": 0.1674601286649704, + "learning_rate": 2.4630945660577907e-05, + "loss": 0.2488, + "mean_token_accuracy": 0.885489284992218, + "step": 1494 + }, + { + "epoch": 4.70125786163522, + "grad_norm": 0.14407427608966827, + "learning_rate": 2.4612805077155404e-05, + "loss": 0.2642, + "mean_token_accuracy": 0.8817337155342102, + "step": 1495 + }, + { + "epoch": 4.70440251572327, + "grad_norm": 0.14941011369228363, + "learning_rate": 2.4594661782430368e-05, + "loss": 0.2667, + "mean_token_accuracy": 0.8806317448616028, + "step": 1496 + }, + { + "epoch": 4.7075471698113205, + "grad_norm": 0.1350492388010025, + "learning_rate": 2.4576515795230057e-05, + "loss": 0.265, + "mean_token_accuracy": 0.876676619052887, + "step": 1497 + }, + { + "epoch": 4.710691823899371, + "grad_norm": 0.14951270818710327, + "learning_rate": 2.4558367134384516e-05, + "loss": 0.2466, + "mean_token_accuracy": 0.8872973918914795, + "step": 1498 + }, + { + "epoch": 4.713836477987421, + "grad_norm": 0.14224649965763092, + "learning_rate": 2.4540215818726587e-05, + "loss": 0.2582, + "mean_token_accuracy": 0.8833028078079224, + "step": 1499 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 0.15279057621955872, + "learning_rate": 2.452206186709185e-05, + "loss": 0.2644, + "mean_token_accuracy": 0.8810228705406189, + "step": 1500 + }, + { + "epoch": 4.720125786163522, + "grad_norm": 0.1417638510465622, + "learning_rate": 2.4503905298318612e-05, + "loss": 0.2682, + "mean_token_accuracy": 0.8790037631988525, + "step": 1501 + }, + { + "epoch": 4.723270440251572, + "grad_norm": 0.13526369631290436, + "learning_rate": 2.448574613124793e-05, + "loss": 0.2681, + "mean_token_accuracy": 0.8802682161331177, + "step": 1502 + }, + { + "epoch": 4.726415094339623, + "grad_norm": 0.1354237049818039, + "learning_rate": 2.4467584384723512e-05, + "loss": 0.2644, + "mean_token_accuracy": 0.8789896965026855, + "step": 1503 + }, + { + "epoch": 4.729559748427673, + "grad_norm": 0.13121561706066132, + "learning_rate": 2.444942007759178e-05, + "loss": 0.2537, + "mean_token_accuracy": 0.8831503987312317, + "step": 1504 + }, + { + "epoch": 4.732704402515723, + "grad_norm": 0.13547173142433167, + "learning_rate": 2.4431253228701795e-05, + "loss": 0.2565, + "mean_token_accuracy": 0.87940514087677, + "step": 1505 + }, + { + "epoch": 4.735849056603773, + "grad_norm": 0.13463617861270905, + "learning_rate": 2.4413083856905257e-05, + "loss": 0.2424, + "mean_token_accuracy": 0.886320173740387, + "step": 1506 + }, + { + "epoch": 4.738993710691824, + "grad_norm": 0.12674209475517273, + "learning_rate": 2.4394911981056493e-05, + "loss": 0.2573, + "mean_token_accuracy": 0.880353569984436, + "step": 1507 + }, + { + "epoch": 4.7421383647798745, + "grad_norm": 0.14087824523448944, + "learning_rate": 2.437673762001241e-05, + "loss": 0.2591, + "mean_token_accuracy": 0.8803715705871582, + "step": 1508 + }, + { + "epoch": 4.745283018867925, + "grad_norm": 0.13621707260608673, + "learning_rate": 2.4358560792632515e-05, + "loss": 0.2791, + "mean_token_accuracy": 0.8760728240013123, + "step": 1509 + }, + { + "epoch": 4.748427672955975, + "grad_norm": 0.12566280364990234, + "learning_rate": 2.4340381517778867e-05, + "loss": 0.2527, + "mean_token_accuracy": 0.882553219795227, + "step": 1510 + }, + { + "epoch": 4.751572327044025, + "grad_norm": 0.1349441558122635, + "learning_rate": 2.432219981431605e-05, + "loss": 0.2484, + "mean_token_accuracy": 0.8867226839065552, + "step": 1511 + }, + { + "epoch": 4.754716981132075, + "grad_norm": 0.12752796709537506, + "learning_rate": 2.4304015701111197e-05, + "loss": 0.2615, + "mean_token_accuracy": 0.8807107210159302, + "step": 1512 + }, + { + "epoch": 4.7578616352201255, + "grad_norm": 0.12869000434875488, + "learning_rate": 2.428582919703391e-05, + "loss": 0.2587, + "mean_token_accuracy": 0.8817474842071533, + "step": 1513 + }, + { + "epoch": 4.761006289308176, + "grad_norm": 0.12598003447055817, + "learning_rate": 2.4267640320956302e-05, + "loss": 0.2672, + "mean_token_accuracy": 0.8799456357955933, + "step": 1514 + }, + { + "epoch": 4.764150943396227, + "grad_norm": 0.125724658370018, + "learning_rate": 2.4249449091752922e-05, + "loss": 0.2374, + "mean_token_accuracy": 0.8862662315368652, + "step": 1515 + }, + { + "epoch": 4.767295597484277, + "grad_norm": 0.121979720890522, + "learning_rate": 2.4231255528300778e-05, + "loss": 0.2584, + "mean_token_accuracy": 0.8822363615036011, + "step": 1516 + }, + { + "epoch": 4.770440251572327, + "grad_norm": 0.12360676378011703, + "learning_rate": 2.421305964947929e-05, + "loss": 0.2559, + "mean_token_accuracy": 0.8823013305664062, + "step": 1517 + }, + { + "epoch": 4.773584905660377, + "grad_norm": 0.13420429825782776, + "learning_rate": 2.419486147417028e-05, + "loss": 0.2326, + "mean_token_accuracy": 0.8917833566665649, + "step": 1518 + }, + { + "epoch": 4.776729559748428, + "grad_norm": 0.12840516865253448, + "learning_rate": 2.417666102125797e-05, + "loss": 0.2455, + "mean_token_accuracy": 0.8831151127815247, + "step": 1519 + }, + { + "epoch": 4.779874213836478, + "grad_norm": 0.14521858096122742, + "learning_rate": 2.415845830962892e-05, + "loss": 0.2555, + "mean_token_accuracy": 0.8834573030471802, + "step": 1520 + }, + { + "epoch": 4.783018867924528, + "grad_norm": 0.13063116371631622, + "learning_rate": 2.4140253358172064e-05, + "loss": 0.2442, + "mean_token_accuracy": 0.8842610120773315, + "step": 1521 + }, + { + "epoch": 4.786163522012579, + "grad_norm": 0.13243721425533295, + "learning_rate": 2.4122046185778628e-05, + "loss": 0.2705, + "mean_token_accuracy": 0.8768439888954163, + "step": 1522 + }, + { + "epoch": 4.789308176100629, + "grad_norm": 0.14029352366924286, + "learning_rate": 2.4103836811342167e-05, + "loss": 0.2562, + "mean_token_accuracy": 0.8825986385345459, + "step": 1523 + }, + { + "epoch": 4.7924528301886795, + "grad_norm": 0.1397697478532791, + "learning_rate": 2.4085625253758505e-05, + "loss": 0.2495, + "mean_token_accuracy": 0.8802828788757324, + "step": 1524 + }, + { + "epoch": 4.79559748427673, + "grad_norm": 0.13132429122924805, + "learning_rate": 2.4067411531925736e-05, + "loss": 0.2662, + "mean_token_accuracy": 0.8824609518051147, + "step": 1525 + }, + { + "epoch": 4.79874213836478, + "grad_norm": 0.12849049270153046, + "learning_rate": 2.404919566474422e-05, + "loss": 0.24, + "mean_token_accuracy": 0.887162446975708, + "step": 1526 + }, + { + "epoch": 4.80188679245283, + "grad_norm": 0.1436852514743805, + "learning_rate": 2.40309776711165e-05, + "loss": 0.255, + "mean_token_accuracy": 0.8830965757369995, + "step": 1527 + }, + { + "epoch": 4.80503144654088, + "grad_norm": 0.12779057025909424, + "learning_rate": 2.401275756994737e-05, + "loss": 0.2673, + "mean_token_accuracy": 0.8801604509353638, + "step": 1528 + }, + { + "epoch": 4.8081761006289305, + "grad_norm": 0.13711079955101013, + "learning_rate": 2.399453538014379e-05, + "loss": 0.2391, + "mean_token_accuracy": 0.8846153020858765, + "step": 1529 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 0.12312142550945282, + "learning_rate": 2.397631112061488e-05, + "loss": 0.2518, + "mean_token_accuracy": 0.8821859955787659, + "step": 1530 + }, + { + "epoch": 4.814465408805032, + "grad_norm": 0.13619542121887207, + "learning_rate": 2.3958084810271927e-05, + "loss": 0.2531, + "mean_token_accuracy": 0.8836720585823059, + "step": 1531 + }, + { + "epoch": 4.817610062893082, + "grad_norm": 0.12221843004226685, + "learning_rate": 2.393985646802833e-05, + "loss": 0.2481, + "mean_token_accuracy": 0.8841218948364258, + "step": 1532 + }, + { + "epoch": 4.820754716981132, + "grad_norm": 0.12619447708129883, + "learning_rate": 2.392162611279961e-05, + "loss": 0.2502, + "mean_token_accuracy": 0.8820463418960571, + "step": 1533 + }, + { + "epoch": 4.823899371069182, + "grad_norm": 0.12062551081180573, + "learning_rate": 2.3903393763503355e-05, + "loss": 0.2734, + "mean_token_accuracy": 0.8766216039657593, + "step": 1534 + }, + { + "epoch": 4.827044025157233, + "grad_norm": 0.12718960642814636, + "learning_rate": 2.388515943905925e-05, + "loss": 0.2495, + "mean_token_accuracy": 0.8845596313476562, + "step": 1535 + }, + { + "epoch": 4.830188679245283, + "grad_norm": 0.12057467550039291, + "learning_rate": 2.3866923158389018e-05, + "loss": 0.2325, + "mean_token_accuracy": 0.8902129530906677, + "step": 1536 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.13578729331493378, + "learning_rate": 2.3848684940416402e-05, + "loss": 0.2785, + "mean_token_accuracy": 0.8787068128585815, + "step": 1537 + }, + { + "epoch": 4.836477987421384, + "grad_norm": 0.12010408192873001, + "learning_rate": 2.383044480406717e-05, + "loss": 0.2825, + "mean_token_accuracy": 0.8767572641372681, + "step": 1538 + }, + { + "epoch": 4.839622641509434, + "grad_norm": 0.12753932178020477, + "learning_rate": 2.381220276826907e-05, + "loss": 0.2458, + "mean_token_accuracy": 0.8807628750801086, + "step": 1539 + }, + { + "epoch": 4.8427672955974845, + "grad_norm": 0.12194069474935532, + "learning_rate": 2.3793958851951828e-05, + "loss": 0.2588, + "mean_token_accuracy": 0.8817402124404907, + "step": 1540 + }, + { + "epoch": 4.845911949685535, + "grad_norm": 0.1362626850605011, + "learning_rate": 2.377571307404712e-05, + "loss": 0.2514, + "mean_token_accuracy": 0.8836599588394165, + "step": 1541 + }, + { + "epoch": 4.849056603773585, + "grad_norm": 0.129854217171669, + "learning_rate": 2.3757465453488557e-05, + "loss": 0.262, + "mean_token_accuracy": 0.8794586062431335, + "step": 1542 + }, + { + "epoch": 4.852201257861635, + "grad_norm": 0.12912380695343018, + "learning_rate": 2.3739216009211644e-05, + "loss": 0.2602, + "mean_token_accuracy": 0.8814219832420349, + "step": 1543 + }, + { + "epoch": 4.855345911949685, + "grad_norm": 0.12774620950222015, + "learning_rate": 2.37209647601538e-05, + "loss": 0.2629, + "mean_token_accuracy": 0.881087064743042, + "step": 1544 + }, + { + "epoch": 4.8584905660377355, + "grad_norm": 0.1410323977470398, + "learning_rate": 2.370271172525431e-05, + "loss": 0.2469, + "mean_token_accuracy": 0.8836389780044556, + "step": 1545 + }, + { + "epoch": 4.861635220125786, + "grad_norm": 0.12685927748680115, + "learning_rate": 2.3684456923454316e-05, + "loss": 0.2493, + "mean_token_accuracy": 0.8849629759788513, + "step": 1546 + }, + { + "epoch": 4.864779874213837, + "grad_norm": 0.1375240981578827, + "learning_rate": 2.3666200373696785e-05, + "loss": 0.2671, + "mean_token_accuracy": 0.881800651550293, + "step": 1547 + }, + { + "epoch": 4.867924528301887, + "grad_norm": 0.128312885761261, + "learning_rate": 2.36479420949265e-05, + "loss": 0.2664, + "mean_token_accuracy": 0.8808472752571106, + "step": 1548 + }, + { + "epoch": 4.871069182389937, + "grad_norm": 0.13644251227378845, + "learning_rate": 2.3629682106090036e-05, + "loss": 0.2468, + "mean_token_accuracy": 0.883743405342102, + "step": 1549 + }, + { + "epoch": 4.8742138364779874, + "grad_norm": 0.143430694937706, + "learning_rate": 2.361142042613576e-05, + "loss": 0.2463, + "mean_token_accuracy": 0.8866894245147705, + "step": 1550 + }, + { + "epoch": 4.877358490566038, + "grad_norm": 0.12638427317142487, + "learning_rate": 2.3593157074013768e-05, + "loss": 0.2547, + "mean_token_accuracy": 0.881319522857666, + "step": 1551 + }, + { + "epoch": 4.880503144654088, + "grad_norm": 0.1527438908815384, + "learning_rate": 2.35748920686759e-05, + "loss": 0.2457, + "mean_token_accuracy": 0.8852053880691528, + "step": 1552 + }, + { + "epoch": 4.883647798742138, + "grad_norm": 0.12867127358913422, + "learning_rate": 2.355662542907573e-05, + "loss": 0.2653, + "mean_token_accuracy": 0.8813451528549194, + "step": 1553 + }, + { + "epoch": 4.886792452830189, + "grad_norm": 0.14862971007823944, + "learning_rate": 2.3538357174168497e-05, + "loss": 0.2605, + "mean_token_accuracy": 0.8801792860031128, + "step": 1554 + }, + { + "epoch": 4.889937106918239, + "grad_norm": 0.13223330676555634, + "learning_rate": 2.352008732291115e-05, + "loss": 0.2311, + "mean_token_accuracy": 0.8890978097915649, + "step": 1555 + }, + { + "epoch": 4.8930817610062896, + "grad_norm": 0.13772611320018768, + "learning_rate": 2.3501815894262265e-05, + "loss": 0.2641, + "mean_token_accuracy": 0.8789441585540771, + "step": 1556 + }, + { + "epoch": 4.89622641509434, + "grad_norm": 0.1223578229546547, + "learning_rate": 2.3483542907182066e-05, + "loss": 0.2585, + "mean_token_accuracy": 0.8826157450675964, + "step": 1557 + }, + { + "epoch": 4.89937106918239, + "grad_norm": 0.13819842040538788, + "learning_rate": 2.3465268380632394e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.880345344543457, + "step": 1558 + }, + { + "epoch": 4.90251572327044, + "grad_norm": 0.12548887729644775, + "learning_rate": 2.344699233357669e-05, + "loss": 0.2589, + "mean_token_accuracy": 0.8830808997154236, + "step": 1559 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 0.13984771072864532, + "learning_rate": 2.342871478497998e-05, + "loss": 0.2667, + "mean_token_accuracy": 0.8818701505661011, + "step": 1560 + }, + { + "epoch": 4.908805031446541, + "grad_norm": 0.12641951441764832, + "learning_rate": 2.341043575380883e-05, + "loss": 0.2563, + "mean_token_accuracy": 0.883787214756012, + "step": 1561 + }, + { + "epoch": 4.911949685534591, + "grad_norm": 0.14930589497089386, + "learning_rate": 2.339215525903135e-05, + "loss": 0.2669, + "mean_token_accuracy": 0.8794329762458801, + "step": 1562 + }, + { + "epoch": 4.915094339622642, + "grad_norm": 0.1343175768852234, + "learning_rate": 2.337387331961718e-05, + "loss": 0.2506, + "mean_token_accuracy": 0.8854690790176392, + "step": 1563 + }, + { + "epoch": 4.918238993710692, + "grad_norm": 0.13801869750022888, + "learning_rate": 2.3355589954537448e-05, + "loss": 0.2556, + "mean_token_accuracy": 0.8809491395950317, + "step": 1564 + }, + { + "epoch": 4.921383647798742, + "grad_norm": 0.13780134916305542, + "learning_rate": 2.3337305182764768e-05, + "loss": 0.2551, + "mean_token_accuracy": 0.8815582394599915, + "step": 1565 + }, + { + "epoch": 4.9245283018867925, + "grad_norm": 0.12479320913553238, + "learning_rate": 2.3319019023273203e-05, + "loss": 0.2442, + "mean_token_accuracy": 0.8860586881637573, + "step": 1566 + }, + { + "epoch": 4.927672955974843, + "grad_norm": 0.14212225377559662, + "learning_rate": 2.3300731495038272e-05, + "loss": 0.2428, + "mean_token_accuracy": 0.8860717415809631, + "step": 1567 + }, + { + "epoch": 4.930817610062893, + "grad_norm": 0.12582793831825256, + "learning_rate": 2.3282442617036906e-05, + "loss": 0.2483, + "mean_token_accuracy": 0.8846839070320129, + "step": 1568 + }, + { + "epoch": 4.933962264150943, + "grad_norm": 0.1322171986103058, + "learning_rate": 2.3264152408247433e-05, + "loss": 0.2527, + "mean_token_accuracy": 0.8855623602867126, + "step": 1569 + }, + { + "epoch": 4.937106918238994, + "grad_norm": 0.1252465844154358, + "learning_rate": 2.3245860887649564e-05, + "loss": 0.2588, + "mean_token_accuracy": 0.8804665207862854, + "step": 1570 + }, + { + "epoch": 4.940251572327044, + "grad_norm": 0.12859833240509033, + "learning_rate": 2.3227568074224376e-05, + "loss": 0.2635, + "mean_token_accuracy": 0.8824833035469055, + "step": 1571 + }, + { + "epoch": 4.943396226415095, + "grad_norm": 0.12550607323646545, + "learning_rate": 2.3209273986954287e-05, + "loss": 0.2491, + "mean_token_accuracy": 0.886185884475708, + "step": 1572 + }, + { + "epoch": 4.946540880503145, + "grad_norm": 0.1253989338874817, + "learning_rate": 2.319097864482302e-05, + "loss": 0.2512, + "mean_token_accuracy": 0.8835926651954651, + "step": 1573 + }, + { + "epoch": 4.949685534591195, + "grad_norm": 0.12692295014858246, + "learning_rate": 2.3172682066815636e-05, + "loss": 0.2585, + "mean_token_accuracy": 0.880564272403717, + "step": 1574 + }, + { + "epoch": 4.952830188679245, + "grad_norm": 0.1225670799612999, + "learning_rate": 2.3154384271918443e-05, + "loss": 0.2451, + "mean_token_accuracy": 0.8834196329116821, + "step": 1575 + }, + { + "epoch": 4.955974842767295, + "grad_norm": 0.11881422251462936, + "learning_rate": 2.3136085279119033e-05, + "loss": 0.2651, + "mean_token_accuracy": 0.879045307636261, + "step": 1576 + }, + { + "epoch": 4.959119496855346, + "grad_norm": 0.12771472334861755, + "learning_rate": 2.3117785107406224e-05, + "loss": 0.2527, + "mean_token_accuracy": 0.883535623550415, + "step": 1577 + }, + { + "epoch": 4.962264150943396, + "grad_norm": 0.12338591367006302, + "learning_rate": 2.3099483775770078e-05, + "loss": 0.2542, + "mean_token_accuracy": 0.8821091055870056, + "step": 1578 + }, + { + "epoch": 4.965408805031447, + "grad_norm": 0.12770599126815796, + "learning_rate": 2.308118130320184e-05, + "loss": 0.2622, + "mean_token_accuracy": 0.8797197937965393, + "step": 1579 + }, + { + "epoch": 4.968553459119497, + "grad_norm": 0.12757545709609985, + "learning_rate": 2.3062877708693957e-05, + "loss": 0.2512, + "mean_token_accuracy": 0.8837404847145081, + "step": 1580 + }, + { + "epoch": 4.971698113207547, + "grad_norm": 0.13121959567070007, + "learning_rate": 2.3044573011240025e-05, + "loss": 0.2521, + "mean_token_accuracy": 0.8823564052581787, + "step": 1581 + }, + { + "epoch": 4.9748427672955975, + "grad_norm": 0.12287531793117523, + "learning_rate": 2.30262672298348e-05, + "loss": 0.2531, + "mean_token_accuracy": 0.8842577338218689, + "step": 1582 + }, + { + "epoch": 4.977987421383648, + "grad_norm": 0.13713645935058594, + "learning_rate": 2.3007960383474148e-05, + "loss": 0.2495, + "mean_token_accuracy": 0.8849415183067322, + "step": 1583 + }, + { + "epoch": 4.981132075471698, + "grad_norm": 0.1428612619638443, + "learning_rate": 2.2989652491155043e-05, + "loss": 0.2563, + "mean_token_accuracy": 0.8826706409454346, + "step": 1584 + }, + { + "epoch": 4.984276729559748, + "grad_norm": 0.12580153346061707, + "learning_rate": 2.297134357187556e-05, + "loss": 0.2639, + "mean_token_accuracy": 0.878861665725708, + "step": 1585 + }, + { + "epoch": 4.987421383647799, + "grad_norm": 0.13792036473751068, + "learning_rate": 2.2953033644634813e-05, + "loss": 0.2754, + "mean_token_accuracy": 0.8782982230186462, + "step": 1586 + }, + { + "epoch": 4.990566037735849, + "grad_norm": 0.13837353885173798, + "learning_rate": 2.293472272843299e-05, + "loss": 0.2526, + "mean_token_accuracy": 0.8833487033843994, + "step": 1587 + }, + { + "epoch": 4.9937106918239, + "grad_norm": 0.14238163828849792, + "learning_rate": 2.2916410842271274e-05, + "loss": 0.2435, + "mean_token_accuracy": 0.8903558254241943, + "step": 1588 + }, + { + "epoch": 4.99685534591195, + "grad_norm": 0.14869339764118195, + "learning_rate": 2.2898098005151893e-05, + "loss": 0.2322, + "mean_token_accuracy": 0.8898493647575378, + "step": 1589 + }, + { + "epoch": 5.0, + "grad_norm": 0.14632923901081085, + "learning_rate": 2.2879784236078023e-05, + "loss": 0.2493, + "mean_token_accuracy": 0.8759793639183044, + "step": 1590 + }, + { + "epoch": 5.00314465408805, + "grad_norm": 0.22233015298843384, + "learning_rate": 2.286146955405384e-05, + "loss": 0.1916, + "mean_token_accuracy": 0.9095132946968079, + "step": 1591 + }, + { + "epoch": 5.0062893081761, + "grad_norm": 0.25025674700737, + "learning_rate": 2.2843153978084445e-05, + "loss": 0.1819, + "mean_token_accuracy": 0.9113137125968933, + "step": 1592 + }, + { + "epoch": 5.009433962264151, + "grad_norm": 0.2684067189693451, + "learning_rate": 2.282483752717587e-05, + "loss": 0.1972, + "mean_token_accuracy": 0.9079714417457581, + "step": 1593 + }, + { + "epoch": 5.012578616352202, + "grad_norm": 0.24029596149921417, + "learning_rate": 2.2806520220335066e-05, + "loss": 0.1678, + "mean_token_accuracy": 0.9172501564025879, + "step": 1594 + }, + { + "epoch": 5.015723270440252, + "grad_norm": 0.2321806699037552, + "learning_rate": 2.2788202076569866e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9155396223068237, + "step": 1595 + }, + { + "epoch": 5.018867924528302, + "grad_norm": 0.18021827936172485, + "learning_rate": 2.2769883114888964e-05, + "loss": 0.1703, + "mean_token_accuracy": 0.9156048893928528, + "step": 1596 + }, + { + "epoch": 5.022012578616352, + "grad_norm": 0.23715201020240784, + "learning_rate": 2.2751563354301915e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9144327044487, + "step": 1597 + }, + { + "epoch": 5.0251572327044025, + "grad_norm": 0.16609273850917816, + "learning_rate": 2.273324281381909e-05, + "loss": 0.1682, + "mean_token_accuracy": 0.9172971844673157, + "step": 1598 + }, + { + "epoch": 5.028301886792453, + "grad_norm": 0.1903139054775238, + "learning_rate": 2.271492151245169e-05, + "loss": 0.1598, + "mean_token_accuracy": 0.917824387550354, + "step": 1599 + }, + { + "epoch": 5.031446540880503, + "grad_norm": 0.17430105805397034, + "learning_rate": 2.269659946921168e-05, + "loss": 0.1585, + "mean_token_accuracy": 0.9176533818244934, + "step": 1600 + }, + { + "epoch": 5.034591194968553, + "grad_norm": 0.18532267212867737, + "learning_rate": 2.2678276703111814e-05, + "loss": 0.1731, + "mean_token_accuracy": 0.9151170253753662, + "step": 1601 + }, + { + "epoch": 5.037735849056604, + "grad_norm": 0.15971753001213074, + "learning_rate": 2.2659953233165583e-05, + "loss": 0.1707, + "mean_token_accuracy": 0.915444552898407, + "step": 1602 + }, + { + "epoch": 5.040880503144654, + "grad_norm": 0.1557256132364273, + "learning_rate": 2.2641629078387224e-05, + "loss": 0.1737, + "mean_token_accuracy": 0.9138659238815308, + "step": 1603 + }, + { + "epoch": 5.044025157232705, + "grad_norm": 0.16258005797863007, + "learning_rate": 2.2623304257791667e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.91518634557724, + "step": 1604 + }, + { + "epoch": 5.047169811320755, + "grad_norm": 0.149249866604805, + "learning_rate": 2.260497879039455e-05, + "loss": 0.188, + "mean_token_accuracy": 0.9087100625038147, + "step": 1605 + }, + { + "epoch": 5.050314465408805, + "grad_norm": 0.17389704287052155, + "learning_rate": 2.2586652695212158e-05, + "loss": 0.1846, + "mean_token_accuracy": 0.9124392867088318, + "step": 1606 + }, + { + "epoch": 5.053459119496855, + "grad_norm": 0.1585531234741211, + "learning_rate": 2.2568325991261456e-05, + "loss": 0.1816, + "mean_token_accuracy": 0.9147888422012329, + "step": 1607 + }, + { + "epoch": 5.056603773584905, + "grad_norm": 0.14732636511325836, + "learning_rate": 2.254999869756002e-05, + "loss": 0.1642, + "mean_token_accuracy": 0.9162658452987671, + "step": 1608 + }, + { + "epoch": 5.059748427672956, + "grad_norm": 0.14769305288791656, + "learning_rate": 2.2531670833126056e-05, + "loss": 0.1838, + "mean_token_accuracy": 0.913840651512146, + "step": 1609 + }, + { + "epoch": 5.062893081761007, + "grad_norm": 0.1692722588777542, + "learning_rate": 2.2513342416978347e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.9122536778450012, + "step": 1610 + }, + { + "epoch": 5.066037735849057, + "grad_norm": 0.14742763340473175, + "learning_rate": 2.2495013468136248e-05, + "loss": 0.1708, + "mean_token_accuracy": 0.9144207239151001, + "step": 1611 + }, + { + "epoch": 5.069182389937107, + "grad_norm": 0.14860284328460693, + "learning_rate": 2.2476684005619675e-05, + "loss": 0.1656, + "mean_token_accuracy": 0.9158710241317749, + "step": 1612 + }, + { + "epoch": 5.072327044025157, + "grad_norm": 0.14176800847053528, + "learning_rate": 2.2458354048449074e-05, + "loss": 0.1819, + "mean_token_accuracy": 0.9101191759109497, + "step": 1613 + }, + { + "epoch": 5.0754716981132075, + "grad_norm": 0.14405949413776398, + "learning_rate": 2.2440023615645404e-05, + "loss": 0.181, + "mean_token_accuracy": 0.9110443592071533, + "step": 1614 + }, + { + "epoch": 5.078616352201258, + "grad_norm": 0.13683363795280457, + "learning_rate": 2.242169272623012e-05, + "loss": 0.1714, + "mean_token_accuracy": 0.9149702191352844, + "step": 1615 + }, + { + "epoch": 5.081761006289308, + "grad_norm": 0.13986143469810486, + "learning_rate": 2.240336139922515e-05, + "loss": 0.1745, + "mean_token_accuracy": 0.9130762219429016, + "step": 1616 + }, + { + "epoch": 5.084905660377358, + "grad_norm": 0.1361878365278244, + "learning_rate": 2.2385029653652868e-05, + "loss": 0.1701, + "mean_token_accuracy": 0.9145857691764832, + "step": 1617 + }, + { + "epoch": 5.088050314465409, + "grad_norm": 0.1427004188299179, + "learning_rate": 2.23666975085361e-05, + "loss": 0.1744, + "mean_token_accuracy": 0.9139806032180786, + "step": 1618 + }, + { + "epoch": 5.091194968553459, + "grad_norm": 0.1479874551296234, + "learning_rate": 2.2348364982898075e-05, + "loss": 0.1791, + "mean_token_accuracy": 0.9146347045898438, + "step": 1619 + }, + { + "epoch": 5.09433962264151, + "grad_norm": 0.13859923183918, + "learning_rate": 2.2330032095762406e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9162372350692749, + "step": 1620 + }, + { + "epoch": 5.09748427672956, + "grad_norm": 0.14057232439517975, + "learning_rate": 2.2311698866153103e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9151803255081177, + "step": 1621 + }, + { + "epoch": 5.10062893081761, + "grad_norm": 0.1407633125782013, + "learning_rate": 2.2293365313094514e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9161638021469116, + "step": 1622 + }, + { + "epoch": 5.10377358490566, + "grad_norm": 0.14112257957458496, + "learning_rate": 2.2275031455611332e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.9182310104370117, + "step": 1623 + }, + { + "epoch": 5.1069182389937104, + "grad_norm": 0.13362768292427063, + "learning_rate": 2.225669731272857e-05, + "loss": 0.1804, + "mean_token_accuracy": 0.9142118096351624, + "step": 1624 + }, + { + "epoch": 5.110062893081761, + "grad_norm": 0.1374814510345459, + "learning_rate": 2.2238362903471525e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9115723371505737, + "step": 1625 + }, + { + "epoch": 5.113207547169812, + "grad_norm": 0.13537247478961945, + "learning_rate": 2.222002824686578e-05, + "loss": 0.1618, + "mean_token_accuracy": 0.9196138978004456, + "step": 1626 + }, + { + "epoch": 5.116352201257862, + "grad_norm": 0.13499002158641815, + "learning_rate": 2.2201693361937164e-05, + "loss": 0.1802, + "mean_token_accuracy": 0.9118757843971252, + "step": 1627 + }, + { + "epoch": 5.119496855345912, + "grad_norm": 0.1436554193496704, + "learning_rate": 2.218335826771176e-05, + "loss": 0.1892, + "mean_token_accuracy": 0.913590133190155, + "step": 1628 + }, + { + "epoch": 5.122641509433962, + "grad_norm": 0.12786825001239777, + "learning_rate": 2.216502298321585e-05, + "loss": 0.1811, + "mean_token_accuracy": 0.9122364521026611, + "step": 1629 + }, + { + "epoch": 5.1257861635220126, + "grad_norm": 0.1332731992006302, + "learning_rate": 2.2146687527475924e-05, + "loss": 0.1716, + "mean_token_accuracy": 0.9132739901542664, + "step": 1630 + }, + { + "epoch": 5.128930817610063, + "grad_norm": 0.12857694923877716, + "learning_rate": 2.2128351919518655e-05, + "loss": 0.1814, + "mean_token_accuracy": 0.9113665819168091, + "step": 1631 + }, + { + "epoch": 5.132075471698113, + "grad_norm": 0.13038919866085052, + "learning_rate": 2.211001617837085e-05, + "loss": 0.1664, + "mean_token_accuracy": 0.916091799736023, + "step": 1632 + }, + { + "epoch": 5.135220125786163, + "grad_norm": 0.1276601105928421, + "learning_rate": 2.2091680323059487e-05, + "loss": 0.1762, + "mean_token_accuracy": 0.9136810898780823, + "step": 1633 + }, + { + "epoch": 5.138364779874214, + "grad_norm": 0.1319180130958557, + "learning_rate": 2.2073344372611628e-05, + "loss": 0.1737, + "mean_token_accuracy": 0.9124643802642822, + "step": 1634 + }, + { + "epoch": 5.1415094339622645, + "grad_norm": 0.13005971908569336, + "learning_rate": 2.205500834605447e-05, + "loss": 0.163, + "mean_token_accuracy": 0.9175032377243042, + "step": 1635 + }, + { + "epoch": 5.144654088050315, + "grad_norm": 0.12882187962532043, + "learning_rate": 2.2036672262415265e-05, + "loss": 0.1598, + "mean_token_accuracy": 0.9175060391426086, + "step": 1636 + }, + { + "epoch": 5.147798742138365, + "grad_norm": 0.1321483552455902, + "learning_rate": 2.201833614072132e-05, + "loss": 0.1992, + "mean_token_accuracy": 0.9085779190063477, + "step": 1637 + }, + { + "epoch": 5.150943396226415, + "grad_norm": 0.12345718592405319, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9143081307411194, + "step": 1638 + }, + { + "epoch": 5.154088050314465, + "grad_norm": 0.13488464057445526, + "learning_rate": 2.1981663859278684e-05, + "loss": 0.1526, + "mean_token_accuracy": 0.9188219308853149, + "step": 1639 + }, + { + "epoch": 5.1572327044025155, + "grad_norm": 0.129329651594162, + "learning_rate": 2.1963327737584745e-05, + "loss": 0.1589, + "mean_token_accuracy": 0.9180588126182556, + "step": 1640 + }, + { + "epoch": 5.160377358490566, + "grad_norm": 0.13226263225078583, + "learning_rate": 2.194499165394554e-05, + "loss": 0.1732, + "mean_token_accuracy": 0.9152401685714722, + "step": 1641 + }, + { + "epoch": 5.163522012578617, + "grad_norm": 0.1298309713602066, + "learning_rate": 2.1926655627388378e-05, + "loss": 0.1664, + "mean_token_accuracy": 0.9183374047279358, + "step": 1642 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.13375401496887207, + "learning_rate": 2.1908319676940522e-05, + "loss": 0.164, + "mean_token_accuracy": 0.9196304082870483, + "step": 1643 + }, + { + "epoch": 5.169811320754717, + "grad_norm": 0.13043470680713654, + "learning_rate": 2.1889983821629154e-05, + "loss": 0.1738, + "mean_token_accuracy": 0.9157758951187134, + "step": 1644 + }, + { + "epoch": 5.172955974842767, + "grad_norm": 0.13828632235527039, + "learning_rate": 2.1871648080481354e-05, + "loss": 0.1681, + "mean_token_accuracy": 0.9176316261291504, + "step": 1645 + }, + { + "epoch": 5.176100628930818, + "grad_norm": 0.12848587334156036, + "learning_rate": 2.185331247252408e-05, + "loss": 0.1735, + "mean_token_accuracy": 0.9150087237358093, + "step": 1646 + }, + { + "epoch": 5.179245283018868, + "grad_norm": 0.13007590174674988, + "learning_rate": 2.1834977016784155e-05, + "loss": 0.2002, + "mean_token_accuracy": 0.9054135084152222, + "step": 1647 + }, + { + "epoch": 5.182389937106918, + "grad_norm": 0.1344444900751114, + "learning_rate": 2.181664173228825e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.9127033948898315, + "step": 1648 + }, + { + "epoch": 5.185534591194968, + "grad_norm": 0.1295919120311737, + "learning_rate": 2.1798306638062842e-05, + "loss": 0.1714, + "mean_token_accuracy": 0.9170975685119629, + "step": 1649 + }, + { + "epoch": 5.188679245283019, + "grad_norm": 0.1406605988740921, + "learning_rate": 2.177997175313423e-05, + "loss": 0.1525, + "mean_token_accuracy": 0.9223353266716003, + "step": 1650 + }, + { + "epoch": 5.1918238993710695, + "grad_norm": 0.127178356051445, + "learning_rate": 2.1761637096528477e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9182380437850952, + "step": 1651 + }, + { + "epoch": 5.19496855345912, + "grad_norm": 0.13574561476707458, + "learning_rate": 2.174330268727143e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9154348969459534, + "step": 1652 + }, + { + "epoch": 5.19811320754717, + "grad_norm": 0.1314130425453186, + "learning_rate": 2.172496854438867e-05, + "loss": 0.1687, + "mean_token_accuracy": 0.9156013131141663, + "step": 1653 + }, + { + "epoch": 5.20125786163522, + "grad_norm": 0.13452252745628357, + "learning_rate": 2.1706634686905495e-05, + "loss": 0.161, + "mean_token_accuracy": 0.9191825985908508, + "step": 1654 + }, + { + "epoch": 5.20440251572327, + "grad_norm": 0.13440141081809998, + "learning_rate": 2.1688301133846906e-05, + "loss": 0.1809, + "mean_token_accuracy": 0.9128972887992859, + "step": 1655 + }, + { + "epoch": 5.2075471698113205, + "grad_norm": 0.12809255719184875, + "learning_rate": 2.1669967904237603e-05, + "loss": 0.1729, + "mean_token_accuracy": 0.9155882000923157, + "step": 1656 + }, + { + "epoch": 5.210691823899371, + "grad_norm": 0.1266845464706421, + "learning_rate": 2.1651635017101934e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9155396223068237, + "step": 1657 + }, + { + "epoch": 5.213836477987422, + "grad_norm": 0.1275676041841507, + "learning_rate": 2.1633302491463905e-05, + "loss": 0.1685, + "mean_token_accuracy": 0.916077196598053, + "step": 1658 + }, + { + "epoch": 5.216981132075472, + "grad_norm": 0.13584747910499573, + "learning_rate": 2.1614970346347137e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9124895334243774, + "step": 1659 + }, + { + "epoch": 5.220125786163522, + "grad_norm": 0.12740573287010193, + "learning_rate": 2.1596638600774855e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9163357615470886, + "step": 1660 + }, + { + "epoch": 5.223270440251572, + "grad_norm": 0.12712137401103973, + "learning_rate": 2.1578307273769887e-05, + "loss": 0.1706, + "mean_token_accuracy": 0.9162405133247375, + "step": 1661 + }, + { + "epoch": 5.226415094339623, + "grad_norm": 0.12990765273571014, + "learning_rate": 2.1559976384354605e-05, + "loss": 0.1547, + "mean_token_accuracy": 0.9218593835830688, + "step": 1662 + }, + { + "epoch": 5.229559748427673, + "grad_norm": 0.13632164895534515, + "learning_rate": 2.1541645951550935e-05, + "loss": 0.1727, + "mean_token_accuracy": 0.9177563190460205, + "step": 1663 + }, + { + "epoch": 5.232704402515723, + "grad_norm": 0.1234545111656189, + "learning_rate": 2.1523315994380338e-05, + "loss": 0.1638, + "mean_token_accuracy": 0.9165735840797424, + "step": 1664 + }, + { + "epoch": 5.235849056603773, + "grad_norm": 0.14210264384746552, + "learning_rate": 2.1504986531863765e-05, + "loss": 0.1689, + "mean_token_accuracy": 0.9141770005226135, + "step": 1665 + }, + { + "epoch": 5.238993710691824, + "grad_norm": 0.13037016987800598, + "learning_rate": 2.148665758302167e-05, + "loss": 0.1708, + "mean_token_accuracy": 0.915972113609314, + "step": 1666 + }, + { + "epoch": 5.2421383647798745, + "grad_norm": 0.1427024006843567, + "learning_rate": 2.1468329166873953e-05, + "loss": 0.1919, + "mean_token_accuracy": 0.9098140597343445, + "step": 1667 + }, + { + "epoch": 5.245283018867925, + "grad_norm": 0.12425141036510468, + "learning_rate": 2.1450001302439984e-05, + "loss": 0.1646, + "mean_token_accuracy": 0.9146043658256531, + "step": 1668 + }, + { + "epoch": 5.248427672955975, + "grad_norm": 0.13127084076404572, + "learning_rate": 2.1431674008738553e-05, + "loss": 0.1644, + "mean_token_accuracy": 0.9161396026611328, + "step": 1669 + }, + { + "epoch": 5.251572327044025, + "grad_norm": 0.12927378714084625, + "learning_rate": 2.1413347304787854e-05, + "loss": 0.1653, + "mean_token_accuracy": 0.9174731373786926, + "step": 1670 + }, + { + "epoch": 5.254716981132075, + "grad_norm": 0.13044291734695435, + "learning_rate": 2.1395021209605464e-05, + "loss": 0.1757, + "mean_token_accuracy": 0.9150776863098145, + "step": 1671 + }, + { + "epoch": 5.2578616352201255, + "grad_norm": 0.1301099807024002, + "learning_rate": 2.1376695742208345e-05, + "loss": 0.1895, + "mean_token_accuracy": 0.9120878577232361, + "step": 1672 + }, + { + "epoch": 5.261006289308176, + "grad_norm": 0.135678231716156, + "learning_rate": 2.135837092161279e-05, + "loss": 0.1811, + "mean_token_accuracy": 0.9135366082191467, + "step": 1673 + }, + { + "epoch": 5.264150943396227, + "grad_norm": 0.12597790360450745, + "learning_rate": 2.1340046766834423e-05, + "loss": 0.183, + "mean_token_accuracy": 0.9111697673797607, + "step": 1674 + }, + { + "epoch": 5.267295597484277, + "grad_norm": 0.1251760572195053, + "learning_rate": 2.1321723296888198e-05, + "loss": 0.1835, + "mean_token_accuracy": 0.9082761406898499, + "step": 1675 + }, + { + "epoch": 5.270440251572327, + "grad_norm": 0.12519238889217377, + "learning_rate": 2.130340053078833e-05, + "loss": 0.166, + "mean_token_accuracy": 0.9158846735954285, + "step": 1676 + }, + { + "epoch": 5.273584905660377, + "grad_norm": 0.1259630024433136, + "learning_rate": 2.128507848754832e-05, + "loss": 0.1748, + "mean_token_accuracy": 0.9130755662918091, + "step": 1677 + }, + { + "epoch": 5.276729559748428, + "grad_norm": 0.13136066496372223, + "learning_rate": 2.1266757186180915e-05, + "loss": 0.1796, + "mean_token_accuracy": 0.9120888113975525, + "step": 1678 + }, + { + "epoch": 5.279874213836478, + "grad_norm": 0.1244376003742218, + "learning_rate": 2.1248436645698094e-05, + "loss": 0.1751, + "mean_token_accuracy": 0.9147348999977112, + "step": 1679 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 0.1334747076034546, + "learning_rate": 2.1230116885111048e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9149150252342224, + "step": 1680 + }, + { + "epoch": 5.286163522012578, + "grad_norm": 0.12724976241588593, + "learning_rate": 2.1211797923430146e-05, + "loss": 0.1748, + "mean_token_accuracy": 0.9149224162101746, + "step": 1681 + }, + { + "epoch": 5.289308176100629, + "grad_norm": 0.13602736592292786, + "learning_rate": 2.119347977966494e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9136221408843994, + "step": 1682 + }, + { + "epoch": 5.2924528301886795, + "grad_norm": 0.12472439557313919, + "learning_rate": 2.117516247282414e-05, + "loss": 0.1674, + "mean_token_accuracy": 0.9162774682044983, + "step": 1683 + }, + { + "epoch": 5.29559748427673, + "grad_norm": 0.1277409791946411, + "learning_rate": 2.1156846021915568e-05, + "loss": 0.1737, + "mean_token_accuracy": 0.9161177277565002, + "step": 1684 + }, + { + "epoch": 5.29874213836478, + "grad_norm": 0.13467463850975037, + "learning_rate": 2.1138530445946167e-05, + "loss": 0.1769, + "mean_token_accuracy": 0.9126297235488892, + "step": 1685 + }, + { + "epoch": 5.30188679245283, + "grad_norm": 0.13067930936813354, + "learning_rate": 2.1120215763921982e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9156489968299866, + "step": 1686 + }, + { + "epoch": 5.30503144654088, + "grad_norm": 0.13587665557861328, + "learning_rate": 2.1101901994848113e-05, + "loss": 0.1763, + "mean_token_accuracy": 0.9159653186798096, + "step": 1687 + }, + { + "epoch": 5.3081761006289305, + "grad_norm": 0.13601058721542358, + "learning_rate": 2.1083589157728728e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9139232635498047, + "step": 1688 + }, + { + "epoch": 5.311320754716981, + "grad_norm": 0.13745374977588654, + "learning_rate": 2.1065277271567017e-05, + "loss": 0.1679, + "mean_token_accuracy": 0.9164733290672302, + "step": 1689 + }, + { + "epoch": 5.314465408805032, + "grad_norm": 0.1407531201839447, + "learning_rate": 2.104696635536519e-05, + "loss": 0.1876, + "mean_token_accuracy": 0.9122368693351746, + "step": 1690 + }, + { + "epoch": 5.317610062893082, + "grad_norm": 0.1359671652317047, + "learning_rate": 2.1028656428124442e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.9110690355300903, + "step": 1691 + }, + { + "epoch": 5.320754716981132, + "grad_norm": 0.13148832321166992, + "learning_rate": 2.101034750884496e-05, + "loss": 0.1611, + "mean_token_accuracy": 0.9165538549423218, + "step": 1692 + }, + { + "epoch": 5.323899371069182, + "grad_norm": 0.13991323113441467, + "learning_rate": 2.0992039616525858e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.9136778712272644, + "step": 1693 + }, + { + "epoch": 5.327044025157233, + "grad_norm": 0.14232312142848969, + "learning_rate": 2.09737327701652e-05, + "loss": 0.1744, + "mean_token_accuracy": 0.9157080054283142, + "step": 1694 + }, + { + "epoch": 5.330188679245283, + "grad_norm": 0.12538862228393555, + "learning_rate": 2.0955426988759978e-05, + "loss": 0.1733, + "mean_token_accuracy": 0.9145515561103821, + "step": 1695 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.14959652721881866, + "learning_rate": 2.093712229130605e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9126619100570679, + "step": 1696 + }, + { + "epoch": 5.336477987421383, + "grad_norm": 0.13184374570846558, + "learning_rate": 2.0918818696798162e-05, + "loss": 0.1756, + "mean_token_accuracy": 0.9145973920822144, + "step": 1697 + }, + { + "epoch": 5.339622641509434, + "grad_norm": 0.15026512742042542, + "learning_rate": 2.0900516224229924e-05, + "loss": 0.1735, + "mean_token_accuracy": 0.9127100706100464, + "step": 1698 + }, + { + "epoch": 5.3427672955974845, + "grad_norm": 0.12787233293056488, + "learning_rate": 2.088221489259378e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9178786873817444, + "step": 1699 + }, + { + "epoch": 5.345911949685535, + "grad_norm": 0.14001305401325226, + "learning_rate": 2.086391472088097e-05, + "loss": 0.1707, + "mean_token_accuracy": 0.916415810585022, + "step": 1700 + }, + { + "epoch": 5.349056603773585, + "grad_norm": 0.13622647523880005, + "learning_rate": 2.084561572808156e-05, + "loss": 0.1764, + "mean_token_accuracy": 0.9140340685844421, + "step": 1701 + }, + { + "epoch": 5.352201257861635, + "grad_norm": 0.13659904897212982, + "learning_rate": 2.0827317933184366e-05, + "loss": 0.1679, + "mean_token_accuracy": 0.9176750779151917, + "step": 1702 + }, + { + "epoch": 5.355345911949685, + "grad_norm": 0.1333339959383011, + "learning_rate": 2.0809021355176982e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9108352065086365, + "step": 1703 + }, + { + "epoch": 5.3584905660377355, + "grad_norm": 0.1358334869146347, + "learning_rate": 2.0790726013045722e-05, + "loss": 0.1773, + "mean_token_accuracy": 0.9096439480781555, + "step": 1704 + }, + { + "epoch": 5.361635220125786, + "grad_norm": 0.13096484541893005, + "learning_rate": 2.077243192577563e-05, + "loss": 0.175, + "mean_token_accuracy": 0.915550172328949, + "step": 1705 + }, + { + "epoch": 5.364779874213837, + "grad_norm": 0.1310902237892151, + "learning_rate": 2.0754139112350442e-05, + "loss": 0.1847, + "mean_token_accuracy": 0.9114246368408203, + "step": 1706 + }, + { + "epoch": 5.367924528301887, + "grad_norm": 0.12905707955360413, + "learning_rate": 2.0735847591752573e-05, + "loss": 0.1707, + "mean_token_accuracy": 0.9200414419174194, + "step": 1707 + }, + { + "epoch": 5.371069182389937, + "grad_norm": 0.14065292477607727, + "learning_rate": 2.07175573829631e-05, + "loss": 0.1855, + "mean_token_accuracy": 0.9103058576583862, + "step": 1708 + }, + { + "epoch": 5.3742138364779874, + "grad_norm": 0.1275883913040161, + "learning_rate": 2.069926850496173e-05, + "loss": 0.1907, + "mean_token_accuracy": 0.9100257158279419, + "step": 1709 + }, + { + "epoch": 5.377358490566038, + "grad_norm": 0.12739497423171997, + "learning_rate": 2.0680980976726803e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9135763049125671, + "step": 1710 + }, + { + "epoch": 5.380503144654088, + "grad_norm": 0.13199791312217712, + "learning_rate": 2.066269481723524e-05, + "loss": 0.1763, + "mean_token_accuracy": 0.9162415862083435, + "step": 1711 + }, + { + "epoch": 5.383647798742138, + "grad_norm": 0.13175837695598602, + "learning_rate": 2.0644410045462558e-05, + "loss": 0.171, + "mean_token_accuracy": 0.9147182703018188, + "step": 1712 + }, + { + "epoch": 5.386792452830189, + "grad_norm": 0.12402436137199402, + "learning_rate": 2.0626126680382827e-05, + "loss": 0.1772, + "mean_token_accuracy": 0.9118449091911316, + "step": 1713 + }, + { + "epoch": 5.389937106918239, + "grad_norm": 0.13606113195419312, + "learning_rate": 2.060784474096866e-05, + "loss": 0.1791, + "mean_token_accuracy": 0.9129552841186523, + "step": 1714 + }, + { + "epoch": 5.3930817610062896, + "grad_norm": 0.12390584498643875, + "learning_rate": 2.0589564246191175e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.911918044090271, + "step": 1715 + }, + { + "epoch": 5.39622641509434, + "grad_norm": 0.13724228739738464, + "learning_rate": 2.057128521502002e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9111776947975159, + "step": 1716 + }, + { + "epoch": 5.39937106918239, + "grad_norm": 0.1254095882177353, + "learning_rate": 2.055300766642331e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9121692180633545, + "step": 1717 + }, + { + "epoch": 5.40251572327044, + "grad_norm": 0.12512388825416565, + "learning_rate": 2.0534731619367608e-05, + "loss": 0.1692, + "mean_token_accuracy": 0.9146745800971985, + "step": 1718 + }, + { + "epoch": 5.40566037735849, + "grad_norm": 0.131209135055542, + "learning_rate": 2.0516457092817946e-05, + "loss": 0.1871, + "mean_token_accuracy": 0.9084495306015015, + "step": 1719 + }, + { + "epoch": 5.408805031446541, + "grad_norm": 0.12044340372085571, + "learning_rate": 2.0498184105737744e-05, + "loss": 0.185, + "mean_token_accuracy": 0.9076829552650452, + "step": 1720 + }, + { + "epoch": 5.411949685534591, + "grad_norm": 0.13348786532878876, + "learning_rate": 2.047991267708886e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9145122170448303, + "step": 1721 + }, + { + "epoch": 5.415094339622642, + "grad_norm": 0.132951557636261, + "learning_rate": 2.0461642825831505e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9118179678916931, + "step": 1722 + }, + { + "epoch": 5.418238993710692, + "grad_norm": 0.1334230601787567, + "learning_rate": 2.0443374570924276e-05, + "loss": 0.1851, + "mean_token_accuracy": 0.9120337963104248, + "step": 1723 + }, + { + "epoch": 5.421383647798742, + "grad_norm": 0.12848825752735138, + "learning_rate": 2.04251079313241e-05, + "loss": 0.1833, + "mean_token_accuracy": 0.9081471562385559, + "step": 1724 + }, + { + "epoch": 5.4245283018867925, + "grad_norm": 0.13386452198028564, + "learning_rate": 2.0406842925986244e-05, + "loss": 0.1887, + "mean_token_accuracy": 0.9072440266609192, + "step": 1725 + }, + { + "epoch": 5.427672955974843, + "grad_norm": 0.12583881616592407, + "learning_rate": 2.0388579573864248e-05, + "loss": 0.1695, + "mean_token_accuracy": 0.9142518639564514, + "step": 1726 + }, + { + "epoch": 5.430817610062893, + "grad_norm": 0.12993434071540833, + "learning_rate": 2.0370317893909967e-05, + "loss": 0.181, + "mean_token_accuracy": 0.9098360538482666, + "step": 1727 + }, + { + "epoch": 5.433962264150943, + "grad_norm": 0.13069041073322296, + "learning_rate": 2.035205790507351e-05, + "loss": 0.1693, + "mean_token_accuracy": 0.9145693778991699, + "step": 1728 + }, + { + "epoch": 5.437106918238994, + "grad_norm": 0.12848743796348572, + "learning_rate": 2.0333799626303224e-05, + "loss": 0.1598, + "mean_token_accuracy": 0.9185672998428345, + "step": 1729 + }, + { + "epoch": 5.440251572327044, + "grad_norm": 0.12107231467962265, + "learning_rate": 2.031554307654569e-05, + "loss": 0.1802, + "mean_token_accuracy": 0.9108401536941528, + "step": 1730 + }, + { + "epoch": 5.443396226415095, + "grad_norm": 0.1265760362148285, + "learning_rate": 2.0297288274745694e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9128392338752747, + "step": 1731 + }, + { + "epoch": 5.446540880503145, + "grad_norm": 0.1304648518562317, + "learning_rate": 2.0279035239846204e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9115615487098694, + "step": 1732 + }, + { + "epoch": 5.449685534591195, + "grad_norm": 0.12654311954975128, + "learning_rate": 2.0260783990788365e-05, + "loss": 0.1839, + "mean_token_accuracy": 0.9107187986373901, + "step": 1733 + }, + { + "epoch": 5.452830188679245, + "grad_norm": 0.12844638526439667, + "learning_rate": 2.0242534546511456e-05, + "loss": 0.1736, + "mean_token_accuracy": 0.9131293296813965, + "step": 1734 + }, + { + "epoch": 5.455974842767295, + "grad_norm": 0.12657323479652405, + "learning_rate": 2.0224286925952885e-05, + "loss": 0.1663, + "mean_token_accuracy": 0.9194426536560059, + "step": 1735 + }, + { + "epoch": 5.459119496855346, + "grad_norm": 0.12937375903129578, + "learning_rate": 2.0206041148048177e-05, + "loss": 0.1948, + "mean_token_accuracy": 0.9075583219528198, + "step": 1736 + }, + { + "epoch": 5.462264150943396, + "grad_norm": 0.12683670222759247, + "learning_rate": 2.0187797231730933e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9198793172836304, + "step": 1737 + }, + { + "epoch": 5.465408805031447, + "grad_norm": 0.12588712573051453, + "learning_rate": 2.016955519593284e-05, + "loss": 0.1909, + "mean_token_accuracy": 0.9105217456817627, + "step": 1738 + }, + { + "epoch": 5.468553459119497, + "grad_norm": 0.12617500126361847, + "learning_rate": 2.0151315059583603e-05, + "loss": 0.1721, + "mean_token_accuracy": 0.915179431438446, + "step": 1739 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 0.12191683799028397, + "learning_rate": 2.0133076841610987e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9211398959159851, + "step": 1740 + }, + { + "epoch": 5.4748427672955975, + "grad_norm": 0.12133314460515976, + "learning_rate": 2.011484056094075e-05, + "loss": 0.1857, + "mean_token_accuracy": 0.9101731777191162, + "step": 1741 + }, + { + "epoch": 5.477987421383648, + "grad_norm": 0.12632842361927032, + "learning_rate": 2.009660623649665e-05, + "loss": 0.1752, + "mean_token_accuracy": 0.9160427451133728, + "step": 1742 + }, + { + "epoch": 5.481132075471698, + "grad_norm": 0.12612132728099823, + "learning_rate": 2.0078373887200402e-05, + "loss": 0.1484, + "mean_token_accuracy": 0.9197033047676086, + "step": 1743 + }, + { + "epoch": 5.484276729559748, + "grad_norm": 0.1312422752380371, + "learning_rate": 2.0060143531971676e-05, + "loss": 0.1624, + "mean_token_accuracy": 0.9184711575508118, + "step": 1744 + }, + { + "epoch": 5.487421383647799, + "grad_norm": 0.12046711146831512, + "learning_rate": 2.0041915189728082e-05, + "loss": 0.1493, + "mean_token_accuracy": 0.9204055070877075, + "step": 1745 + }, + { + "epoch": 5.490566037735849, + "grad_norm": 0.13251107931137085, + "learning_rate": 2.0023688879385123e-05, + "loss": 0.1806, + "mean_token_accuracy": 0.9120727181434631, + "step": 1746 + }, + { + "epoch": 5.4937106918239, + "grad_norm": 0.11958485841751099, + "learning_rate": 2.000546461985622e-05, + "loss": 0.1747, + "mean_token_accuracy": 0.9143047332763672, + "step": 1747 + }, + { + "epoch": 5.49685534591195, + "grad_norm": 0.12881681323051453, + "learning_rate": 1.998724243005264e-05, + "loss": 0.1825, + "mean_token_accuracy": 0.9094586968421936, + "step": 1748 + }, + { + "epoch": 5.5, + "grad_norm": 0.1296965628862381, + "learning_rate": 1.99690223288835e-05, + "loss": 0.1849, + "mean_token_accuracy": 0.9092406630516052, + "step": 1749 + }, + { + "epoch": 5.50314465408805, + "grad_norm": 0.1288004368543625, + "learning_rate": 1.995080433525579e-05, + "loss": 0.1914, + "mean_token_accuracy": 0.9089462757110596, + "step": 1750 + }, + { + "epoch": 5.5062893081761, + "grad_norm": 0.11885450780391693, + "learning_rate": 1.9932588468074266e-05, + "loss": 0.1653, + "mean_token_accuracy": 0.9152488708496094, + "step": 1751 + }, + { + "epoch": 5.509433962264151, + "grad_norm": 0.12399091571569443, + "learning_rate": 1.9914374746241504e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9103871583938599, + "step": 1752 + }, + { + "epoch": 5.512578616352201, + "grad_norm": 0.12655200064182281, + "learning_rate": 1.9896163188657846e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9169356822967529, + "step": 1753 + }, + { + "epoch": 5.515723270440252, + "grad_norm": 0.13177834451198578, + "learning_rate": 1.9877953814221378e-05, + "loss": 0.182, + "mean_token_accuracy": 0.9119299650192261, + "step": 1754 + }, + { + "epoch": 5.518867924528302, + "grad_norm": 0.12567467987537384, + "learning_rate": 1.9859746641827945e-05, + "loss": 0.1893, + "mean_token_accuracy": 0.9095203876495361, + "step": 1755 + }, + { + "epoch": 5.522012578616352, + "grad_norm": 0.12787534296512604, + "learning_rate": 1.984154169037108e-05, + "loss": 0.1861, + "mean_token_accuracy": 0.9093389511108398, + "step": 1756 + }, + { + "epoch": 5.5251572327044025, + "grad_norm": 0.12269425392150879, + "learning_rate": 1.9823338978742038e-05, + "loss": 0.1841, + "mean_token_accuracy": 0.9123227000236511, + "step": 1757 + }, + { + "epoch": 5.528301886792453, + "grad_norm": 0.12528327107429504, + "learning_rate": 1.9805138525829724e-05, + "loss": 0.176, + "mean_token_accuracy": 0.9133109450340271, + "step": 1758 + }, + { + "epoch": 5.531446540880503, + "grad_norm": 0.1250915825366974, + "learning_rate": 1.978694035052072e-05, + "loss": 0.1739, + "mean_token_accuracy": 0.9134659171104431, + "step": 1759 + }, + { + "epoch": 5.534591194968553, + "grad_norm": 0.1279488205909729, + "learning_rate": 1.9768744471699234e-05, + "loss": 0.1851, + "mean_token_accuracy": 0.9121567606925964, + "step": 1760 + }, + { + "epoch": 5.537735849056604, + "grad_norm": 0.12545357644557953, + "learning_rate": 1.9750550908247087e-05, + "loss": 0.1719, + "mean_token_accuracy": 0.9140828251838684, + "step": 1761 + }, + { + "epoch": 5.540880503144654, + "grad_norm": 0.12257158756256104, + "learning_rate": 1.973235967904371e-05, + "loss": 0.166, + "mean_token_accuracy": 0.9187101125717163, + "step": 1762 + }, + { + "epoch": 5.544025157232705, + "grad_norm": 0.12817668914794922, + "learning_rate": 1.97141708029661e-05, + "loss": 0.165, + "mean_token_accuracy": 0.9164320230484009, + "step": 1763 + }, + { + "epoch": 5.547169811320755, + "grad_norm": 0.11913950741291046, + "learning_rate": 1.9695984298888815e-05, + "loss": 0.1715, + "mean_token_accuracy": 0.9155896902084351, + "step": 1764 + }, + { + "epoch": 5.550314465408805, + "grad_norm": 0.12410813570022583, + "learning_rate": 1.9677800185683957e-05, + "loss": 0.1698, + "mean_token_accuracy": 0.9146424531936646, + "step": 1765 + }, + { + "epoch": 5.553459119496855, + "grad_norm": 0.12645314633846283, + "learning_rate": 1.9659618482221142e-05, + "loss": 0.1757, + "mean_token_accuracy": 0.9114490747451782, + "step": 1766 + }, + { + "epoch": 5.556603773584905, + "grad_norm": 0.1207004114985466, + "learning_rate": 1.964143920736749e-05, + "loss": 0.1799, + "mean_token_accuracy": 0.9112210273742676, + "step": 1767 + }, + { + "epoch": 5.559748427672956, + "grad_norm": 0.12781473994255066, + "learning_rate": 1.96232623799876e-05, + "loss": 0.1891, + "mean_token_accuracy": 0.9095780253410339, + "step": 1768 + }, + { + "epoch": 5.562893081761006, + "grad_norm": 0.12544502317905426, + "learning_rate": 1.960508801894352e-05, + "loss": 0.168, + "mean_token_accuracy": 0.9177098274230957, + "step": 1769 + }, + { + "epoch": 5.566037735849057, + "grad_norm": 0.13463300466537476, + "learning_rate": 1.958691614309475e-05, + "loss": 0.1853, + "mean_token_accuracy": 0.9120203256607056, + "step": 1770 + }, + { + "epoch": 5.569182389937107, + "grad_norm": 0.12383488565683365, + "learning_rate": 1.9568746771298214e-05, + "loss": 0.1787, + "mean_token_accuracy": 0.9111815094947815, + "step": 1771 + }, + { + "epoch": 5.572327044025157, + "grad_norm": 0.13381510972976685, + "learning_rate": 1.9550579922408232e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9136260747909546, + "step": 1772 + }, + { + "epoch": 5.5754716981132075, + "grad_norm": 0.11905031651258469, + "learning_rate": 1.9532415615276497e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9129524827003479, + "step": 1773 + }, + { + "epoch": 5.578616352201258, + "grad_norm": 0.12877804040908813, + "learning_rate": 1.951425386875208e-05, + "loss": 0.1911, + "mean_token_accuracy": 0.9062658548355103, + "step": 1774 + }, + { + "epoch": 5.581761006289308, + "grad_norm": 0.13063427805900574, + "learning_rate": 1.949609470168139e-05, + "loss": 0.1784, + "mean_token_accuracy": 0.9149458408355713, + "step": 1775 + }, + { + "epoch": 5.584905660377358, + "grad_norm": 0.1270115077495575, + "learning_rate": 1.947793813290816e-05, + "loss": 0.1811, + "mean_token_accuracy": 0.911460816860199, + "step": 1776 + }, + { + "epoch": 5.588050314465409, + "grad_norm": 0.12923474609851837, + "learning_rate": 1.945978418127342e-05, + "loss": 0.1786, + "mean_token_accuracy": 0.9101095199584961, + "step": 1777 + }, + { + "epoch": 5.591194968553459, + "grad_norm": 0.12055451422929764, + "learning_rate": 1.944163286561549e-05, + "loss": 0.1749, + "mean_token_accuracy": 0.9124361872673035, + "step": 1778 + }, + { + "epoch": 5.59433962264151, + "grad_norm": 0.13345621526241302, + "learning_rate": 1.9423484204769955e-05, + "loss": 0.1647, + "mean_token_accuracy": 0.9183674454689026, + "step": 1779 + }, + { + "epoch": 5.59748427672956, + "grad_norm": 0.1298670917749405, + "learning_rate": 1.940533821756964e-05, + "loss": 0.172, + "mean_token_accuracy": 0.9127525091171265, + "step": 1780 + }, + { + "epoch": 5.60062893081761, + "grad_norm": 0.12359672784805298, + "learning_rate": 1.93871949228446e-05, + "loss": 0.182, + "mean_token_accuracy": 0.9089277386665344, + "step": 1781 + }, + { + "epoch": 5.60377358490566, + "grad_norm": 0.12858720123767853, + "learning_rate": 1.9369054339422102e-05, + "loss": 0.1941, + "mean_token_accuracy": 0.9098080992698669, + "step": 1782 + }, + { + "epoch": 5.6069182389937104, + "grad_norm": 0.12454716861248016, + "learning_rate": 1.935091648612658e-05, + "loss": 0.1633, + "mean_token_accuracy": 0.9196724891662598, + "step": 1783 + }, + { + "epoch": 5.610062893081761, + "grad_norm": 0.1321592628955841, + "learning_rate": 1.9332781381779657e-05, + "loss": 0.1621, + "mean_token_accuracy": 0.9194785952568054, + "step": 1784 + }, + { + "epoch": 5.613207547169811, + "grad_norm": 0.12804855406284332, + "learning_rate": 1.9314649045200098e-05, + "loss": 0.187, + "mean_token_accuracy": 0.912610650062561, + "step": 1785 + }, + { + "epoch": 5.616352201257862, + "grad_norm": 0.12372619658708572, + "learning_rate": 1.9296519495203778e-05, + "loss": 0.176, + "mean_token_accuracy": 0.9116061925888062, + "step": 1786 + }, + { + "epoch": 5.619496855345912, + "grad_norm": 0.12314292788505554, + "learning_rate": 1.9278392750603704e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9096059203147888, + "step": 1787 + }, + { + "epoch": 5.622641509433962, + "grad_norm": 0.12165670096874237, + "learning_rate": 1.9260268830209963e-05, + "loss": 0.168, + "mean_token_accuracy": 0.9138624668121338, + "step": 1788 + }, + { + "epoch": 5.6257861635220126, + "grad_norm": 0.1263875663280487, + "learning_rate": 1.924214775282971e-05, + "loss": 0.1991, + "mean_token_accuracy": 0.906342089176178, + "step": 1789 + }, + { + "epoch": 5.628930817610063, + "grad_norm": 0.12722733616828918, + "learning_rate": 1.9224029537267147e-05, + "loss": 0.178, + "mean_token_accuracy": 0.9143258929252625, + "step": 1790 + }, + { + "epoch": 5.632075471698113, + "grad_norm": 0.12726683914661407, + "learning_rate": 1.9205914202323506e-05, + "loss": 0.1733, + "mean_token_accuracy": 0.9135202765464783, + "step": 1791 + }, + { + "epoch": 5.635220125786163, + "grad_norm": 0.13158226013183594, + "learning_rate": 1.9187801766797042e-05, + "loss": 0.1799, + "mean_token_accuracy": 0.9105492830276489, + "step": 1792 + }, + { + "epoch": 5.638364779874214, + "grad_norm": 0.1274365484714508, + "learning_rate": 1.9169692249482992e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9198361039161682, + "step": 1793 + }, + { + "epoch": 5.6415094339622645, + "grad_norm": 0.12876805663108826, + "learning_rate": 1.915158566917356e-05, + "loss": 0.1762, + "mean_token_accuracy": 0.9127059578895569, + "step": 1794 + }, + { + "epoch": 5.644654088050315, + "grad_norm": 0.12105433642864227, + "learning_rate": 1.9133482044657904e-05, + "loss": 0.17, + "mean_token_accuracy": 0.9126316905021667, + "step": 1795 + }, + { + "epoch": 5.647798742138365, + "grad_norm": 0.12377595901489258, + "learning_rate": 1.911538139472213e-05, + "loss": 0.1673, + "mean_token_accuracy": 0.9142662882804871, + "step": 1796 + }, + { + "epoch": 5.650943396226415, + "grad_norm": 0.12388768792152405, + "learning_rate": 1.9097283738149238e-05, + "loss": 0.1675, + "mean_token_accuracy": 0.9142235517501831, + "step": 1797 + }, + { + "epoch": 5.654088050314465, + "grad_norm": 0.12548349797725677, + "learning_rate": 1.9079189093719136e-05, + "loss": 0.1847, + "mean_token_accuracy": 0.9101769328117371, + "step": 1798 + }, + { + "epoch": 5.6572327044025155, + "grad_norm": 0.12843972444534302, + "learning_rate": 1.906109748020859e-05, + "loss": 0.1691, + "mean_token_accuracy": 0.9158428311347961, + "step": 1799 + }, + { + "epoch": 5.660377358490566, + "grad_norm": 0.12968730926513672, + "learning_rate": 1.904300891639124e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9115962386131287, + "step": 1800 + }, + { + "epoch": 5.663522012578616, + "grad_norm": 0.13056190311908722, + "learning_rate": 1.9024923421037548e-05, + "loss": 0.1788, + "mean_token_accuracy": 0.9096761345863342, + "step": 1801 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 0.13141171634197235, + "learning_rate": 1.9006841012914797e-05, + "loss": 0.1787, + "mean_token_accuracy": 0.9131348729133606, + "step": 1802 + }, + { + "epoch": 5.669811320754717, + "grad_norm": 0.14636099338531494, + "learning_rate": 1.8988761710787064e-05, + "loss": 0.1755, + "mean_token_accuracy": 0.910908043384552, + "step": 1803 + }, + { + "epoch": 5.672955974842767, + "grad_norm": 0.13519425690174103, + "learning_rate": 1.8970685533415197e-05, + "loss": 0.1797, + "mean_token_accuracy": 0.9129673838615417, + "step": 1804 + }, + { + "epoch": 5.676100628930818, + "grad_norm": 0.1360684335231781, + "learning_rate": 1.8952612499556825e-05, + "loss": 0.1927, + "mean_token_accuracy": 0.9077761769294739, + "step": 1805 + }, + { + "epoch": 5.679245283018868, + "grad_norm": 0.13671882450580597, + "learning_rate": 1.893454262796629e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.9140453338623047, + "step": 1806 + }, + { + "epoch": 5.682389937106918, + "grad_norm": 0.13327039778232574, + "learning_rate": 1.8916475937394662e-05, + "loss": 0.1719, + "mean_token_accuracy": 0.9152608513832092, + "step": 1807 + }, + { + "epoch": 5.685534591194968, + "grad_norm": 0.14007729291915894, + "learning_rate": 1.8898412446589707e-05, + "loss": 0.1847, + "mean_token_accuracy": 0.9102155566215515, + "step": 1808 + }, + { + "epoch": 5.688679245283019, + "grad_norm": 0.12568777799606323, + "learning_rate": 1.8880352174295872e-05, + "loss": 0.1674, + "mean_token_accuracy": 0.9137527346611023, + "step": 1809 + }, + { + "epoch": 5.6918238993710695, + "grad_norm": 0.12971539795398712, + "learning_rate": 1.886229513925427e-05, + "loss": 0.1766, + "mean_token_accuracy": 0.9112458229064941, + "step": 1810 + }, + { + "epoch": 5.69496855345912, + "grad_norm": 0.12138007581233978, + "learning_rate": 1.8844241360202645e-05, + "loss": 0.1816, + "mean_token_accuracy": 0.9118508696556091, + "step": 1811 + }, + { + "epoch": 5.69811320754717, + "grad_norm": 0.13702593743801117, + "learning_rate": 1.8826190855875367e-05, + "loss": 0.1659, + "mean_token_accuracy": 0.9177764058113098, + "step": 1812 + }, + { + "epoch": 5.70125786163522, + "grad_norm": 0.12681394815444946, + "learning_rate": 1.8808143645003417e-05, + "loss": 0.1936, + "mean_token_accuracy": 0.907345712184906, + "step": 1813 + }, + { + "epoch": 5.70440251572327, + "grad_norm": 0.12529529631137848, + "learning_rate": 1.8790099746314335e-05, + "loss": 0.1643, + "mean_token_accuracy": 0.9167233109474182, + "step": 1814 + }, + { + "epoch": 5.7075471698113205, + "grad_norm": 0.12051253020763397, + "learning_rate": 1.877205917853225e-05, + "loss": 0.1693, + "mean_token_accuracy": 0.9154717326164246, + "step": 1815 + }, + { + "epoch": 5.710691823899371, + "grad_norm": 0.1212969496846199, + "learning_rate": 1.8754021960377812e-05, + "loss": 0.1735, + "mean_token_accuracy": 0.9160333275794983, + "step": 1816 + }, + { + "epoch": 5.713836477987421, + "grad_norm": 0.12194069474935532, + "learning_rate": 1.8735988110568214e-05, + "loss": 0.1646, + "mean_token_accuracy": 0.9180842041969299, + "step": 1817 + }, + { + "epoch": 5.716981132075472, + "grad_norm": 0.12107089906930923, + "learning_rate": 1.8717957647817142e-05, + "loss": 0.1635, + "mean_token_accuracy": 0.9168097972869873, + "step": 1818 + }, + { + "epoch": 5.720125786163522, + "grad_norm": 0.1317150741815567, + "learning_rate": 1.8699930590834774e-05, + "loss": 0.1687, + "mean_token_accuracy": 0.9154680967330933, + "step": 1819 + }, + { + "epoch": 5.723270440251572, + "grad_norm": 0.12018724530935287, + "learning_rate": 1.8681906958327748e-05, + "loss": 0.1571, + "mean_token_accuracy": 0.9167401194572449, + "step": 1820 + }, + { + "epoch": 5.726415094339623, + "grad_norm": 0.12499828636646271, + "learning_rate": 1.8663886768999144e-05, + "loss": 0.1754, + "mean_token_accuracy": 0.9121738076210022, + "step": 1821 + }, + { + "epoch": 5.729559748427673, + "grad_norm": 0.1199810653924942, + "learning_rate": 1.864587004154849e-05, + "loss": 0.158, + "mean_token_accuracy": 0.9177903532981873, + "step": 1822 + }, + { + "epoch": 5.732704402515723, + "grad_norm": 0.12288343161344528, + "learning_rate": 1.86278567946717e-05, + "loss": 0.18, + "mean_token_accuracy": 0.911756694316864, + "step": 1823 + }, + { + "epoch": 5.735849056603773, + "grad_norm": 0.11919686943292618, + "learning_rate": 1.860984704706108e-05, + "loss": 0.1709, + "mean_token_accuracy": 0.9164305925369263, + "step": 1824 + }, + { + "epoch": 5.738993710691824, + "grad_norm": 0.12749914824962616, + "learning_rate": 1.8591840817405317e-05, + "loss": 0.1828, + "mean_token_accuracy": 0.9121153950691223, + "step": 1825 + }, + { + "epoch": 5.7421383647798745, + "grad_norm": 0.11674331873655319, + "learning_rate": 1.8573838124389433e-05, + "loss": 0.1784, + "mean_token_accuracy": 0.9080796837806702, + "step": 1826 + }, + { + "epoch": 5.745283018867925, + "grad_norm": 0.12508632242679596, + "learning_rate": 1.855583898669479e-05, + "loss": 0.1792, + "mean_token_accuracy": 0.9131340980529785, + "step": 1827 + }, + { + "epoch": 5.748427672955975, + "grad_norm": 0.12435121834278107, + "learning_rate": 1.8537843422999057e-05, + "loss": 0.171, + "mean_token_accuracy": 0.9142288565635681, + "step": 1828 + }, + { + "epoch": 5.751572327044025, + "grad_norm": 0.12231216579675674, + "learning_rate": 1.8519851451976182e-05, + "loss": 0.1597, + "mean_token_accuracy": 0.9187856316566467, + "step": 1829 + }, + { + "epoch": 5.754716981132075, + "grad_norm": 0.1260603815317154, + "learning_rate": 1.8501863092296415e-05, + "loss": 0.18, + "mean_token_accuracy": 0.9122654795646667, + "step": 1830 + }, + { + "epoch": 5.7578616352201255, + "grad_norm": 0.12207242846488953, + "learning_rate": 1.8483878362626227e-05, + "loss": 0.1543, + "mean_token_accuracy": 0.9220340251922607, + "step": 1831 + }, + { + "epoch": 5.761006289308176, + "grad_norm": 0.12331138551235199, + "learning_rate": 1.8465897281628345e-05, + "loss": 0.1837, + "mean_token_accuracy": 0.9105554223060608, + "step": 1832 + }, + { + "epoch": 5.764150943396227, + "grad_norm": 0.1287182718515396, + "learning_rate": 1.8447919867961697e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9220572710037231, + "step": 1833 + }, + { + "epoch": 5.767295597484277, + "grad_norm": 0.1238393783569336, + "learning_rate": 1.84299461402814e-05, + "loss": 0.1662, + "mean_token_accuracy": 0.915483295917511, + "step": 1834 + }, + { + "epoch": 5.770440251572327, + "grad_norm": 0.12729449570178986, + "learning_rate": 1.8411976117238772e-05, + "loss": 0.1559, + "mean_token_accuracy": 0.9172618985176086, + "step": 1835 + }, + { + "epoch": 5.773584905660377, + "grad_norm": 0.1212867870926857, + "learning_rate": 1.8394009817481257e-05, + "loss": 0.183, + "mean_token_accuracy": 0.9098618030548096, + "step": 1836 + }, + { + "epoch": 5.776729559748428, + "grad_norm": 0.1259167343378067, + "learning_rate": 1.8376047259652448e-05, + "loss": 0.1708, + "mean_token_accuracy": 0.9136357307434082, + "step": 1837 + }, + { + "epoch": 5.779874213836478, + "grad_norm": 0.12603263556957245, + "learning_rate": 1.8358088462392057e-05, + "loss": 0.1713, + "mean_token_accuracy": 0.9129566550254822, + "step": 1838 + }, + { + "epoch": 5.783018867924528, + "grad_norm": 0.12520869076251984, + "learning_rate": 1.8340133444335894e-05, + "loss": 0.1777, + "mean_token_accuracy": 0.9123414158821106, + "step": 1839 + }, + { + "epoch": 5.786163522012579, + "grad_norm": 0.1280491203069687, + "learning_rate": 1.832218222411584e-05, + "loss": 0.161, + "mean_token_accuracy": 0.9205901026725769, + "step": 1840 + }, + { + "epoch": 5.789308176100629, + "grad_norm": 0.12988995015621185, + "learning_rate": 1.8304234820359852e-05, + "loss": 0.1622, + "mean_token_accuracy": 0.9184454679489136, + "step": 1841 + }, + { + "epoch": 5.7924528301886795, + "grad_norm": 0.12439626455307007, + "learning_rate": 1.8286291251691897e-05, + "loss": 0.1705, + "mean_token_accuracy": 0.9161950945854187, + "step": 1842 + }, + { + "epoch": 5.79559748427673, + "grad_norm": 0.13772748410701752, + "learning_rate": 1.826835153673199e-05, + "loss": 0.1752, + "mean_token_accuracy": 0.9136615991592407, + "step": 1843 + }, + { + "epoch": 5.79874213836478, + "grad_norm": 0.1208910346031189, + "learning_rate": 1.825041569409614e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9119904637336731, + "step": 1844 + }, + { + "epoch": 5.80188679245283, + "grad_norm": 0.1381557136774063, + "learning_rate": 1.8232483742396327e-05, + "loss": 0.1734, + "mean_token_accuracy": 0.9145846962928772, + "step": 1845 + }, + { + "epoch": 5.80503144654088, + "grad_norm": 0.13252446055412292, + "learning_rate": 1.8214555700240498e-05, + "loss": 0.1712, + "mean_token_accuracy": 0.9147006869316101, + "step": 1846 + }, + { + "epoch": 5.8081761006289305, + "grad_norm": 0.1245056763291359, + "learning_rate": 1.819663158623256e-05, + "loss": 0.1695, + "mean_token_accuracy": 0.9146434664726257, + "step": 1847 + }, + { + "epoch": 5.811320754716981, + "grad_norm": 0.1266990602016449, + "learning_rate": 1.817871141897232e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.9120575785636902, + "step": 1848 + }, + { + "epoch": 5.814465408805032, + "grad_norm": 0.12160525470972061, + "learning_rate": 1.8160795217055496e-05, + "loss": 0.1824, + "mean_token_accuracy": 0.9122020602226257, + "step": 1849 + }, + { + "epoch": 5.817610062893082, + "grad_norm": 0.127620130777359, + "learning_rate": 1.8142882999073704e-05, + "loss": 0.1719, + "mean_token_accuracy": 0.915385901927948, + "step": 1850 + }, + { + "epoch": 5.820754716981132, + "grad_norm": 0.11782664060592651, + "learning_rate": 1.8124974783614414e-05, + "loss": 0.1701, + "mean_token_accuracy": 0.9131607413291931, + "step": 1851 + }, + { + "epoch": 5.823899371069182, + "grad_norm": 0.13343071937561035, + "learning_rate": 1.8107070589260943e-05, + "loss": 0.1533, + "mean_token_accuracy": 0.9223355650901794, + "step": 1852 + }, + { + "epoch": 5.827044025157233, + "grad_norm": 0.11941219866275787, + "learning_rate": 1.808917043459243e-05, + "loss": 0.1745, + "mean_token_accuracy": 0.9113269448280334, + "step": 1853 + }, + { + "epoch": 5.830188679245283, + "grad_norm": 0.11961886286735535, + "learning_rate": 1.807127433818384e-05, + "loss": 0.1745, + "mean_token_accuracy": 0.9124999642372131, + "step": 1854 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.12838539481163025, + "learning_rate": 1.8053382318605907e-05, + "loss": 0.1952, + "mean_token_accuracy": 0.9072304964065552, + "step": 1855 + }, + { + "epoch": 5.836477987421384, + "grad_norm": 0.11784368008375168, + "learning_rate": 1.8035494394425158e-05, + "loss": 0.1655, + "mean_token_accuracy": 0.9166972041130066, + "step": 1856 + }, + { + "epoch": 5.839622641509434, + "grad_norm": 0.12901045382022858, + "learning_rate": 1.801761058420384e-05, + "loss": 0.1855, + "mean_token_accuracy": 0.9092808365821838, + "step": 1857 + }, + { + "epoch": 5.8427672955974845, + "grad_norm": 0.11686600744724274, + "learning_rate": 1.799973090649995e-05, + "loss": 0.1754, + "mean_token_accuracy": 0.9145554900169373, + "step": 1858 + }, + { + "epoch": 5.845911949685535, + "grad_norm": 0.12752406299114227, + "learning_rate": 1.7981855379867204e-05, + "loss": 0.1793, + "mean_token_accuracy": 0.913017213344574, + "step": 1859 + }, + { + "epoch": 5.849056603773585, + "grad_norm": 0.12086496502161026, + "learning_rate": 1.796398402285499e-05, + "loss": 0.1873, + "mean_token_accuracy": 0.9081063866615295, + "step": 1860 + }, + { + "epoch": 5.852201257861635, + "grad_norm": 0.1289493292570114, + "learning_rate": 1.7946116854008383e-05, + "loss": 0.1715, + "mean_token_accuracy": 0.9141929149627686, + "step": 1861 + }, + { + "epoch": 5.855345911949685, + "grad_norm": 0.12896886467933655, + "learning_rate": 1.7928253891868103e-05, + "loss": 0.174, + "mean_token_accuracy": 0.9146431684494019, + "step": 1862 + }, + { + "epoch": 5.8584905660377355, + "grad_norm": 0.1219131350517273, + "learning_rate": 1.7910395154970505e-05, + "loss": 0.1889, + "mean_token_accuracy": 0.9074649214744568, + "step": 1863 + }, + { + "epoch": 5.861635220125786, + "grad_norm": 0.13309486210346222, + "learning_rate": 1.789254066184758e-05, + "loss": 0.1715, + "mean_token_accuracy": 0.9138280749320984, + "step": 1864 + }, + { + "epoch": 5.864779874213837, + "grad_norm": 0.11922544986009598, + "learning_rate": 1.7874690431026887e-05, + "loss": 0.1702, + "mean_token_accuracy": 0.9120256304740906, + "step": 1865 + }, + { + "epoch": 5.867924528301887, + "grad_norm": 0.12653562426567078, + "learning_rate": 1.7856844481031585e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9097647666931152, + "step": 1866 + }, + { + "epoch": 5.871069182389937, + "grad_norm": 0.12282025814056396, + "learning_rate": 1.7839002830380366e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.9133232831954956, + "step": 1867 + }, + { + "epoch": 5.8742138364779874, + "grad_norm": 0.12195201963186264, + "learning_rate": 1.7821165497587488e-05, + "loss": 0.1622, + "mean_token_accuracy": 0.9166160821914673, + "step": 1868 + }, + { + "epoch": 5.877358490566038, + "grad_norm": 0.12466923892498016, + "learning_rate": 1.780333250116271e-05, + "loss": 0.1641, + "mean_token_accuracy": 0.9177669286727905, + "step": 1869 + }, + { + "epoch": 5.880503144654088, + "grad_norm": 0.13061249256134033, + "learning_rate": 1.778550385961129e-05, + "loss": 0.1789, + "mean_token_accuracy": 0.913748562335968, + "step": 1870 + }, + { + "epoch": 5.883647798742138, + "grad_norm": 0.12507785856723785, + "learning_rate": 1.7767679591433984e-05, + "loss": 0.1836, + "mean_token_accuracy": 0.9113100171089172, + "step": 1871 + }, + { + "epoch": 5.886792452830189, + "grad_norm": 0.13062918186187744, + "learning_rate": 1.774985971512699e-05, + "loss": 0.1872, + "mean_token_accuracy": 0.9101560115814209, + "step": 1872 + }, + { + "epoch": 5.889937106918239, + "grad_norm": 0.12669727206230164, + "learning_rate": 1.773204424918196e-05, + "loss": 0.1863, + "mean_token_accuracy": 0.9074638485908508, + "step": 1873 + }, + { + "epoch": 5.8930817610062896, + "grad_norm": 0.13299503922462463, + "learning_rate": 1.771423321208597e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9125397205352783, + "step": 1874 + }, + { + "epoch": 5.89622641509434, + "grad_norm": 0.13146135210990906, + "learning_rate": 1.7696426622321503e-05, + "loss": 0.1784, + "mean_token_accuracy": 0.9100609421730042, + "step": 1875 + }, + { + "epoch": 5.89937106918239, + "grad_norm": 0.13344912230968475, + "learning_rate": 1.7678624498366406e-05, + "loss": 0.1886, + "mean_token_accuracy": 0.9081234931945801, + "step": 1876 + }, + { + "epoch": 5.90251572327044, + "grad_norm": 0.13202603161334991, + "learning_rate": 1.7660826858693914e-05, + "loss": 0.1778, + "mean_token_accuracy": 0.9113733768463135, + "step": 1877 + }, + { + "epoch": 5.90566037735849, + "grad_norm": 0.12540169060230255, + "learning_rate": 1.7643033721772608e-05, + "loss": 0.1765, + "mean_token_accuracy": 0.912645697593689, + "step": 1878 + }, + { + "epoch": 5.908805031446541, + "grad_norm": 0.13628825545310974, + "learning_rate": 1.7625245106066372e-05, + "loss": 0.1614, + "mean_token_accuracy": 0.917786717414856, + "step": 1879 + }, + { + "epoch": 5.911949685534591, + "grad_norm": 0.12427868694067001, + "learning_rate": 1.7607461030034437e-05, + "loss": 0.1746, + "mean_token_accuracy": 0.9142725467681885, + "step": 1880 + }, + { + "epoch": 5.915094339622642, + "grad_norm": 0.13920536637306213, + "learning_rate": 1.7589681512131295e-05, + "loss": 0.1803, + "mean_token_accuracy": 0.9117159247398376, + "step": 1881 + }, + { + "epoch": 5.918238993710692, + "grad_norm": 0.13377030193805695, + "learning_rate": 1.7571906570806718e-05, + "loss": 0.184, + "mean_token_accuracy": 0.9084151983261108, + "step": 1882 + }, + { + "epoch": 5.921383647798742, + "grad_norm": 0.12509870529174805, + "learning_rate": 1.755413622450572e-05, + "loss": 0.1859, + "mean_token_accuracy": 0.907892107963562, + "step": 1883 + }, + { + "epoch": 5.9245283018867925, + "grad_norm": 0.13371077179908752, + "learning_rate": 1.7536370491668556e-05, + "loss": 0.175, + "mean_token_accuracy": 0.9129257798194885, + "step": 1884 + }, + { + "epoch": 5.927672955974843, + "grad_norm": 0.12641875445842743, + "learning_rate": 1.7518609390730694e-05, + "loss": 0.1706, + "mean_token_accuracy": 0.9163611531257629, + "step": 1885 + }, + { + "epoch": 5.930817610062893, + "grad_norm": 0.1306959092617035, + "learning_rate": 1.7500852940122796e-05, + "loss": 0.1617, + "mean_token_accuracy": 0.9185019731521606, + "step": 1886 + }, + { + "epoch": 5.933962264150943, + "grad_norm": 0.1335182934999466, + "learning_rate": 1.7483101158270683e-05, + "loss": 0.185, + "mean_token_accuracy": 0.9097692966461182, + "step": 1887 + }, + { + "epoch": 5.937106918238994, + "grad_norm": 0.12307420372962952, + "learning_rate": 1.7465354063595354e-05, + "loss": 0.1744, + "mean_token_accuracy": 0.9118009209632874, + "step": 1888 + }, + { + "epoch": 5.940251572327044, + "grad_norm": 0.13397091627120972, + "learning_rate": 1.744761167451292e-05, + "loss": 0.1726, + "mean_token_accuracy": 0.9162708520889282, + "step": 1889 + }, + { + "epoch": 5.943396226415095, + "grad_norm": 0.1265205293893814, + "learning_rate": 1.7429874009434642e-05, + "loss": 0.1685, + "mean_token_accuracy": 0.9149485230445862, + "step": 1890 + }, + { + "epoch": 5.946540880503145, + "grad_norm": 0.12802155315876007, + "learning_rate": 1.741214108676684e-05, + "loss": 0.1845, + "mean_token_accuracy": 0.9122422337532043, + "step": 1891 + }, + { + "epoch": 5.949685534591195, + "grad_norm": 0.12957172095775604, + "learning_rate": 1.7394412924910946e-05, + "loss": 0.1953, + "mean_token_accuracy": 0.9067781567573547, + "step": 1892 + }, + { + "epoch": 5.952830188679245, + "grad_norm": 0.1264718621969223, + "learning_rate": 1.7376689542263424e-05, + "loss": 0.1666, + "mean_token_accuracy": 0.9175181984901428, + "step": 1893 + }, + { + "epoch": 5.955974842767295, + "grad_norm": 0.12541256844997406, + "learning_rate": 1.7358970957215805e-05, + "loss": 0.1789, + "mean_token_accuracy": 0.9094677567481995, + "step": 1894 + }, + { + "epoch": 5.959119496855346, + "grad_norm": 0.12028246372938156, + "learning_rate": 1.7341257188154625e-05, + "loss": 0.1975, + "mean_token_accuracy": 0.9086244106292725, + "step": 1895 + }, + { + "epoch": 5.962264150943396, + "grad_norm": 0.13436433672904968, + "learning_rate": 1.7323548253461425e-05, + "loss": 0.1808, + "mean_token_accuracy": 0.9111320972442627, + "step": 1896 + }, + { + "epoch": 5.965408805031447, + "grad_norm": 0.11996529251337051, + "learning_rate": 1.730584417151273e-05, + "loss": 0.1758, + "mean_token_accuracy": 0.9111454486846924, + "step": 1897 + }, + { + "epoch": 5.968553459119497, + "grad_norm": 0.12315444648265839, + "learning_rate": 1.728814496068003e-05, + "loss": 0.1925, + "mean_token_accuracy": 0.9099141955375671, + "step": 1898 + }, + { + "epoch": 5.971698113207547, + "grad_norm": 0.12306390702724457, + "learning_rate": 1.7270450639329762e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9125403761863708, + "step": 1899 + }, + { + "epoch": 5.9748427672955975, + "grad_norm": 0.1273583173751831, + "learning_rate": 1.725276122582329e-05, + "loss": 0.1891, + "mean_token_accuracy": 0.9100738167762756, + "step": 1900 + }, + { + "epoch": 5.977987421383648, + "grad_norm": 0.12477988004684448, + "learning_rate": 1.723507673851687e-05, + "loss": 0.1697, + "mean_token_accuracy": 0.9162830710411072, + "step": 1901 + }, + { + "epoch": 5.981132075471698, + "grad_norm": 0.1281619518995285, + "learning_rate": 1.721739719576167e-05, + "loss": 0.1838, + "mean_token_accuracy": 0.9113216996192932, + "step": 1902 + }, + { + "epoch": 5.984276729559748, + "grad_norm": 0.12437441945075989, + "learning_rate": 1.7199722615903722e-05, + "loss": 0.1768, + "mean_token_accuracy": 0.9131054878234863, + "step": 1903 + }, + { + "epoch": 5.987421383647799, + "grad_norm": 0.12587052583694458, + "learning_rate": 1.7182053017283884e-05, + "loss": 0.1914, + "mean_token_accuracy": 0.9063124656677246, + "step": 1904 + }, + { + "epoch": 5.990566037735849, + "grad_norm": 0.1270790845155716, + "learning_rate": 1.7164388418237878e-05, + "loss": 0.1814, + "mean_token_accuracy": 0.9116953015327454, + "step": 1905 + }, + { + "epoch": 5.9937106918239, + "grad_norm": 0.11693906038999557, + "learning_rate": 1.714672883709621e-05, + "loss": 0.1712, + "mean_token_accuracy": 0.9138330817222595, + "step": 1906 + }, + { + "epoch": 5.99685534591195, + "grad_norm": 0.12291273474693298, + "learning_rate": 1.71290742921842e-05, + "loss": 0.1838, + "mean_token_accuracy": 0.9125950932502747, + "step": 1907 + }, + { + "epoch": 6.0, + "grad_norm": 0.12398134917020798, + "learning_rate": 1.7111424801821933e-05, + "loss": 0.1596, + "mean_token_accuracy": 0.9150497913360596, + "step": 1908 + } + ], + "logging_steps": 1, + "max_steps": 3180, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.92346119564768e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}