{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 1908, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031446540880503146, "grad_norm": 1.9843024015426636, "learning_rate": 4.1666666666666667e-07, "loss": 0.9991, "mean_token_accuracy": 0.7260034680366516, "step": 1 }, { "epoch": 0.006289308176100629, "grad_norm": 1.9983922243118286, "learning_rate": 8.333333333333333e-07, "loss": 0.9796, "mean_token_accuracy": 0.7287212610244751, "step": 2 }, { "epoch": 0.009433962264150943, "grad_norm": 2.008734941482544, "learning_rate": 1.25e-06, "loss": 0.9978, "mean_token_accuracy": 0.7293687462806702, "step": 3 }, { "epoch": 0.012578616352201259, "grad_norm": 1.9662363529205322, "learning_rate": 1.6666666666666667e-06, "loss": 0.995, "mean_token_accuracy": 0.7301269769668579, "step": 4 }, { "epoch": 0.015723270440251572, "grad_norm": 1.9593746662139893, "learning_rate": 2.0833333333333334e-06, "loss": 1.0211, "mean_token_accuracy": 0.7250465154647827, "step": 5 }, { "epoch": 0.018867924528301886, "grad_norm": 1.769201636314392, "learning_rate": 2.5e-06, "loss": 0.9955, "mean_token_accuracy": 0.7267187833786011, "step": 6 }, { "epoch": 0.0220125786163522, "grad_norm": 1.6486475467681885, "learning_rate": 2.916666666666667e-06, "loss": 0.9717, "mean_token_accuracy": 0.7299320101737976, "step": 7 }, { "epoch": 0.025157232704402517, "grad_norm": 1.1724086999893188, "learning_rate": 3.3333333333333333e-06, "loss": 0.9318, "mean_token_accuracy": 0.7348803877830505, "step": 8 }, { "epoch": 0.02830188679245283, "grad_norm": 1.0883439779281616, "learning_rate": 3.7500000000000005e-06, "loss": 0.9175, "mean_token_accuracy": 0.7360632419586182, "step": 9 }, { "epoch": 0.031446540880503145, "grad_norm": 1.0831983089447021, "learning_rate": 4.166666666666667e-06, "loss": 0.9115, "mean_token_accuracy": 0.7367658019065857, "step": 10 }, { "epoch": 0.03459119496855346, "grad_norm": 0.9518892765045166, "learning_rate": 4.583333333333333e-06, "loss": 0.891, "mean_token_accuracy": 0.7373512387275696, "step": 11 }, { "epoch": 0.03773584905660377, "grad_norm": 0.9544086456298828, "learning_rate": 5e-06, "loss": 0.904, "mean_token_accuracy": 0.7343592047691345, "step": 12 }, { "epoch": 0.040880503144654086, "grad_norm": 0.8863177299499512, "learning_rate": 5.416666666666667e-06, "loss": 0.8712, "mean_token_accuracy": 0.7427428364753723, "step": 13 }, { "epoch": 0.0440251572327044, "grad_norm": 0.803634524345398, "learning_rate": 5.833333333333334e-06, "loss": 0.8434, "mean_token_accuracy": 0.7462121844291687, "step": 14 }, { "epoch": 0.04716981132075472, "grad_norm": 0.9318463802337646, "learning_rate": 6.25e-06, "loss": 0.8115, "mean_token_accuracy": 0.7500566244125366, "step": 15 }, { "epoch": 0.050314465408805034, "grad_norm": 0.888688325881958, "learning_rate": 6.666666666666667e-06, "loss": 0.8099, "mean_token_accuracy": 0.7527286410331726, "step": 16 }, { "epoch": 0.05345911949685535, "grad_norm": 0.6901258230209351, "learning_rate": 7.083333333333335e-06, "loss": 0.7951, "mean_token_accuracy": 0.7522338032722473, "step": 17 }, { "epoch": 0.05660377358490566, "grad_norm": 0.5095185041427612, "learning_rate": 7.500000000000001e-06, "loss": 0.7702, "mean_token_accuracy": 0.7567635774612427, "step": 18 }, { "epoch": 0.059748427672955975, "grad_norm": 0.41788381338119507, "learning_rate": 7.916666666666667e-06, "loss": 0.7482, "mean_token_accuracy": 0.7630857825279236, "step": 19 }, { "epoch": 0.06289308176100629, "grad_norm": 0.49949783086776733, "learning_rate": 8.333333333333334e-06, "loss": 0.7309, "mean_token_accuracy": 0.7646651864051819, "step": 20 }, { "epoch": 0.0660377358490566, "grad_norm": 0.4411962032318115, "learning_rate": 8.750000000000001e-06, "loss": 0.7133, "mean_token_accuracy": 0.7669786810874939, "step": 21 }, { "epoch": 0.06918238993710692, "grad_norm": 0.381130188703537, "learning_rate": 9.166666666666666e-06, "loss": 0.7034, "mean_token_accuracy": 0.7698013782501221, "step": 22 }, { "epoch": 0.07232704402515723, "grad_norm": 0.32305705547332764, "learning_rate": 9.583333333333335e-06, "loss": 0.6969, "mean_token_accuracy": 0.7718903422355652, "step": 23 }, { "epoch": 0.07547169811320754, "grad_norm": 0.3746313154697418, "learning_rate": 1e-05, "loss": 0.6906, "mean_token_accuracy": 0.7724348306655884, "step": 24 }, { "epoch": 0.07861635220125786, "grad_norm": 0.378999263048172, "learning_rate": 1.0416666666666668e-05, "loss": 0.6826, "mean_token_accuracy": 0.7719895243644714, "step": 25 }, { "epoch": 0.08176100628930817, "grad_norm": 0.35218483209609985, "learning_rate": 1.0833333333333334e-05, "loss": 0.675, "mean_token_accuracy": 0.7760418057441711, "step": 26 }, { "epoch": 0.08490566037735849, "grad_norm": 0.3063074052333832, "learning_rate": 1.125e-05, "loss": 0.6707, "mean_token_accuracy": 0.7769221067428589, "step": 27 }, { "epoch": 0.0880503144654088, "grad_norm": 0.3150438070297241, "learning_rate": 1.1666666666666668e-05, "loss": 0.6542, "mean_token_accuracy": 0.7808550000190735, "step": 28 }, { "epoch": 0.09119496855345911, "grad_norm": 0.2937377393245697, "learning_rate": 1.2083333333333333e-05, "loss": 0.6654, "mean_token_accuracy": 0.7757168412208557, "step": 29 }, { "epoch": 0.09433962264150944, "grad_norm": 0.2754563093185425, "learning_rate": 1.25e-05, "loss": 0.6553, "mean_token_accuracy": 0.7805117964744568, "step": 30 }, { "epoch": 0.09748427672955975, "grad_norm": 0.2745518386363983, "learning_rate": 1.2916666666666668e-05, "loss": 0.6383, "mean_token_accuracy": 0.7834630012512207, "step": 31 }, { "epoch": 0.10062893081761007, "grad_norm": 0.2507205307483673, "learning_rate": 1.3333333333333333e-05, "loss": 0.6382, "mean_token_accuracy": 0.7827038764953613, "step": 32 }, { "epoch": 0.10377358490566038, "grad_norm": 0.26597172021865845, "learning_rate": 1.375e-05, "loss": 0.644, "mean_token_accuracy": 0.7815650105476379, "step": 33 }, { "epoch": 0.1069182389937107, "grad_norm": 0.2784893810749054, "learning_rate": 1.416666666666667e-05, "loss": 0.6175, "mean_token_accuracy": 0.7878676056861877, "step": 34 }, { "epoch": 0.11006289308176101, "grad_norm": 0.2109018713235855, "learning_rate": 1.4583333333333333e-05, "loss": 0.6174, "mean_token_accuracy": 0.7876963019371033, "step": 35 }, { "epoch": 0.11320754716981132, "grad_norm": 0.2487485110759735, "learning_rate": 1.5000000000000002e-05, "loss": 0.6112, "mean_token_accuracy": 0.7883297204971313, "step": 36 }, { "epoch": 0.11635220125786164, "grad_norm": 0.21007870137691498, "learning_rate": 1.5416666666666668e-05, "loss": 0.6243, "mean_token_accuracy": 0.7878017425537109, "step": 37 }, { "epoch": 0.11949685534591195, "grad_norm": 0.19941003620624542, "learning_rate": 1.5833333333333333e-05, "loss": 0.6225, "mean_token_accuracy": 0.7866714000701904, "step": 38 }, { "epoch": 0.12264150943396226, "grad_norm": 0.22181154787540436, "learning_rate": 1.6250000000000002e-05, "loss": 0.6172, "mean_token_accuracy": 0.7870483994483948, "step": 39 }, { "epoch": 0.12578616352201258, "grad_norm": 0.21963359415531158, "learning_rate": 1.6666666666666667e-05, "loss": 0.6145, "mean_token_accuracy": 0.7886669039726257, "step": 40 }, { "epoch": 0.1289308176100629, "grad_norm": 0.19179226458072662, "learning_rate": 1.7083333333333333e-05, "loss": 0.6039, "mean_token_accuracy": 0.7911710739135742, "step": 41 }, { "epoch": 0.1320754716981132, "grad_norm": 0.16477134823799133, "learning_rate": 1.7500000000000002e-05, "loss": 0.6086, "mean_token_accuracy": 0.7912865281105042, "step": 42 }, { "epoch": 0.13522012578616352, "grad_norm": 0.22841010987758636, "learning_rate": 1.7916666666666667e-05, "loss": 0.6055, "mean_token_accuracy": 0.789507269859314, "step": 43 }, { "epoch": 0.13836477987421383, "grad_norm": 0.1824677586555481, "learning_rate": 1.8333333333333333e-05, "loss": 0.6083, "mean_token_accuracy": 0.7901920080184937, "step": 44 }, { "epoch": 0.14150943396226415, "grad_norm": 0.2477484941482544, "learning_rate": 1.8750000000000002e-05, "loss": 0.5883, "mean_token_accuracy": 0.7944508790969849, "step": 45 }, { "epoch": 0.14465408805031446, "grad_norm": 0.21475744247436523, "learning_rate": 1.916666666666667e-05, "loss": 0.6063, "mean_token_accuracy": 0.793271005153656, "step": 46 }, { "epoch": 0.14779874213836477, "grad_norm": 0.23494695127010345, "learning_rate": 1.9583333333333333e-05, "loss": 0.6035, "mean_token_accuracy": 0.790216326713562, "step": 47 }, { "epoch": 0.1509433962264151, "grad_norm": 0.19835183024406433, "learning_rate": 2e-05, "loss": 0.5912, "mean_token_accuracy": 0.7936276197433472, "step": 48 }, { "epoch": 0.1540880503144654, "grad_norm": 0.20389200747013092, "learning_rate": 2.0416666666666667e-05, "loss": 0.5883, "mean_token_accuracy": 0.7947663068771362, "step": 49 }, { "epoch": 0.15723270440251572, "grad_norm": 0.25244179368019104, "learning_rate": 2.0833333333333336e-05, "loss": 0.593, "mean_token_accuracy": 0.7952125072479248, "step": 50 }, { "epoch": 0.16037735849056603, "grad_norm": 0.18739324808120728, "learning_rate": 2.125e-05, "loss": 0.5892, "mean_token_accuracy": 0.7951387763023376, "step": 51 }, { "epoch": 0.16352201257861634, "grad_norm": 0.2114628702402115, "learning_rate": 2.1666666666666667e-05, "loss": 0.5838, "mean_token_accuracy": 0.7957105040550232, "step": 52 }, { "epoch": 0.16666666666666666, "grad_norm": 0.2226199507713318, "learning_rate": 2.2083333333333336e-05, "loss": 0.5943, "mean_token_accuracy": 0.7944356799125671, "step": 53 }, { "epoch": 0.16981132075471697, "grad_norm": 0.19234661757946014, "learning_rate": 2.25e-05, "loss": 0.5899, "mean_token_accuracy": 0.7960485219955444, "step": 54 }, { "epoch": 0.17295597484276728, "grad_norm": 0.25666317343711853, "learning_rate": 2.2916666666666667e-05, "loss": 0.5811, "mean_token_accuracy": 0.7957990169525146, "step": 55 }, { "epoch": 0.1761006289308176, "grad_norm": 0.2783811390399933, "learning_rate": 2.3333333333333336e-05, "loss": 0.5946, "mean_token_accuracy": 0.7940381169319153, "step": 56 }, { "epoch": 0.1792452830188679, "grad_norm": 0.20457403361797333, "learning_rate": 2.375e-05, "loss": 0.5816, "mean_token_accuracy": 0.7978816628456116, "step": 57 }, { "epoch": 0.18238993710691823, "grad_norm": 0.3623892068862915, "learning_rate": 2.4166666666666667e-05, "loss": 0.5802, "mean_token_accuracy": 0.7961991429328918, "step": 58 }, { "epoch": 0.18553459119496854, "grad_norm": 0.24155688285827637, "learning_rate": 2.4583333333333336e-05, "loss": 0.5683, "mean_token_accuracy": 0.7976905703544617, "step": 59 }, { "epoch": 0.18867924528301888, "grad_norm": 0.377756804227829, "learning_rate": 2.5e-05, "loss": 0.5762, "mean_token_accuracy": 0.8002050518989563, "step": 60 }, { "epoch": 0.1918238993710692, "grad_norm": 0.3166685104370117, "learning_rate": 2.5416666666666667e-05, "loss": 0.584, "mean_token_accuracy": 0.7962762713432312, "step": 61 }, { "epoch": 0.1949685534591195, "grad_norm": 0.29503804445266724, "learning_rate": 2.5833333333333336e-05, "loss": 0.5784, "mean_token_accuracy": 0.7990941405296326, "step": 62 }, { "epoch": 0.19811320754716982, "grad_norm": 0.3305257558822632, "learning_rate": 2.625e-05, "loss": 0.5846, "mean_token_accuracy": 0.7966292500495911, "step": 63 }, { "epoch": 0.20125786163522014, "grad_norm": 0.24742531776428223, "learning_rate": 2.6666666666666667e-05, "loss": 0.5732, "mean_token_accuracy": 0.7980184555053711, "step": 64 }, { "epoch": 0.20440251572327045, "grad_norm": 0.2809283137321472, "learning_rate": 2.7083333333333335e-05, "loss": 0.5749, "mean_token_accuracy": 0.7999182939529419, "step": 65 }, { "epoch": 0.20754716981132076, "grad_norm": 0.1994399130344391, "learning_rate": 2.75e-05, "loss": 0.5731, "mean_token_accuracy": 0.7985623478889465, "step": 66 }, { "epoch": 0.21069182389937108, "grad_norm": 0.28836458921432495, "learning_rate": 2.7916666666666666e-05, "loss": 0.5596, "mean_token_accuracy": 0.8041678071022034, "step": 67 }, { "epoch": 0.2138364779874214, "grad_norm": 0.25330910086631775, "learning_rate": 2.833333333333334e-05, "loss": 0.5756, "mean_token_accuracy": 0.7999730110168457, "step": 68 }, { "epoch": 0.2169811320754717, "grad_norm": 0.6575194001197815, "learning_rate": 2.875e-05, "loss": 0.5811, "mean_token_accuracy": 0.7950687408447266, "step": 69 }, { "epoch": 0.22012578616352202, "grad_norm": 0.318123996257782, "learning_rate": 2.9166666666666666e-05, "loss": 0.5735, "mean_token_accuracy": 0.7979779243469238, "step": 70 }, { "epoch": 0.22327044025157233, "grad_norm": 0.21200031042099, "learning_rate": 2.958333333333334e-05, "loss": 0.5595, "mean_token_accuracy": 0.800320029258728, "step": 71 }, { "epoch": 0.22641509433962265, "grad_norm": 0.3333161771297455, "learning_rate": 3.0000000000000004e-05, "loss": 0.5587, "mean_token_accuracy": 0.8015144467353821, "step": 72 }, { "epoch": 0.22955974842767296, "grad_norm": 0.2528008818626404, "learning_rate": 3.0416666666666666e-05, "loss": 0.5552, "mean_token_accuracy": 0.8024976253509521, "step": 73 }, { "epoch": 0.23270440251572327, "grad_norm": 0.29139530658721924, "learning_rate": 3.0833333333333335e-05, "loss": 0.5737, "mean_token_accuracy": 0.7972533106803894, "step": 74 }, { "epoch": 0.2358490566037736, "grad_norm": 0.31739315390586853, "learning_rate": 3.125e-05, "loss": 0.5744, "mean_token_accuracy": 0.7987572550773621, "step": 75 }, { "epoch": 0.2389937106918239, "grad_norm": 0.20459042489528656, "learning_rate": 3.1666666666666666e-05, "loss": 0.5678, "mean_token_accuracy": 0.7998490929603577, "step": 76 }, { "epoch": 0.24213836477987422, "grad_norm": 0.3210413455963135, "learning_rate": 3.208333333333334e-05, "loss": 0.5679, "mean_token_accuracy": 0.8000871539115906, "step": 77 }, { "epoch": 0.24528301886792453, "grad_norm": 0.2642245292663574, "learning_rate": 3.2500000000000004e-05, "loss": 0.5688, "mean_token_accuracy": 0.8012592792510986, "step": 78 }, { "epoch": 0.24842767295597484, "grad_norm": 0.3026284873485565, "learning_rate": 3.291666666666667e-05, "loss": 0.5601, "mean_token_accuracy": 0.802608847618103, "step": 79 }, { "epoch": 0.25157232704402516, "grad_norm": 0.2567735016345978, "learning_rate": 3.3333333333333335e-05, "loss": 0.5488, "mean_token_accuracy": 0.8038848638534546, "step": 80 }, { "epoch": 0.25471698113207547, "grad_norm": 0.26682281494140625, "learning_rate": 3.375e-05, "loss": 0.5601, "mean_token_accuracy": 0.801401674747467, "step": 81 }, { "epoch": 0.2578616352201258, "grad_norm": 0.30776286125183105, "learning_rate": 3.4166666666666666e-05, "loss": 0.5619, "mean_token_accuracy": 0.802801251411438, "step": 82 }, { "epoch": 0.2610062893081761, "grad_norm": 0.2427256852388382, "learning_rate": 3.458333333333334e-05, "loss": 0.5563, "mean_token_accuracy": 0.80390864610672, "step": 83 }, { "epoch": 0.2641509433962264, "grad_norm": 0.2617015838623047, "learning_rate": 3.5000000000000004e-05, "loss": 0.5626, "mean_token_accuracy": 0.8013002276420593, "step": 84 }, { "epoch": 0.2672955974842767, "grad_norm": 0.34007972478866577, "learning_rate": 3.541666666666667e-05, "loss": 0.5618, "mean_token_accuracy": 0.8023524880409241, "step": 85 }, { "epoch": 0.27044025157232704, "grad_norm": 0.22972999513149261, "learning_rate": 3.5833333333333335e-05, "loss": 0.5605, "mean_token_accuracy": 0.8025376796722412, "step": 86 }, { "epoch": 0.27358490566037735, "grad_norm": 0.37038788199424744, "learning_rate": 3.625e-05, "loss": 0.561, "mean_token_accuracy": 0.8009118437767029, "step": 87 }, { "epoch": 0.27672955974842767, "grad_norm": 0.23523087799549103, "learning_rate": 3.6666666666666666e-05, "loss": 0.5531, "mean_token_accuracy": 0.8030558228492737, "step": 88 }, { "epoch": 0.279874213836478, "grad_norm": 0.3976694941520691, "learning_rate": 3.708333333333334e-05, "loss": 0.5502, "mean_token_accuracy": 0.8050774335861206, "step": 89 }, { "epoch": 0.2830188679245283, "grad_norm": 0.2722809314727783, "learning_rate": 3.7500000000000003e-05, "loss": 0.5506, "mean_token_accuracy": 0.8029981255531311, "step": 90 }, { "epoch": 0.2861635220125786, "grad_norm": 0.4583202004432678, "learning_rate": 3.791666666666667e-05, "loss": 0.5542, "mean_token_accuracy": 0.8037380576133728, "step": 91 }, { "epoch": 0.2893081761006289, "grad_norm": 0.4009253978729248, "learning_rate": 3.833333333333334e-05, "loss": 0.5612, "mean_token_accuracy": 0.7998917698860168, "step": 92 }, { "epoch": 0.29245283018867924, "grad_norm": 0.320212721824646, "learning_rate": 3.875e-05, "loss": 0.5554, "mean_token_accuracy": 0.8016734719276428, "step": 93 }, { "epoch": 0.29559748427672955, "grad_norm": 0.3593039810657501, "learning_rate": 3.9166666666666665e-05, "loss": 0.5599, "mean_token_accuracy": 0.8014413714408875, "step": 94 }, { "epoch": 0.29874213836477986, "grad_norm": 0.2468472719192505, "learning_rate": 3.958333333333334e-05, "loss": 0.5611, "mean_token_accuracy": 0.8008609414100647, "step": 95 }, { "epoch": 0.3018867924528302, "grad_norm": 0.4157026410102844, "learning_rate": 4e-05, "loss": 0.5603, "mean_token_accuracy": 0.801600456237793, "step": 96 }, { "epoch": 0.3050314465408805, "grad_norm": 0.2701054513454437, "learning_rate": 3.9999990660718234e-05, "loss": 0.5551, "mean_token_accuracy": 0.8028652667999268, "step": 97 }, { "epoch": 0.3081761006289308, "grad_norm": 0.38107001781463623, "learning_rate": 3.999996264288261e-05, "loss": 0.5554, "mean_token_accuracy": 0.8040093183517456, "step": 98 }, { "epoch": 0.3113207547169811, "grad_norm": 0.2702213525772095, "learning_rate": 3.999991594652221e-05, "loss": 0.5479, "mean_token_accuracy": 0.8046407699584961, "step": 99 }, { "epoch": 0.31446540880503143, "grad_norm": 0.274862676858902, "learning_rate": 3.999985057168549e-05, "loss": 0.5433, "mean_token_accuracy": 0.8047059774398804, "step": 100 }, { "epoch": 0.31761006289308175, "grad_norm": 0.2879992723464966, "learning_rate": 3.999976651844029e-05, "loss": 0.5565, "mean_token_accuracy": 0.8013796210289001, "step": 101 }, { "epoch": 0.32075471698113206, "grad_norm": 0.3317185342311859, "learning_rate": 3.9999663786873816e-05, "loss": 0.5556, "mean_token_accuracy": 0.8010111451148987, "step": 102 }, { "epoch": 0.3238993710691824, "grad_norm": 0.2893899083137512, "learning_rate": 3.99995423770927e-05, "loss": 0.5392, "mean_token_accuracy": 0.8065787553787231, "step": 103 }, { "epoch": 0.3270440251572327, "grad_norm": 0.27327823638916016, "learning_rate": 3.999940228922291e-05, "loss": 0.5418, "mean_token_accuracy": 0.8071911334991455, "step": 104 }, { "epoch": 0.330188679245283, "grad_norm": 0.301395058631897, "learning_rate": 3.9999243523409826e-05, "loss": 0.5467, "mean_token_accuracy": 0.8041232228279114, "step": 105 }, { "epoch": 0.3333333333333333, "grad_norm": 0.2282310575246811, "learning_rate": 3.999906607981819e-05, "loss": 0.5353, "mean_token_accuracy": 0.8070929050445557, "step": 106 }, { "epoch": 0.33647798742138363, "grad_norm": 0.27638840675354004, "learning_rate": 3.999886995863214e-05, "loss": 0.5448, "mean_token_accuracy": 0.8050588369369507, "step": 107 }, { "epoch": 0.33962264150943394, "grad_norm": 0.257600873708725, "learning_rate": 3.999865516005519e-05, "loss": 0.5502, "mean_token_accuracy": 0.8030370473861694, "step": 108 }, { "epoch": 0.34276729559748426, "grad_norm": 0.23493166267871857, "learning_rate": 3.999842168431023e-05, "loss": 0.5461, "mean_token_accuracy": 0.8061801195144653, "step": 109 }, { "epoch": 0.34591194968553457, "grad_norm": 0.33576369285583496, "learning_rate": 3.9998169531639546e-05, "loss": 0.5515, "mean_token_accuracy": 0.8048064708709717, "step": 110 }, { "epoch": 0.3490566037735849, "grad_norm": 0.28176188468933105, "learning_rate": 3.999789870230479e-05, "loss": 0.551, "mean_token_accuracy": 0.8034514784812927, "step": 111 }, { "epoch": 0.3522012578616352, "grad_norm": 0.27955150604248047, "learning_rate": 3.9997609196587005e-05, "loss": 0.5475, "mean_token_accuracy": 0.803336501121521, "step": 112 }, { "epoch": 0.3553459119496855, "grad_norm": 0.2750742435455322, "learning_rate": 3.999730101478661e-05, "loss": 0.5487, "mean_token_accuracy": 0.8029544353485107, "step": 113 }, { "epoch": 0.3584905660377358, "grad_norm": 0.27893856167793274, "learning_rate": 3.9996974157223395e-05, "loss": 0.5481, "mean_token_accuracy": 0.8042543530464172, "step": 114 }, { "epoch": 0.36163522012578614, "grad_norm": 0.3115741014480591, "learning_rate": 3.9996628624236556e-05, "loss": 0.5463, "mean_token_accuracy": 0.8052036762237549, "step": 115 }, { "epoch": 0.36477987421383645, "grad_norm": 0.21932333707809448, "learning_rate": 3.999626441618464e-05, "loss": 0.5362, "mean_token_accuracy": 0.8065166473388672, "step": 116 }, { "epoch": 0.36792452830188677, "grad_norm": 0.3026050329208374, "learning_rate": 3.999588153344559e-05, "loss": 0.5307, "mean_token_accuracy": 0.8093136548995972, "step": 117 }, { "epoch": 0.3710691823899371, "grad_norm": 0.3253287076950073, "learning_rate": 3.9995479976416725e-05, "loss": 0.5503, "mean_token_accuracy": 0.8045260310173035, "step": 118 }, { "epoch": 0.3742138364779874, "grad_norm": 0.30211326479911804, "learning_rate": 3.999505974551473e-05, "loss": 0.5492, "mean_token_accuracy": 0.8044459223747253, "step": 119 }, { "epoch": 0.37735849056603776, "grad_norm": 0.2566331923007965, "learning_rate": 3.9994620841175694e-05, "loss": 0.5533, "mean_token_accuracy": 0.8031808733940125, "step": 120 }, { "epoch": 0.3805031446540881, "grad_norm": 0.32716792821884155, "learning_rate": 3.999416326385505e-05, "loss": 0.5421, "mean_token_accuracy": 0.8031218647956848, "step": 121 }, { "epoch": 0.3836477987421384, "grad_norm": 0.2625165581703186, "learning_rate": 3.999368701402763e-05, "loss": 0.5465, "mean_token_accuracy": 0.8032469749450684, "step": 122 }, { "epoch": 0.3867924528301887, "grad_norm": 0.334791362285614, "learning_rate": 3.9993192092187644e-05, "loss": 0.5469, "mean_token_accuracy": 0.8045857548713684, "step": 123 }, { "epoch": 0.389937106918239, "grad_norm": 0.28799203038215637, "learning_rate": 3.9992678498848664e-05, "loss": 0.544, "mean_token_accuracy": 0.8061292171478271, "step": 124 }, { "epoch": 0.39308176100628933, "grad_norm": 0.3053019642829895, "learning_rate": 3.9992146234543645e-05, "loss": 0.5359, "mean_token_accuracy": 0.807460606098175, "step": 125 }, { "epoch": 0.39622641509433965, "grad_norm": 0.26873913407325745, "learning_rate": 3.999159529982493e-05, "loss": 0.5514, "mean_token_accuracy": 0.8021828532218933, "step": 126 }, { "epoch": 0.39937106918238996, "grad_norm": 0.2870161533355713, "learning_rate": 3.9991025695264205e-05, "loss": 0.539, "mean_token_accuracy": 0.8052948713302612, "step": 127 }, { "epoch": 0.4025157232704403, "grad_norm": 0.2600720226764679, "learning_rate": 3.9990437421452556e-05, "loss": 0.5475, "mean_token_accuracy": 0.8043171763420105, "step": 128 }, { "epoch": 0.4056603773584906, "grad_norm": 0.31923291087150574, "learning_rate": 3.9989830479000435e-05, "loss": 0.536, "mean_token_accuracy": 0.8079201579093933, "step": 129 }, { "epoch": 0.4088050314465409, "grad_norm": 0.2548767924308777, "learning_rate": 3.9989204868537654e-05, "loss": 0.5412, "mean_token_accuracy": 0.8052085041999817, "step": 130 }, { "epoch": 0.4119496855345912, "grad_norm": 0.2763267755508423, "learning_rate": 3.998856059071342e-05, "loss": 0.5434, "mean_token_accuracy": 0.8055262565612793, "step": 131 }, { "epoch": 0.41509433962264153, "grad_norm": 0.2880299985408783, "learning_rate": 3.99878976461963e-05, "loss": 0.5349, "mean_token_accuracy": 0.8085739016532898, "step": 132 }, { "epoch": 0.41823899371069184, "grad_norm": 0.2466239184141159, "learning_rate": 3.998721603567422e-05, "loss": 0.5464, "mean_token_accuracy": 0.8047433495521545, "step": 133 }, { "epoch": 0.42138364779874216, "grad_norm": 0.3075404763221741, "learning_rate": 3.9986515759854495e-05, "loss": 0.5366, "mean_token_accuracy": 0.8063505291938782, "step": 134 }, { "epoch": 0.42452830188679247, "grad_norm": 0.2623771131038666, "learning_rate": 3.9985796819463795e-05, "loss": 0.5354, "mean_token_accuracy": 0.8071600198745728, "step": 135 }, { "epoch": 0.4276729559748428, "grad_norm": 0.2850979268550873, "learning_rate": 3.998505921524816e-05, "loss": 0.5383, "mean_token_accuracy": 0.8051438331604004, "step": 136 }, { "epoch": 0.4308176100628931, "grad_norm": 0.21707168221473694, "learning_rate": 3.9984302947973005e-05, "loss": 0.5326, "mean_token_accuracy": 0.8092407584190369, "step": 137 }, { "epoch": 0.4339622641509434, "grad_norm": 0.2864557206630707, "learning_rate": 3.9983528018423106e-05, "loss": 0.5376, "mean_token_accuracy": 0.8068653345108032, "step": 138 }, { "epoch": 0.4371069182389937, "grad_norm": 0.227940633893013, "learning_rate": 3.998273442740261e-05, "loss": 0.5355, "mean_token_accuracy": 0.8075131177902222, "step": 139 }, { "epoch": 0.44025157232704404, "grad_norm": 0.2534600496292114, "learning_rate": 3.9981922175735014e-05, "loss": 0.536, "mean_token_accuracy": 0.8077684640884399, "step": 140 }, { "epoch": 0.44339622641509435, "grad_norm": 0.3271535038948059, "learning_rate": 3.9981091264263205e-05, "loss": 0.5227, "mean_token_accuracy": 0.8108183145523071, "step": 141 }, { "epoch": 0.44654088050314467, "grad_norm": 0.2163369208574295, "learning_rate": 3.998024169384941e-05, "loss": 0.5402, "mean_token_accuracy": 0.8060819506645203, "step": 142 }, { "epoch": 0.449685534591195, "grad_norm": 0.4553479850292206, "learning_rate": 3.997937346537522e-05, "loss": 0.5296, "mean_token_accuracy": 0.8097839951515198, "step": 143 }, { "epoch": 0.4528301886792453, "grad_norm": 0.31248709559440613, "learning_rate": 3.9978486579741596e-05, "loss": 0.5374, "mean_token_accuracy": 0.8072243928909302, "step": 144 }, { "epoch": 0.4559748427672956, "grad_norm": 0.5377129316329956, "learning_rate": 3.9977581037868874e-05, "loss": 0.5358, "mean_token_accuracy": 0.8072550296783447, "step": 145 }, { "epoch": 0.4591194968553459, "grad_norm": 0.443276584148407, "learning_rate": 3.997665684069671e-05, "loss": 0.5346, "mean_token_accuracy": 0.8065415024757385, "step": 146 }, { "epoch": 0.46226415094339623, "grad_norm": 0.4334987699985504, "learning_rate": 3.997571398918415e-05, "loss": 0.5322, "mean_token_accuracy": 0.8077476620674133, "step": 147 }, { "epoch": 0.46540880503144655, "grad_norm": 0.4879336655139923, "learning_rate": 3.99747524843096e-05, "loss": 0.5402, "mean_token_accuracy": 0.8054540157318115, "step": 148 }, { "epoch": 0.46855345911949686, "grad_norm": 0.3232817053794861, "learning_rate": 3.9973772327070805e-05, "loss": 0.5281, "mean_token_accuracy": 0.8070737719535828, "step": 149 }, { "epoch": 0.4716981132075472, "grad_norm": 0.373993843793869, "learning_rate": 3.997277351848486e-05, "loss": 0.542, "mean_token_accuracy": 0.8036733865737915, "step": 150 }, { "epoch": 0.4748427672955975, "grad_norm": 0.2840160131454468, "learning_rate": 3.997175605958825e-05, "loss": 0.525, "mean_token_accuracy": 0.8079817891120911, "step": 151 }, { "epoch": 0.4779874213836478, "grad_norm": 0.34674006700515747, "learning_rate": 3.997071995143676e-05, "loss": 0.5432, "mean_token_accuracy": 0.8057137727737427, "step": 152 }, { "epoch": 0.4811320754716981, "grad_norm": 0.2753613591194153, "learning_rate": 3.9969665195105585e-05, "loss": 0.5401, "mean_token_accuracy": 0.8051686882972717, "step": 153 }, { "epoch": 0.48427672955974843, "grad_norm": 0.338226854801178, "learning_rate": 3.996859179168923e-05, "loss": 0.5343, "mean_token_accuracy": 0.8090754747390747, "step": 154 }, { "epoch": 0.48742138364779874, "grad_norm": 0.32556068897247314, "learning_rate": 3.996749974230157e-05, "loss": 0.5375, "mean_token_accuracy": 0.8059588670730591, "step": 155 }, { "epoch": 0.49056603773584906, "grad_norm": 0.2867760956287384, "learning_rate": 3.9966389048075815e-05, "loss": 0.5243, "mean_token_accuracy": 0.8106968998908997, "step": 156 }, { "epoch": 0.4937106918238994, "grad_norm": 0.33583030104637146, "learning_rate": 3.996525971016453e-05, "loss": 0.5333, "mean_token_accuracy": 0.8073939681053162, "step": 157 }, { "epoch": 0.4968553459119497, "grad_norm": 0.25750523805618286, "learning_rate": 3.9964111729739636e-05, "loss": 0.5366, "mean_token_accuracy": 0.8069300651550293, "step": 158 }, { "epoch": 0.5, "grad_norm": 0.3350702226161957, "learning_rate": 3.9962945107992384e-05, "loss": 0.5297, "mean_token_accuracy": 0.8082327246665955, "step": 159 }, { "epoch": 0.5031446540880503, "grad_norm": 0.25913313031196594, "learning_rate": 3.996175984613337e-05, "loss": 0.5346, "mean_token_accuracy": 0.8069829940795898, "step": 160 }, { "epoch": 0.5062893081761006, "grad_norm": 0.26537439227104187, "learning_rate": 3.996055594539255e-05, "loss": 0.5227, "mean_token_accuracy": 0.8109566569328308, "step": 161 }, { "epoch": 0.5094339622641509, "grad_norm": 0.29593321681022644, "learning_rate": 3.99593334070192e-05, "loss": 0.523, "mean_token_accuracy": 0.8093637228012085, "step": 162 }, { "epoch": 0.5125786163522013, "grad_norm": 0.2575869858264923, "learning_rate": 3.995809223228195e-05, "loss": 0.5406, "mean_token_accuracy": 0.8029673099517822, "step": 163 }, { "epoch": 0.5157232704402516, "grad_norm": 0.301028847694397, "learning_rate": 3.995683242246876e-05, "loss": 0.5331, "mean_token_accuracy": 0.8078096508979797, "step": 164 }, { "epoch": 0.5188679245283019, "grad_norm": 0.29553452134132385, "learning_rate": 3.995555397888693e-05, "loss": 0.5213, "mean_token_accuracy": 0.8104479312896729, "step": 165 }, { "epoch": 0.5220125786163522, "grad_norm": 0.30347883701324463, "learning_rate": 3.995425690286311e-05, "loss": 0.5386, "mean_token_accuracy": 0.8064728379249573, "step": 166 }, { "epoch": 0.5251572327044025, "grad_norm": 0.3146721422672272, "learning_rate": 3.995294119574326e-05, "loss": 0.5259, "mean_token_accuracy": 0.8080021739006042, "step": 167 }, { "epoch": 0.5283018867924528, "grad_norm": 0.26118120551109314, "learning_rate": 3.99516068588927e-05, "loss": 0.5187, "mean_token_accuracy": 0.812569260597229, "step": 168 }, { "epoch": 0.5314465408805031, "grad_norm": 0.3231758177280426, "learning_rate": 3.995025389369606e-05, "loss": 0.5239, "mean_token_accuracy": 0.8090735673904419, "step": 169 }, { "epoch": 0.5345911949685535, "grad_norm": 0.21853643655776978, "learning_rate": 3.9948882301557306e-05, "loss": 0.5162, "mean_token_accuracy": 0.8130766153335571, "step": 170 }, { "epoch": 0.5377358490566038, "grad_norm": 0.2971287667751312, "learning_rate": 3.9947492083899746e-05, "loss": 0.5249, "mean_token_accuracy": 0.8092015385627747, "step": 171 }, { "epoch": 0.5408805031446541, "grad_norm": 0.2766264081001282, "learning_rate": 3.9946083242166e-05, "loss": 0.5202, "mean_token_accuracy": 0.8101739287376404, "step": 172 }, { "epoch": 0.5440251572327044, "grad_norm": 0.22849610447883606, "learning_rate": 3.9944655777818024e-05, "loss": 0.5173, "mean_token_accuracy": 0.8131073117256165, "step": 173 }, { "epoch": 0.5471698113207547, "grad_norm": 0.3846280872821808, "learning_rate": 3.994320969233709e-05, "loss": 0.5295, "mean_token_accuracy": 0.807365357875824, "step": 174 }, { "epoch": 0.550314465408805, "grad_norm": 0.2353799194097519, "learning_rate": 3.9941744987223796e-05, "loss": 0.5241, "mean_token_accuracy": 0.810299813747406, "step": 175 }, { "epoch": 0.5534591194968553, "grad_norm": 0.36173367500305176, "learning_rate": 3.994026166399808e-05, "loss": 0.531, "mean_token_accuracy": 0.8072293996810913, "step": 176 }, { "epoch": 0.5566037735849056, "grad_norm": 0.25161418318748474, "learning_rate": 3.993875972419916e-05, "loss": 0.5245, "mean_token_accuracy": 0.8084467053413391, "step": 177 }, { "epoch": 0.559748427672956, "grad_norm": 0.411323219537735, "learning_rate": 3.993723916938561e-05, "loss": 0.5297, "mean_token_accuracy": 0.8063352704048157, "step": 178 }, { "epoch": 0.5628930817610063, "grad_norm": 0.28566470742225647, "learning_rate": 3.9935700001135307e-05, "loss": 0.5246, "mean_token_accuracy": 0.8076804280281067, "step": 179 }, { "epoch": 0.5660377358490566, "grad_norm": 0.3744773864746094, "learning_rate": 3.9934142221045434e-05, "loss": 0.5219, "mean_token_accuracy": 0.8113803863525391, "step": 180 }, { "epoch": 0.5691823899371069, "grad_norm": 0.3109516501426697, "learning_rate": 3.9932565830732505e-05, "loss": 0.5152, "mean_token_accuracy": 0.8134464025497437, "step": 181 }, { "epoch": 0.5723270440251572, "grad_norm": 0.40750232338905334, "learning_rate": 3.993097083183233e-05, "loss": 0.5119, "mean_token_accuracy": 0.8139127492904663, "step": 182 }, { "epoch": 0.5754716981132075, "grad_norm": 0.3474627733230591, "learning_rate": 3.9929357226000045e-05, "loss": 0.538, "mean_token_accuracy": 0.8060604333877563, "step": 183 }, { "epoch": 0.5786163522012578, "grad_norm": 0.39326900243759155, "learning_rate": 3.992772501491008e-05, "loss": 0.5209, "mean_token_accuracy": 0.8119919300079346, "step": 184 }, { "epoch": 0.5817610062893082, "grad_norm": 0.390663743019104, "learning_rate": 3.992607420025618e-05, "loss": 0.5165, "mean_token_accuracy": 0.8117203712463379, "step": 185 }, { "epoch": 0.5849056603773585, "grad_norm": 0.2819702923297882, "learning_rate": 3.9924404783751385e-05, "loss": 0.5287, "mean_token_accuracy": 0.8078507781028748, "step": 186 }, { "epoch": 0.5880503144654088, "grad_norm": 0.3405737280845642, "learning_rate": 3.992271676712805e-05, "loss": 0.5264, "mean_token_accuracy": 0.8081623315811157, "step": 187 }, { "epoch": 0.5911949685534591, "grad_norm": 0.24368081986904144, "learning_rate": 3.9921010152137824e-05, "loss": 0.5298, "mean_token_accuracy": 0.8077815771102905, "step": 188 }, { "epoch": 0.5943396226415094, "grad_norm": 0.2949293553829193, "learning_rate": 3.991928494055166e-05, "loss": 0.5169, "mean_token_accuracy": 0.8117429614067078, "step": 189 }, { "epoch": 0.5974842767295597, "grad_norm": 0.2298242747783661, "learning_rate": 3.991754113415981e-05, "loss": 0.5209, "mean_token_accuracy": 0.8099380731582642, "step": 190 }, { "epoch": 0.60062893081761, "grad_norm": 0.2921850383281708, "learning_rate": 3.9915778734771816e-05, "loss": 0.5249, "mean_token_accuracy": 0.8096524477005005, "step": 191 }, { "epoch": 0.6037735849056604, "grad_norm": 0.19392161071300507, "learning_rate": 3.991399774421651e-05, "loss": 0.5114, "mean_token_accuracy": 0.8131522536277771, "step": 192 }, { "epoch": 0.6069182389937107, "grad_norm": 0.3399420976638794, "learning_rate": 3.991219816434204e-05, "loss": 0.5156, "mean_token_accuracy": 0.8106433749198914, "step": 193 }, { "epoch": 0.610062893081761, "grad_norm": 0.2603597939014435, "learning_rate": 3.99103799970158e-05, "loss": 0.5152, "mean_token_accuracy": 0.8112037777900696, "step": 194 }, { "epoch": 0.6132075471698113, "grad_norm": 0.2951433062553406, "learning_rate": 3.990854324412453e-05, "loss": 0.5243, "mean_token_accuracy": 0.8115384578704834, "step": 195 }, { "epoch": 0.6163522012578616, "grad_norm": 0.2732630968093872, "learning_rate": 3.9906687907574186e-05, "loss": 0.525, "mean_token_accuracy": 0.8100634813308716, "step": 196 }, { "epoch": 0.6194968553459119, "grad_norm": 0.24679310619831085, "learning_rate": 3.9904813989290084e-05, "loss": 0.5178, "mean_token_accuracy": 0.8117086291313171, "step": 197 }, { "epoch": 0.6226415094339622, "grad_norm": 0.2945667505264282, "learning_rate": 3.990292149121675e-05, "loss": 0.5088, "mean_token_accuracy": 0.8141837120056152, "step": 198 }, { "epoch": 0.6257861635220126, "grad_norm": 0.24672532081604004, "learning_rate": 3.9901010415318066e-05, "loss": 0.5146, "mean_token_accuracy": 0.8105906844139099, "step": 199 }, { "epoch": 0.6289308176100629, "grad_norm": 0.32529643177986145, "learning_rate": 3.9899080763577126e-05, "loss": 0.5253, "mean_token_accuracy": 0.8105737566947937, "step": 200 }, { "epoch": 0.6320754716981132, "grad_norm": 0.2720421552658081, "learning_rate": 3.9897132537996326e-05, "loss": 0.5182, "mean_token_accuracy": 0.8114479780197144, "step": 201 }, { "epoch": 0.6352201257861635, "grad_norm": 0.2893914580345154, "learning_rate": 3.9895165740597336e-05, "loss": 0.5259, "mean_token_accuracy": 0.8097965121269226, "step": 202 }, { "epoch": 0.6383647798742138, "grad_norm": 0.2724241018295288, "learning_rate": 3.989318037342111e-05, "loss": 0.5149, "mean_token_accuracy": 0.8123500347137451, "step": 203 }, { "epoch": 0.6415094339622641, "grad_norm": 0.2764907479286194, "learning_rate": 3.989117643852785e-05, "loss": 0.5153, "mean_token_accuracy": 0.8120966553688049, "step": 204 }, { "epoch": 0.6446540880503144, "grad_norm": 0.24415266513824463, "learning_rate": 3.988915393799703e-05, "loss": 0.5111, "mean_token_accuracy": 0.8132697343826294, "step": 205 }, { "epoch": 0.6477987421383647, "grad_norm": 0.24454502761363983, "learning_rate": 3.988711287392741e-05, "loss": 0.5063, "mean_token_accuracy": 0.8129962682723999, "step": 206 }, { "epoch": 0.6509433962264151, "grad_norm": 0.25321948528289795, "learning_rate": 3.9885053248436986e-05, "loss": 0.5117, "mean_token_accuracy": 0.8123859763145447, "step": 207 }, { "epoch": 0.6540880503144654, "grad_norm": 0.21559487283229828, "learning_rate": 3.9882975063663026e-05, "loss": 0.5242, "mean_token_accuracy": 0.8082608580589294, "step": 208 }, { "epoch": 0.6572327044025157, "grad_norm": 0.2384122759103775, "learning_rate": 3.9880878321762066e-05, "loss": 0.5156, "mean_token_accuracy": 0.8102776408195496, "step": 209 }, { "epoch": 0.660377358490566, "grad_norm": 0.24567940831184387, "learning_rate": 3.9878763024909884e-05, "loss": 0.5135, "mean_token_accuracy": 0.8105263113975525, "step": 210 }, { "epoch": 0.6635220125786163, "grad_norm": 0.2328757643699646, "learning_rate": 3.987662917530153e-05, "loss": 0.5117, "mean_token_accuracy": 0.812843918800354, "step": 211 }, { "epoch": 0.6666666666666666, "grad_norm": 0.22722886502742767, "learning_rate": 3.987447677515128e-05, "loss": 0.5118, "mean_token_accuracy": 0.8124209642410278, "step": 212 }, { "epoch": 0.6698113207547169, "grad_norm": 0.24911589920520782, "learning_rate": 3.9872305826692686e-05, "loss": 0.5164, "mean_token_accuracy": 0.809599757194519, "step": 213 }, { "epoch": 0.6729559748427673, "grad_norm": 0.21305599808692932, "learning_rate": 3.987011633217853e-05, "loss": 0.5184, "mean_token_accuracy": 0.8106415271759033, "step": 214 }, { "epoch": 0.6761006289308176, "grad_norm": 0.2465692013502121, "learning_rate": 3.986790829388086e-05, "loss": 0.5182, "mean_token_accuracy": 0.8113357424736023, "step": 215 }, { "epoch": 0.6792452830188679, "grad_norm": 0.23801495134830475, "learning_rate": 3.986568171409094e-05, "loss": 0.5161, "mean_token_accuracy": 0.8109778761863708, "step": 216 }, { "epoch": 0.6823899371069182, "grad_norm": 0.22071239352226257, "learning_rate": 3.9863436595119284e-05, "loss": 0.5156, "mean_token_accuracy": 0.8117619156837463, "step": 217 }, { "epoch": 0.6855345911949685, "grad_norm": 0.21508371829986572, "learning_rate": 3.986117293929566e-05, "loss": 0.4984, "mean_token_accuracy": 0.8161824345588684, "step": 218 }, { "epoch": 0.6886792452830188, "grad_norm": 0.21123456954956055, "learning_rate": 3.985889074896904e-05, "loss": 0.5068, "mean_token_accuracy": 0.8125125169754028, "step": 219 }, { "epoch": 0.6918238993710691, "grad_norm": 0.2076350450515747, "learning_rate": 3.985659002650767e-05, "loss": 0.5173, "mean_token_accuracy": 0.8120942115783691, "step": 220 }, { "epoch": 0.6949685534591195, "grad_norm": 0.20349815487861633, "learning_rate": 3.9854270774299e-05, "loss": 0.5088, "mean_token_accuracy": 0.8142285943031311, "step": 221 }, { "epoch": 0.6981132075471698, "grad_norm": 0.2199343591928482, "learning_rate": 3.9851932994749704e-05, "loss": 0.5158, "mean_token_accuracy": 0.811479926109314, "step": 222 }, { "epoch": 0.7012578616352201, "grad_norm": 0.2165432721376419, "learning_rate": 3.98495766902857e-05, "loss": 0.5092, "mean_token_accuracy": 0.8122998476028442, "step": 223 }, { "epoch": 0.7044025157232704, "grad_norm": 0.21602460741996765, "learning_rate": 3.984720186335211e-05, "loss": 0.5192, "mean_token_accuracy": 0.811272382736206, "step": 224 }, { "epoch": 0.7075471698113207, "grad_norm": 0.2049703598022461, "learning_rate": 3.9844808516413305e-05, "loss": 0.5076, "mean_token_accuracy": 0.8121902346611023, "step": 225 }, { "epoch": 0.710691823899371, "grad_norm": 0.22164130210876465, "learning_rate": 3.9842396651952836e-05, "loss": 0.5131, "mean_token_accuracy": 0.8126029968261719, "step": 226 }, { "epoch": 0.7138364779874213, "grad_norm": 0.19871044158935547, "learning_rate": 3.98399662724735e-05, "loss": 0.5163, "mean_token_accuracy": 0.8120744228363037, "step": 227 }, { "epoch": 0.7169811320754716, "grad_norm": 0.25331369042396545, "learning_rate": 3.98375173804973e-05, "loss": 0.512, "mean_token_accuracy": 0.811780571937561, "step": 228 }, { "epoch": 0.720125786163522, "grad_norm": 0.21906249225139618, "learning_rate": 3.983504997856544e-05, "loss": 0.5169, "mean_token_accuracy": 0.8110173344612122, "step": 229 }, { "epoch": 0.7232704402515723, "grad_norm": 0.22297775745391846, "learning_rate": 3.983256406923835e-05, "loss": 0.5185, "mean_token_accuracy": 0.8114131689071655, "step": 230 }, { "epoch": 0.7264150943396226, "grad_norm": 0.19183681905269623, "learning_rate": 3.9830059655095625e-05, "loss": 0.516, "mean_token_accuracy": 0.8132800459861755, "step": 231 }, { "epoch": 0.7295597484276729, "grad_norm": 0.24273158609867096, "learning_rate": 3.9827536738736115e-05, "loss": 0.5179, "mean_token_accuracy": 0.8097060322761536, "step": 232 }, { "epoch": 0.7327044025157232, "grad_norm": 0.21596857905387878, "learning_rate": 3.982499532277785e-05, "loss": 0.491, "mean_token_accuracy": 0.8167231678962708, "step": 233 }, { "epoch": 0.7358490566037735, "grad_norm": 0.23281732201576233, "learning_rate": 3.982243540985803e-05, "loss": 0.5211, "mean_token_accuracy": 0.8096528649330139, "step": 234 }, { "epoch": 0.7389937106918238, "grad_norm": 0.1850733757019043, "learning_rate": 3.9819857002633084e-05, "loss": 0.5119, "mean_token_accuracy": 0.812339723110199, "step": 235 }, { "epoch": 0.7421383647798742, "grad_norm": 0.2426994889974594, "learning_rate": 3.981726010377862e-05, "loss": 0.5155, "mean_token_accuracy": 0.8118655681610107, "step": 236 }, { "epoch": 0.7452830188679245, "grad_norm": 0.20813558995723724, "learning_rate": 3.981464471598943e-05, "loss": 0.5147, "mean_token_accuracy": 0.8122422099113464, "step": 237 }, { "epoch": 0.7484276729559748, "grad_norm": 0.25706005096435547, "learning_rate": 3.98120108419795e-05, "loss": 0.5029, "mean_token_accuracy": 0.814775824546814, "step": 238 }, { "epoch": 0.7515723270440252, "grad_norm": 0.22012631595134735, "learning_rate": 3.9809358484482e-05, "loss": 0.5086, "mean_token_accuracy": 0.8135779500007629, "step": 239 }, { "epoch": 0.7547169811320755, "grad_norm": 0.2591609060764313, "learning_rate": 3.980668764624927e-05, "loss": 0.5166, "mean_token_accuracy": 0.811547577381134, "step": 240 }, { "epoch": 0.7578616352201258, "grad_norm": 0.2425551414489746, "learning_rate": 3.9803998330052834e-05, "loss": 0.5106, "mean_token_accuracy": 0.8126974105834961, "step": 241 }, { "epoch": 0.7610062893081762, "grad_norm": 0.19677698612213135, "learning_rate": 3.980129053868339e-05, "loss": 0.5025, "mean_token_accuracy": 0.8132906556129456, "step": 242 }, { "epoch": 0.7641509433962265, "grad_norm": 0.24536176025867462, "learning_rate": 3.9798564274950815e-05, "loss": 0.4983, "mean_token_accuracy": 0.8168281316757202, "step": 243 }, { "epoch": 0.7672955974842768, "grad_norm": 0.23609694838523865, "learning_rate": 3.979581954168414e-05, "loss": 0.4942, "mean_token_accuracy": 0.8176018595695496, "step": 244 }, { "epoch": 0.7704402515723271, "grad_norm": 0.2168692797422409, "learning_rate": 3.9793056341731556e-05, "loss": 0.5168, "mean_token_accuracy": 0.8108464479446411, "step": 245 }, { "epoch": 0.7735849056603774, "grad_norm": 0.21630840003490448, "learning_rate": 3.979027467796046e-05, "loss": 0.501, "mean_token_accuracy": 0.8146473169326782, "step": 246 }, { "epoch": 0.7767295597484277, "grad_norm": 0.20055299997329712, "learning_rate": 3.978747455325736e-05, "loss": 0.5082, "mean_token_accuracy": 0.8135594129562378, "step": 247 }, { "epoch": 0.779874213836478, "grad_norm": 0.282925546169281, "learning_rate": 3.9784655970527934e-05, "loss": 0.5144, "mean_token_accuracy": 0.8131219744682312, "step": 248 }, { "epoch": 0.7830188679245284, "grad_norm": 0.21999655663967133, "learning_rate": 3.978181893269703e-05, "loss": 0.5046, "mean_token_accuracy": 0.8133974671363831, "step": 249 }, { "epoch": 0.7861635220125787, "grad_norm": 0.2439885139465332, "learning_rate": 3.977896344270864e-05, "loss": 0.5052, "mean_token_accuracy": 0.8127160668373108, "step": 250 }, { "epoch": 0.789308176100629, "grad_norm": 0.2688886225223541, "learning_rate": 3.9776089503525895e-05, "loss": 0.5109, "mean_token_accuracy": 0.81061190366745, "step": 251 }, { "epoch": 0.7924528301886793, "grad_norm": 0.2625892162322998, "learning_rate": 3.977319711813107e-05, "loss": 0.5172, "mean_token_accuracy": 0.8132520914077759, "step": 252 }, { "epoch": 0.7955974842767296, "grad_norm": 0.1982739269733429, "learning_rate": 3.97702862895256e-05, "loss": 0.499, "mean_token_accuracy": 0.8147685527801514, "step": 253 }, { "epoch": 0.7987421383647799, "grad_norm": 0.2820964455604553, "learning_rate": 3.976735702073003e-05, "loss": 0.5134, "mean_token_accuracy": 0.8140535354614258, "step": 254 }, { "epoch": 0.8018867924528302, "grad_norm": 0.19811579585075378, "learning_rate": 3.9764409314784074e-05, "loss": 0.5176, "mean_token_accuracy": 0.8116709589958191, "step": 255 }, { "epoch": 0.8050314465408805, "grad_norm": 0.26263049244880676, "learning_rate": 3.9761443174746556e-05, "loss": 0.5162, "mean_token_accuracy": 0.8113158345222473, "step": 256 }, { "epoch": 0.8081761006289309, "grad_norm": 0.24067486822605133, "learning_rate": 3.975845860369542e-05, "loss": 0.5055, "mean_token_accuracy": 0.814015805721283, "step": 257 }, { "epoch": 0.8113207547169812, "grad_norm": 0.19591133296489716, "learning_rate": 3.975545560472776e-05, "loss": 0.5074, "mean_token_accuracy": 0.8133794069290161, "step": 258 }, { "epoch": 0.8144654088050315, "grad_norm": 0.24649196863174438, "learning_rate": 3.975243418095978e-05, "loss": 0.5141, "mean_token_accuracy": 0.8121534585952759, "step": 259 }, { "epoch": 0.8176100628930818, "grad_norm": 0.20178139209747314, "learning_rate": 3.9749394335526806e-05, "loss": 0.5016, "mean_token_accuracy": 0.8149169087409973, "step": 260 }, { "epoch": 0.8207547169811321, "grad_norm": 0.2396860122680664, "learning_rate": 3.9746336071583284e-05, "loss": 0.5112, "mean_token_accuracy": 0.8116641640663147, "step": 261 }, { "epoch": 0.8238993710691824, "grad_norm": 0.2169322371482849, "learning_rate": 3.9743259392302765e-05, "loss": 0.5033, "mean_token_accuracy": 0.8144574761390686, "step": 262 }, { "epoch": 0.8270440251572327, "grad_norm": 0.27799248695373535, "learning_rate": 3.9740164300877905e-05, "loss": 0.5134, "mean_token_accuracy": 0.8135626316070557, "step": 263 }, { "epoch": 0.8301886792452831, "grad_norm": 0.2029547095298767, "learning_rate": 3.9737050800520484e-05, "loss": 0.4998, "mean_token_accuracy": 0.8158847689628601, "step": 264 }, { "epoch": 0.8333333333333334, "grad_norm": 0.23742541670799255, "learning_rate": 3.973391889446137e-05, "loss": 0.5002, "mean_token_accuracy": 0.8158310651779175, "step": 265 }, { "epoch": 0.8364779874213837, "grad_norm": 0.23540346324443817, "learning_rate": 3.973076858595054e-05, "loss": 0.5075, "mean_token_accuracy": 0.8140506148338318, "step": 266 }, { "epoch": 0.839622641509434, "grad_norm": 0.21212562918663025, "learning_rate": 3.972759987825706e-05, "loss": 0.4964, "mean_token_accuracy": 0.817358672618866, "step": 267 }, { "epoch": 0.8427672955974843, "grad_norm": 0.2811741828918457, "learning_rate": 3.972441277466909e-05, "loss": 0.5045, "mean_token_accuracy": 0.8151137828826904, "step": 268 }, { "epoch": 0.8459119496855346, "grad_norm": 0.22324548661708832, "learning_rate": 3.972120727849388e-05, "loss": 0.5184, "mean_token_accuracy": 0.8102690577507019, "step": 269 }, { "epoch": 0.8490566037735849, "grad_norm": 0.2826754152774811, "learning_rate": 3.9717983393057774e-05, "loss": 0.5033, "mean_token_accuracy": 0.8149948120117188, "step": 270 }, { "epoch": 0.8522012578616353, "grad_norm": 0.20113104581832886, "learning_rate": 3.971474112170618e-05, "loss": 0.4963, "mean_token_accuracy": 0.8169792890548706, "step": 271 }, { "epoch": 0.8553459119496856, "grad_norm": 0.31522729992866516, "learning_rate": 3.971148046780361e-05, "loss": 0.5043, "mean_token_accuracy": 0.8139733672142029, "step": 272 }, { "epoch": 0.8584905660377359, "grad_norm": 0.24440452456474304, "learning_rate": 3.970820143473363e-05, "loss": 0.506, "mean_token_accuracy": 0.8141810297966003, "step": 273 }, { "epoch": 0.8616352201257862, "grad_norm": 0.34243300557136536, "learning_rate": 3.970490402589889e-05, "loss": 0.5009, "mean_token_accuracy": 0.8145892024040222, "step": 274 }, { "epoch": 0.8647798742138365, "grad_norm": 0.3184676468372345, "learning_rate": 3.970158824472109e-05, "loss": 0.5019, "mean_token_accuracy": 0.8156532049179077, "step": 275 }, { "epoch": 0.8679245283018868, "grad_norm": 0.2528303861618042, "learning_rate": 3.969825409464103e-05, "loss": 0.5099, "mean_token_accuracy": 0.8132652044296265, "step": 276 }, { "epoch": 0.8710691823899371, "grad_norm": 0.2652378976345062, "learning_rate": 3.969490157911854e-05, "loss": 0.4965, "mean_token_accuracy": 0.8170227408409119, "step": 277 }, { "epoch": 0.8742138364779874, "grad_norm": 0.2514651119709015, "learning_rate": 3.9691530701632525e-05, "loss": 0.4947, "mean_token_accuracy": 0.8181447386741638, "step": 278 }, { "epoch": 0.8773584905660378, "grad_norm": 0.24551190435886383, "learning_rate": 3.968814146568093e-05, "loss": 0.5039, "mean_token_accuracy": 0.8154646158218384, "step": 279 }, { "epoch": 0.8805031446540881, "grad_norm": 0.250418096780777, "learning_rate": 3.9684733874780764e-05, "loss": 0.5057, "mean_token_accuracy": 0.8137528896331787, "step": 280 }, { "epoch": 0.8836477987421384, "grad_norm": 0.24162472784519196, "learning_rate": 3.9681307932468066e-05, "loss": 0.499, "mean_token_accuracy": 0.8132847547531128, "step": 281 }, { "epoch": 0.8867924528301887, "grad_norm": 0.25049126148223877, "learning_rate": 3.967786364229794e-05, "loss": 0.5016, "mean_token_accuracy": 0.8135824203491211, "step": 282 }, { "epoch": 0.889937106918239, "grad_norm": 0.21691341698169708, "learning_rate": 3.9674401007844525e-05, "loss": 0.4909, "mean_token_accuracy": 0.8152883052825928, "step": 283 }, { "epoch": 0.8930817610062893, "grad_norm": 0.22291503846645355, "learning_rate": 3.967092003270098e-05, "loss": 0.4959, "mean_token_accuracy": 0.8131790161132812, "step": 284 }, { "epoch": 0.8962264150943396, "grad_norm": 0.21946825087070465, "learning_rate": 3.96674207204795e-05, "loss": 0.4943, "mean_token_accuracy": 0.8152416348457336, "step": 285 }, { "epoch": 0.89937106918239, "grad_norm": 0.21576789021492004, "learning_rate": 3.966390307481133e-05, "loss": 0.4963, "mean_token_accuracy": 0.8158524036407471, "step": 286 }, { "epoch": 0.9025157232704403, "grad_norm": 0.18200084567070007, "learning_rate": 3.966036709934671e-05, "loss": 0.4983, "mean_token_accuracy": 0.8171305656433105, "step": 287 }, { "epoch": 0.9056603773584906, "grad_norm": 0.20305195450782776, "learning_rate": 3.9656812797754924e-05, "loss": 0.5004, "mean_token_accuracy": 0.8154840469360352, "step": 288 }, { "epoch": 0.9088050314465409, "grad_norm": 0.17034946382045746, "learning_rate": 3.965324017372426e-05, "loss": 0.4975, "mean_token_accuracy": 0.8167921304702759, "step": 289 }, { "epoch": 0.9119496855345912, "grad_norm": 0.2208290696144104, "learning_rate": 3.964964923096202e-05, "loss": 0.5022, "mean_token_accuracy": 0.815450131893158, "step": 290 }, { "epoch": 0.9150943396226415, "grad_norm": 0.18164092302322388, "learning_rate": 3.964603997319452e-05, "loss": 0.4964, "mean_token_accuracy": 0.8163471221923828, "step": 291 }, { "epoch": 0.9182389937106918, "grad_norm": 0.18271009624004364, "learning_rate": 3.964241240416708e-05, "loss": 0.489, "mean_token_accuracy": 0.8168913125991821, "step": 292 }, { "epoch": 0.9213836477987422, "grad_norm": 0.19366014003753662, "learning_rate": 3.963876652764402e-05, "loss": 0.5002, "mean_token_accuracy": 0.815049946308136, "step": 293 }, { "epoch": 0.9245283018867925, "grad_norm": 0.21568985283374786, "learning_rate": 3.963510234740866e-05, "loss": 0.5057, "mean_token_accuracy": 0.8131378293037415, "step": 294 }, { "epoch": 0.9276729559748428, "grad_norm": 0.16388444602489471, "learning_rate": 3.963141986726332e-05, "loss": 0.4991, "mean_token_accuracy": 0.8145990371704102, "step": 295 }, { "epoch": 0.9308176100628931, "grad_norm": 0.21059076488018036, "learning_rate": 3.962771909102928e-05, "loss": 0.5068, "mean_token_accuracy": 0.8122072815895081, "step": 296 }, { "epoch": 0.9339622641509434, "grad_norm": 0.20338748395442963, "learning_rate": 3.962400002254685e-05, "loss": 0.5021, "mean_token_accuracy": 0.8148384690284729, "step": 297 }, { "epoch": 0.9371069182389937, "grad_norm": 0.2134150117635727, "learning_rate": 3.962026266567529e-05, "loss": 0.5077, "mean_token_accuracy": 0.8135893940925598, "step": 298 }, { "epoch": 0.940251572327044, "grad_norm": 0.2366155982017517, "learning_rate": 3.961650702429285e-05, "loss": 0.5028, "mean_token_accuracy": 0.8151034712791443, "step": 299 }, { "epoch": 0.9433962264150944, "grad_norm": 0.19872386753559113, "learning_rate": 3.9612733102296757e-05, "loss": 0.49, "mean_token_accuracy": 0.818320095539093, "step": 300 }, { "epoch": 0.9465408805031447, "grad_norm": 0.24557925760746002, "learning_rate": 3.960894090360319e-05, "loss": 0.4978, "mean_token_accuracy": 0.8179511427879333, "step": 301 }, { "epoch": 0.949685534591195, "grad_norm": 0.18634836375713348, "learning_rate": 3.960513043214731e-05, "loss": 0.508, "mean_token_accuracy": 0.8148860335350037, "step": 302 }, { "epoch": 0.9528301886792453, "grad_norm": 0.2479529231786728, "learning_rate": 3.960130169188324e-05, "loss": 0.5061, "mean_token_accuracy": 0.8134167194366455, "step": 303 }, { "epoch": 0.9559748427672956, "grad_norm": 0.2314770370721817, "learning_rate": 3.959745468678407e-05, "loss": 0.506, "mean_token_accuracy": 0.8139018416404724, "step": 304 }, { "epoch": 0.9591194968553459, "grad_norm": 0.18175768852233887, "learning_rate": 3.959358942084179e-05, "loss": 0.5025, "mean_token_accuracy": 0.814060389995575, "step": 305 }, { "epoch": 0.9622641509433962, "grad_norm": 0.26930001378059387, "learning_rate": 3.958970589806741e-05, "loss": 0.4899, "mean_token_accuracy": 0.8199323415756226, "step": 306 }, { "epoch": 0.9654088050314465, "grad_norm": 0.18011023104190826, "learning_rate": 3.958580412249085e-05, "loss": 0.4998, "mean_token_accuracy": 0.8157727122306824, "step": 307 }, { "epoch": 0.9685534591194969, "grad_norm": 0.20771653950214386, "learning_rate": 3.9581884098160965e-05, "loss": 0.4946, "mean_token_accuracy": 0.8169547915458679, "step": 308 }, { "epoch": 0.9716981132075472, "grad_norm": 0.20119163393974304, "learning_rate": 3.9577945829145565e-05, "loss": 0.493, "mean_token_accuracy": 0.816156268119812, "step": 309 }, { "epoch": 0.9748427672955975, "grad_norm": 0.1962110996246338, "learning_rate": 3.9573989319531376e-05, "loss": 0.4838, "mean_token_accuracy": 0.8181387782096863, "step": 310 }, { "epoch": 0.9779874213836478, "grad_norm": 0.19966855645179749, "learning_rate": 3.957001457342407e-05, "loss": 0.494, "mean_token_accuracy": 0.8177944421768188, "step": 311 }, { "epoch": 0.9811320754716981, "grad_norm": 0.16196459531784058, "learning_rate": 3.956602159494822e-05, "loss": 0.4999, "mean_token_accuracy": 0.8170042037963867, "step": 312 }, { "epoch": 0.9842767295597484, "grad_norm": 0.2277836948633194, "learning_rate": 3.956201038824734e-05, "loss": 0.5079, "mean_token_accuracy": 0.8125689625740051, "step": 313 }, { "epoch": 0.9874213836477987, "grad_norm": 0.15822745859622955, "learning_rate": 3.9557980957483857e-05, "loss": 0.4944, "mean_token_accuracy": 0.8152970671653748, "step": 314 }, { "epoch": 0.9905660377358491, "grad_norm": 0.23701541125774384, "learning_rate": 3.955393330683909e-05, "loss": 0.4949, "mean_token_accuracy": 0.8175249695777893, "step": 315 }, { "epoch": 0.9937106918238994, "grad_norm": 0.17451459169387817, "learning_rate": 3.954986744051329e-05, "loss": 0.4903, "mean_token_accuracy": 0.8196095824241638, "step": 316 }, { "epoch": 0.9968553459119497, "grad_norm": 0.20685537159442902, "learning_rate": 3.954578336272559e-05, "loss": 0.4872, "mean_token_accuracy": 0.8189926147460938, "step": 317 }, { "epoch": 1.0, "grad_norm": 0.24290406703948975, "learning_rate": 3.954168107771402e-05, "loss": 0.4891, "mean_token_accuracy": 0.8178141713142395, "step": 318 }, { "epoch": 1.0031446540880504, "grad_norm": 0.17915529012680054, "learning_rate": 3.953756058973554e-05, "loss": 0.4626, "mean_token_accuracy": 0.8239535689353943, "step": 319 }, { "epoch": 1.0062893081761006, "grad_norm": 0.24146446585655212, "learning_rate": 3.9533421903065946e-05, "loss": 0.4629, "mean_token_accuracy": 0.8243134617805481, "step": 320 }, { "epoch": 1.009433962264151, "grad_norm": 0.24396361410617828, "learning_rate": 3.9529265021999965e-05, "loss": 0.4522, "mean_token_accuracy": 0.8263998031616211, "step": 321 }, { "epoch": 1.0125786163522013, "grad_norm": 0.21227137744426727, "learning_rate": 3.952508995085117e-05, "loss": 0.4574, "mean_token_accuracy": 0.8258650898933411, "step": 322 }, { "epoch": 1.0157232704402517, "grad_norm": 0.26239192485809326, "learning_rate": 3.952089669395203e-05, "loss": 0.4477, "mean_token_accuracy": 0.826561689376831, "step": 323 }, { "epoch": 1.0188679245283019, "grad_norm": 0.21801580488681793, "learning_rate": 3.951668525565387e-05, "loss": 0.4621, "mean_token_accuracy": 0.8246572017669678, "step": 324 }, { "epoch": 1.0220125786163523, "grad_norm": 0.24545596539974213, "learning_rate": 3.95124556403269e-05, "loss": 0.442, "mean_token_accuracy": 0.8272949457168579, "step": 325 }, { "epoch": 1.0251572327044025, "grad_norm": 0.21632397174835205, "learning_rate": 3.950820785236018e-05, "loss": 0.4616, "mean_token_accuracy": 0.8258722424507141, "step": 326 }, { "epoch": 1.028301886792453, "grad_norm": 0.25987663865089417, "learning_rate": 3.950394189616164e-05, "loss": 0.4433, "mean_token_accuracy": 0.8266246318817139, "step": 327 }, { "epoch": 1.0314465408805031, "grad_norm": 0.21071231365203857, "learning_rate": 3.949965777615804e-05, "loss": 0.4587, "mean_token_accuracy": 0.8251431584358215, "step": 328 }, { "epoch": 1.0345911949685536, "grad_norm": 0.25732994079589844, "learning_rate": 3.949535549679502e-05, "loss": 0.4542, "mean_token_accuracy": 0.825411856174469, "step": 329 }, { "epoch": 1.0377358490566038, "grad_norm": 0.2170933187007904, "learning_rate": 3.9491035062537026e-05, "loss": 0.4488, "mean_token_accuracy": 0.8288849592208862, "step": 330 }, { "epoch": 1.0408805031446542, "grad_norm": 0.23197437822818756, "learning_rate": 3.948669647786738e-05, "loss": 0.4475, "mean_token_accuracy": 0.8281545042991638, "step": 331 }, { "epoch": 1.0440251572327044, "grad_norm": 0.2003556787967682, "learning_rate": 3.948233974728821e-05, "loss": 0.4581, "mean_token_accuracy": 0.8240156173706055, "step": 332 }, { "epoch": 1.0471698113207548, "grad_norm": 0.24654971063137054, "learning_rate": 3.94779648753205e-05, "loss": 0.4593, "mean_token_accuracy": 0.8252534866333008, "step": 333 }, { "epoch": 1.050314465408805, "grad_norm": 0.1977737993001938, "learning_rate": 3.947357186650403e-05, "loss": 0.4569, "mean_token_accuracy": 0.826411247253418, "step": 334 }, { "epoch": 1.0534591194968554, "grad_norm": 0.1900518238544464, "learning_rate": 3.9469160725397426e-05, "loss": 0.4448, "mean_token_accuracy": 0.8280865550041199, "step": 335 }, { "epoch": 1.0566037735849056, "grad_norm": 0.20230652391910553, "learning_rate": 3.946473145657813e-05, "loss": 0.461, "mean_token_accuracy": 0.8244823813438416, "step": 336 }, { "epoch": 1.059748427672956, "grad_norm": 0.21481932699680328, "learning_rate": 3.946028406464236e-05, "loss": 0.4429, "mean_token_accuracy": 0.8295682668685913, "step": 337 }, { "epoch": 1.0628930817610063, "grad_norm": 0.20926858484745026, "learning_rate": 3.9455818554205186e-05, "loss": 0.4472, "mean_token_accuracy": 0.8260708451271057, "step": 338 }, { "epoch": 1.0660377358490567, "grad_norm": 0.19324615597724915, "learning_rate": 3.945133492990045e-05, "loss": 0.4695, "mean_token_accuracy": 0.8230428099632263, "step": 339 }, { "epoch": 1.069182389937107, "grad_norm": 0.22138728201389313, "learning_rate": 3.944683319638081e-05, "loss": 0.4436, "mean_token_accuracy": 0.8289992213249207, "step": 340 }, { "epoch": 1.0723270440251573, "grad_norm": 0.18856871128082275, "learning_rate": 3.9442313358317694e-05, "loss": 0.4393, "mean_token_accuracy": 0.8271635174751282, "step": 341 }, { "epoch": 1.0754716981132075, "grad_norm": 0.19933389127254486, "learning_rate": 3.943777542040134e-05, "loss": 0.4654, "mean_token_accuracy": 0.8237259387969971, "step": 342 }, { "epoch": 1.078616352201258, "grad_norm": 0.24634775519371033, "learning_rate": 3.943321938734074e-05, "loss": 0.4401, "mean_token_accuracy": 0.829694926738739, "step": 343 }, { "epoch": 1.0817610062893082, "grad_norm": 0.19513890147209167, "learning_rate": 3.942864526386369e-05, "loss": 0.4558, "mean_token_accuracy": 0.825875997543335, "step": 344 }, { "epoch": 1.0849056603773586, "grad_norm": 0.23661689460277557, "learning_rate": 3.9424053054716755e-05, "loss": 0.4544, "mean_token_accuracy": 0.8247132301330566, "step": 345 }, { "epoch": 1.0880503144654088, "grad_norm": 0.23907917737960815, "learning_rate": 3.941944276466526e-05, "loss": 0.464, "mean_token_accuracy": 0.8243969082832336, "step": 346 }, { "epoch": 1.0911949685534592, "grad_norm": 0.21444515883922577, "learning_rate": 3.941481439849328e-05, "loss": 0.4479, "mean_token_accuracy": 0.8274782299995422, "step": 347 }, { "epoch": 1.0943396226415094, "grad_norm": 0.23370276391506195, "learning_rate": 3.941016796100368e-05, "loss": 0.4586, "mean_token_accuracy": 0.8257689476013184, "step": 348 }, { "epoch": 1.0974842767295598, "grad_norm": 0.24292296171188354, "learning_rate": 3.9405503457018045e-05, "loss": 0.4514, "mean_token_accuracy": 0.826076865196228, "step": 349 }, { "epoch": 1.10062893081761, "grad_norm": 0.2037438154220581, "learning_rate": 3.940082089137673e-05, "loss": 0.4601, "mean_token_accuracy": 0.8253549933433533, "step": 350 }, { "epoch": 1.1037735849056605, "grad_norm": 0.2162904292345047, "learning_rate": 3.939612026893881e-05, "loss": 0.4383, "mean_token_accuracy": 0.8296216726303101, "step": 351 }, { "epoch": 1.1069182389937107, "grad_norm": 0.254822313785553, "learning_rate": 3.939140159458213e-05, "loss": 0.4668, "mean_token_accuracy": 0.8235269784927368, "step": 352 }, { "epoch": 1.110062893081761, "grad_norm": 0.1879785656929016, "learning_rate": 3.938666487320323e-05, "loss": 0.4435, "mean_token_accuracy": 0.8316510915756226, "step": 353 }, { "epoch": 1.1132075471698113, "grad_norm": 0.19271206855773926, "learning_rate": 3.9381910109717415e-05, "loss": 0.4598, "mean_token_accuracy": 0.823648989200592, "step": 354 }, { "epoch": 1.1163522012578617, "grad_norm": 0.22165369987487793, "learning_rate": 3.937713730905868e-05, "loss": 0.4611, "mean_token_accuracy": 0.8243672847747803, "step": 355 }, { "epoch": 1.119496855345912, "grad_norm": 0.17029280960559845, "learning_rate": 3.937234647617975e-05, "loss": 0.4536, "mean_token_accuracy": 0.8255760073661804, "step": 356 }, { "epoch": 1.1226415094339623, "grad_norm": 0.21115252375602722, "learning_rate": 3.936753761605208e-05, "loss": 0.4593, "mean_token_accuracy": 0.8264986276626587, "step": 357 }, { "epoch": 1.1257861635220126, "grad_norm": 0.19516626000404358, "learning_rate": 3.936271073366579e-05, "loss": 0.4509, "mean_token_accuracy": 0.8269539475440979, "step": 358 }, { "epoch": 1.128930817610063, "grad_norm": 0.1787671595811844, "learning_rate": 3.935786583402975e-05, "loss": 0.4511, "mean_token_accuracy": 0.8269217014312744, "step": 359 }, { "epoch": 1.1320754716981132, "grad_norm": 0.19312171638011932, "learning_rate": 3.935300292217148e-05, "loss": 0.4382, "mean_token_accuracy": 0.8278734087944031, "step": 360 }, { "epoch": 1.1352201257861636, "grad_norm": 0.1753867268562317, "learning_rate": 3.9348122003137224e-05, "loss": 0.4494, "mean_token_accuracy": 0.8283815979957581, "step": 361 }, { "epoch": 1.1383647798742138, "grad_norm": 0.17436008155345917, "learning_rate": 3.9343223081991904e-05, "loss": 0.4375, "mean_token_accuracy": 0.8322175145149231, "step": 362 }, { "epoch": 1.1415094339622642, "grad_norm": 0.17123448848724365, "learning_rate": 3.933830616381912e-05, "loss": 0.4603, "mean_token_accuracy": 0.8241879940032959, "step": 363 }, { "epoch": 1.1446540880503144, "grad_norm": 0.17863644659519196, "learning_rate": 3.933337125372115e-05, "loss": 0.443, "mean_token_accuracy": 0.8289136290550232, "step": 364 }, { "epoch": 1.1477987421383649, "grad_norm": 0.17425286769866943, "learning_rate": 3.932841835681893e-05, "loss": 0.4463, "mean_token_accuracy": 0.8275046348571777, "step": 365 }, { "epoch": 1.150943396226415, "grad_norm": 0.18034741282463074, "learning_rate": 3.932344747825208e-05, "loss": 0.4535, "mean_token_accuracy": 0.8258575201034546, "step": 366 }, { "epoch": 1.1540880503144655, "grad_norm": 0.166525736451149, "learning_rate": 3.931845862317887e-05, "loss": 0.4585, "mean_token_accuracy": 0.8256432414054871, "step": 367 }, { "epoch": 1.1572327044025157, "grad_norm": 0.1865280121564865, "learning_rate": 3.931345179677622e-05, "loss": 0.4384, "mean_token_accuracy": 0.8310089111328125, "step": 368 }, { "epoch": 1.1603773584905661, "grad_norm": 0.15174388885498047, "learning_rate": 3.930842700423971e-05, "loss": 0.4501, "mean_token_accuracy": 0.8273698091506958, "step": 369 }, { "epoch": 1.1635220125786163, "grad_norm": 0.19328919053077698, "learning_rate": 3.9303384250783555e-05, "loss": 0.4473, "mean_token_accuracy": 0.8262863755226135, "step": 370 }, { "epoch": 1.1666666666666667, "grad_norm": 0.19845743477344513, "learning_rate": 3.929832354164061e-05, "loss": 0.4517, "mean_token_accuracy": 0.8262023329734802, "step": 371 }, { "epoch": 1.169811320754717, "grad_norm": 0.18804894387722015, "learning_rate": 3.929324488206235e-05, "loss": 0.4416, "mean_token_accuracy": 0.8311719298362732, "step": 372 }, { "epoch": 1.1729559748427674, "grad_norm": 0.2016197293996811, "learning_rate": 3.928814827731892e-05, "loss": 0.4496, "mean_token_accuracy": 0.8284710049629211, "step": 373 }, { "epoch": 1.1761006289308176, "grad_norm": 0.1766403168439865, "learning_rate": 3.928303373269903e-05, "loss": 0.4613, "mean_token_accuracy": 0.826279878616333, "step": 374 }, { "epoch": 1.179245283018868, "grad_norm": 0.19094939529895782, "learning_rate": 3.927790125351004e-05, "loss": 0.446, "mean_token_accuracy": 0.8275184035301208, "step": 375 }, { "epoch": 1.1823899371069182, "grad_norm": 0.18953315913677216, "learning_rate": 3.927275084507791e-05, "loss": 0.4448, "mean_token_accuracy": 0.8282947540283203, "step": 376 }, { "epoch": 1.1855345911949686, "grad_norm": 0.19597028195858002, "learning_rate": 3.9267582512747214e-05, "loss": 0.4535, "mean_token_accuracy": 0.825032114982605, "step": 377 }, { "epoch": 1.1886792452830188, "grad_norm": 0.23007996380329132, "learning_rate": 3.926239626188112e-05, "loss": 0.4504, "mean_token_accuracy": 0.8239325284957886, "step": 378 }, { "epoch": 1.1918238993710693, "grad_norm": 0.15653976798057556, "learning_rate": 3.925719209786139e-05, "loss": 0.4599, "mean_token_accuracy": 0.8246617317199707, "step": 379 }, { "epoch": 1.1949685534591195, "grad_norm": 0.21206559240818024, "learning_rate": 3.925197002608837e-05, "loss": 0.4627, "mean_token_accuracy": 0.8227834701538086, "step": 380 }, { "epoch": 1.1981132075471699, "grad_norm": 0.15478071570396423, "learning_rate": 3.924673005198099e-05, "loss": 0.4491, "mean_token_accuracy": 0.8279451727867126, "step": 381 }, { "epoch": 1.20125786163522, "grad_norm": 0.19152425229549408, "learning_rate": 3.924147218097678e-05, "loss": 0.4454, "mean_token_accuracy": 0.8282057046890259, "step": 382 }, { "epoch": 1.2044025157232705, "grad_norm": 0.18045583367347717, "learning_rate": 3.923619641853179e-05, "loss": 0.4457, "mean_token_accuracy": 0.8302314877510071, "step": 383 }, { "epoch": 1.2075471698113207, "grad_norm": 0.17766040563583374, "learning_rate": 3.9230902770120705e-05, "loss": 0.4428, "mean_token_accuracy": 0.8286284804344177, "step": 384 }, { "epoch": 1.2106918238993711, "grad_norm": 0.21487727761268616, "learning_rate": 3.9225591241236706e-05, "loss": 0.4427, "mean_token_accuracy": 0.829619824886322, "step": 385 }, { "epoch": 1.2138364779874213, "grad_norm": 0.1838655024766922, "learning_rate": 3.922026183739156e-05, "loss": 0.4597, "mean_token_accuracy": 0.823946475982666, "step": 386 }, { "epoch": 1.2169811320754718, "grad_norm": 0.19127146899700165, "learning_rate": 3.921491456411559e-05, "loss": 0.45, "mean_token_accuracy": 0.8261735439300537, "step": 387 }, { "epoch": 1.220125786163522, "grad_norm": 0.22842063009738922, "learning_rate": 3.920954942695764e-05, "loss": 0.4478, "mean_token_accuracy": 0.8252214789390564, "step": 388 }, { "epoch": 1.2232704402515724, "grad_norm": 0.15844713151454926, "learning_rate": 3.9204166431485116e-05, "loss": 0.4488, "mean_token_accuracy": 0.8267233967781067, "step": 389 }, { "epoch": 1.2264150943396226, "grad_norm": 0.19568133354187012, "learning_rate": 3.9198765583283915e-05, "loss": 0.4478, "mean_token_accuracy": 0.8291775584220886, "step": 390 }, { "epoch": 1.229559748427673, "grad_norm": 0.1693885773420334, "learning_rate": 3.919334688795851e-05, "loss": 0.4669, "mean_token_accuracy": 0.8229762315750122, "step": 391 }, { "epoch": 1.2327044025157232, "grad_norm": 0.18789900839328766, "learning_rate": 3.918791035113187e-05, "loss": 0.4546, "mean_token_accuracy": 0.8253647685050964, "step": 392 }, { "epoch": 1.2358490566037736, "grad_norm": 0.16432693600654602, "learning_rate": 3.918245597844546e-05, "loss": 0.4563, "mean_token_accuracy": 0.8253912329673767, "step": 393 }, { "epoch": 1.2389937106918238, "grad_norm": 0.1900831013917923, "learning_rate": 3.9176983775559285e-05, "loss": 0.4435, "mean_token_accuracy": 0.8277944326400757, "step": 394 }, { "epoch": 1.2421383647798743, "grad_norm": 0.17624549567699432, "learning_rate": 3.9171493748151836e-05, "loss": 0.4594, "mean_token_accuracy": 0.8259245753288269, "step": 395 }, { "epoch": 1.2452830188679245, "grad_norm": 0.1721552014350891, "learning_rate": 3.91659859019201e-05, "loss": 0.464, "mean_token_accuracy": 0.8246681094169617, "step": 396 }, { "epoch": 1.248427672955975, "grad_norm": 0.17541447281837463, "learning_rate": 3.916046024257957e-05, "loss": 0.4527, "mean_token_accuracy": 0.8268889784812927, "step": 397 }, { "epoch": 1.251572327044025, "grad_norm": 0.17931130528450012, "learning_rate": 3.91549167758642e-05, "loss": 0.4416, "mean_token_accuracy": 0.8300866484642029, "step": 398 }, { "epoch": 1.2547169811320755, "grad_norm": 0.17346803843975067, "learning_rate": 3.914935550752643e-05, "loss": 0.4532, "mean_token_accuracy": 0.825555145740509, "step": 399 }, { "epoch": 1.2578616352201257, "grad_norm": 0.17272527515888214, "learning_rate": 3.91437764433372e-05, "loss": 0.4552, "mean_token_accuracy": 0.8265715837478638, "step": 400 }, { "epoch": 1.2610062893081762, "grad_norm": 0.18587706983089447, "learning_rate": 3.913817958908587e-05, "loss": 0.4512, "mean_token_accuracy": 0.8276833295822144, "step": 401 }, { "epoch": 1.2641509433962264, "grad_norm": 0.1864849030971527, "learning_rate": 3.9132564950580286e-05, "loss": 0.4439, "mean_token_accuracy": 0.8282519578933716, "step": 402 }, { "epoch": 1.2672955974842768, "grad_norm": 0.19797006249427795, "learning_rate": 3.9126932533646756e-05, "loss": 0.4452, "mean_token_accuracy": 0.8268986344337463, "step": 403 }, { "epoch": 1.270440251572327, "grad_norm": 0.2109120786190033, "learning_rate": 3.912128234413002e-05, "loss": 0.4516, "mean_token_accuracy": 0.8287456631660461, "step": 404 }, { "epoch": 1.2735849056603774, "grad_norm": 0.20502309501171112, "learning_rate": 3.9115614387893284e-05, "loss": 0.4585, "mean_token_accuracy": 0.8249683380126953, "step": 405 }, { "epoch": 1.2767295597484276, "grad_norm": 0.195379376411438, "learning_rate": 3.910992867081815e-05, "loss": 0.4478, "mean_token_accuracy": 0.8271308541297913, "step": 406 }, { "epoch": 1.279874213836478, "grad_norm": 0.19706301391124725, "learning_rate": 3.9104225198804697e-05, "loss": 0.4538, "mean_token_accuracy": 0.8262473344802856, "step": 407 }, { "epoch": 1.2830188679245282, "grad_norm": 0.22555312514305115, "learning_rate": 3.90985039777714e-05, "loss": 0.4541, "mean_token_accuracy": 0.8253132104873657, "step": 408 }, { "epoch": 1.2861635220125787, "grad_norm": 0.1870194524526596, "learning_rate": 3.909276501365515e-05, "loss": 0.4496, "mean_token_accuracy": 0.8271132111549377, "step": 409 }, { "epoch": 1.2893081761006289, "grad_norm": 0.20130424201488495, "learning_rate": 3.9087008312411266e-05, "loss": 0.4508, "mean_token_accuracy": 0.8274027705192566, "step": 410 }, { "epoch": 1.2924528301886793, "grad_norm": 0.19089150428771973, "learning_rate": 3.908123388001347e-05, "loss": 0.4437, "mean_token_accuracy": 0.8283129930496216, "step": 411 }, { "epoch": 1.2955974842767295, "grad_norm": 0.18513773381710052, "learning_rate": 3.907544172245386e-05, "loss": 0.4512, "mean_token_accuracy": 0.8256914615631104, "step": 412 }, { "epoch": 1.29874213836478, "grad_norm": 0.2461722195148468, "learning_rate": 3.906963184574297e-05, "loss": 0.4398, "mean_token_accuracy": 0.8291003108024597, "step": 413 }, { "epoch": 1.3018867924528301, "grad_norm": 0.16066083312034607, "learning_rate": 3.906380425590969e-05, "loss": 0.4622, "mean_token_accuracy": 0.8245102167129517, "step": 414 }, { "epoch": 1.3050314465408805, "grad_norm": 0.23007777333259583, "learning_rate": 3.905795895900129e-05, "loss": 0.4508, "mean_token_accuracy": 0.8270963430404663, "step": 415 }, { "epoch": 1.3081761006289307, "grad_norm": 0.1997261345386505, "learning_rate": 3.905209596108342e-05, "loss": 0.4428, "mean_token_accuracy": 0.8287739753723145, "step": 416 }, { "epoch": 1.3113207547169812, "grad_norm": 0.19289380311965942, "learning_rate": 3.904621526824011e-05, "loss": 0.4494, "mean_token_accuracy": 0.8278332948684692, "step": 417 }, { "epoch": 1.3144654088050314, "grad_norm": 0.20662692189216614, "learning_rate": 3.904031688657375e-05, "loss": 0.4481, "mean_token_accuracy": 0.828018844127655, "step": 418 }, { "epoch": 1.3176100628930818, "grad_norm": 0.17998790740966797, "learning_rate": 3.903440082220506e-05, "loss": 0.4627, "mean_token_accuracy": 0.8233857154846191, "step": 419 }, { "epoch": 1.320754716981132, "grad_norm": 0.18025898933410645, "learning_rate": 3.902846708127315e-05, "loss": 0.4435, "mean_token_accuracy": 0.8296754956245422, "step": 420 }, { "epoch": 1.3238993710691824, "grad_norm": 0.1748049110174179, "learning_rate": 3.902251566993543e-05, "loss": 0.4504, "mean_token_accuracy": 0.8265761137008667, "step": 421 }, { "epoch": 1.3270440251572326, "grad_norm": 0.1689056158065796, "learning_rate": 3.901654659436768e-05, "loss": 0.4475, "mean_token_accuracy": 0.8279913067817688, "step": 422 }, { "epoch": 1.330188679245283, "grad_norm": 0.18498939275741577, "learning_rate": 3.901055986076399e-05, "loss": 0.4612, "mean_token_accuracy": 0.8243342638015747, "step": 423 }, { "epoch": 1.3333333333333333, "grad_norm": 0.20584383606910706, "learning_rate": 3.900455547533679e-05, "loss": 0.4561, "mean_token_accuracy": 0.8269695043563843, "step": 424 }, { "epoch": 1.3364779874213837, "grad_norm": 0.19053617119789124, "learning_rate": 3.899853344431681e-05, "loss": 0.4503, "mean_token_accuracy": 0.8270792961120605, "step": 425 }, { "epoch": 1.3396226415094339, "grad_norm": 0.23964080214500427, "learning_rate": 3.8992493773953103e-05, "loss": 0.444, "mean_token_accuracy": 0.8272242546081543, "step": 426 }, { "epoch": 1.3427672955974843, "grad_norm": 0.17868532240390778, "learning_rate": 3.898643647051303e-05, "loss": 0.4478, "mean_token_accuracy": 0.8284726738929749, "step": 427 }, { "epoch": 1.3459119496855345, "grad_norm": 0.2097913771867752, "learning_rate": 3.8980361540282226e-05, "loss": 0.449, "mean_token_accuracy": 0.8262530565261841, "step": 428 }, { "epoch": 1.349056603773585, "grad_norm": 0.1811225712299347, "learning_rate": 3.8974268989564655e-05, "loss": 0.4539, "mean_token_accuracy": 0.8266646265983582, "step": 429 }, { "epoch": 1.3522012578616351, "grad_norm": 0.1976926028728485, "learning_rate": 3.896815882468253e-05, "loss": 0.4537, "mean_token_accuracy": 0.8261896967887878, "step": 430 }, { "epoch": 1.3553459119496856, "grad_norm": 0.17476551234722137, "learning_rate": 3.8962031051976356e-05, "loss": 0.45, "mean_token_accuracy": 0.8286691904067993, "step": 431 }, { "epoch": 1.3584905660377358, "grad_norm": 0.1752653270959854, "learning_rate": 3.895588567780491e-05, "loss": 0.4391, "mean_token_accuracy": 0.830578088760376, "step": 432 }, { "epoch": 1.3616352201257862, "grad_norm": 0.22124166786670685, "learning_rate": 3.894972270854525e-05, "loss": 0.4442, "mean_token_accuracy": 0.8274651169776917, "step": 433 }, { "epoch": 1.3647798742138364, "grad_norm": 0.16567222774028778, "learning_rate": 3.894354215059265e-05, "loss": 0.4586, "mean_token_accuracy": 0.8255594968795776, "step": 434 }, { "epoch": 1.3679245283018868, "grad_norm": 0.2491973489522934, "learning_rate": 3.893734401036069e-05, "loss": 0.4567, "mean_token_accuracy": 0.825359046459198, "step": 435 }, { "epoch": 1.371069182389937, "grad_norm": 0.16659407317638397, "learning_rate": 3.8931128294281154e-05, "loss": 0.4487, "mean_token_accuracy": 0.8270349502563477, "step": 436 }, { "epoch": 1.3742138364779874, "grad_norm": 0.21300457417964935, "learning_rate": 3.892489500880408e-05, "loss": 0.4584, "mean_token_accuracy": 0.8247126936912537, "step": 437 }, { "epoch": 1.3773584905660377, "grad_norm": 0.22345586121082306, "learning_rate": 3.891864416039772e-05, "loss": 0.4463, "mean_token_accuracy": 0.8268424868583679, "step": 438 }, { "epoch": 1.380503144654088, "grad_norm": 0.18328611552715302, "learning_rate": 3.89123757555486e-05, "loss": 0.441, "mean_token_accuracy": 0.8294317126274109, "step": 439 }, { "epoch": 1.3836477987421385, "grad_norm": 0.21173825860023499, "learning_rate": 3.89060898007614e-05, "loss": 0.4532, "mean_token_accuracy": 0.826548159122467, "step": 440 }, { "epoch": 1.3867924528301887, "grad_norm": 0.14746391773223877, "learning_rate": 3.889978630255907e-05, "loss": 0.4555, "mean_token_accuracy": 0.826274037361145, "step": 441 }, { "epoch": 1.389937106918239, "grad_norm": 0.2072405368089676, "learning_rate": 3.8893465267482705e-05, "loss": 0.4512, "mean_token_accuracy": 0.8269615173339844, "step": 442 }, { "epoch": 1.3930817610062893, "grad_norm": 0.18459253013134003, "learning_rate": 3.8887126702091665e-05, "loss": 0.4499, "mean_token_accuracy": 0.8274534940719604, "step": 443 }, { "epoch": 1.3962264150943398, "grad_norm": 0.1969769448041916, "learning_rate": 3.8880770612963436e-05, "loss": 0.4515, "mean_token_accuracy": 0.8273787498474121, "step": 444 }, { "epoch": 1.39937106918239, "grad_norm": 0.1519605964422226, "learning_rate": 3.887439700669373e-05, "loss": 0.4438, "mean_token_accuracy": 0.8282424211502075, "step": 445 }, { "epoch": 1.4025157232704402, "grad_norm": 0.2099246382713318, "learning_rate": 3.8868005889896434e-05, "loss": 0.4606, "mean_token_accuracy": 0.8256029486656189, "step": 446 }, { "epoch": 1.4056603773584906, "grad_norm": 0.16457189619541168, "learning_rate": 3.886159726920359e-05, "loss": 0.4593, "mean_token_accuracy": 0.8245981335639954, "step": 447 }, { "epoch": 1.408805031446541, "grad_norm": 0.2020101696252823, "learning_rate": 3.8855171151265415e-05, "loss": 0.4628, "mean_token_accuracy": 0.8243246674537659, "step": 448 }, { "epoch": 1.4119496855345912, "grad_norm": 0.1827785223722458, "learning_rate": 3.884872754275027e-05, "loss": 0.462, "mean_token_accuracy": 0.8254520297050476, "step": 449 }, { "epoch": 1.4150943396226414, "grad_norm": 0.1720219850540161, "learning_rate": 3.8842266450344676e-05, "loss": 0.4429, "mean_token_accuracy": 0.8271845579147339, "step": 450 }, { "epoch": 1.4182389937106918, "grad_norm": 0.1863139569759369, "learning_rate": 3.8835787880753305e-05, "loss": 0.4505, "mean_token_accuracy": 0.8274635076522827, "step": 451 }, { "epoch": 1.4213836477987423, "grad_norm": 0.16976246237754822, "learning_rate": 3.882929184069894e-05, "loss": 0.4666, "mean_token_accuracy": 0.8241984844207764, "step": 452 }, { "epoch": 1.4245283018867925, "grad_norm": 0.1592787802219391, "learning_rate": 3.882277833692253e-05, "loss": 0.4547, "mean_token_accuracy": 0.8265584111213684, "step": 453 }, { "epoch": 1.4276729559748427, "grad_norm": 0.18003453314304352, "learning_rate": 3.8816247376183105e-05, "loss": 0.456, "mean_token_accuracy": 0.8237389922142029, "step": 454 }, { "epoch": 1.430817610062893, "grad_norm": 0.15752705931663513, "learning_rate": 3.880969896525784e-05, "loss": 0.4535, "mean_token_accuracy": 0.8278123140335083, "step": 455 }, { "epoch": 1.4339622641509435, "grad_norm": 0.15204060077667236, "learning_rate": 3.8803133110942e-05, "loss": 0.4504, "mean_token_accuracy": 0.8270664215087891, "step": 456 }, { "epoch": 1.4371069182389937, "grad_norm": 0.1571756899356842, "learning_rate": 3.879654982004897e-05, "loss": 0.4558, "mean_token_accuracy": 0.8265605568885803, "step": 457 }, { "epoch": 1.440251572327044, "grad_norm": 0.1584654152393341, "learning_rate": 3.8789949099410206e-05, "loss": 0.4518, "mean_token_accuracy": 0.8270092606544495, "step": 458 }, { "epoch": 1.4433962264150944, "grad_norm": 0.15152078866958618, "learning_rate": 3.878333095587527e-05, "loss": 0.4461, "mean_token_accuracy": 0.8276041150093079, "step": 459 }, { "epoch": 1.4465408805031448, "grad_norm": 0.15868431329727173, "learning_rate": 3.87766953963118e-05, "loss": 0.4438, "mean_token_accuracy": 0.8289068937301636, "step": 460 }, { "epoch": 1.449685534591195, "grad_norm": 0.14925162494182587, "learning_rate": 3.8770042427605486e-05, "loss": 0.4513, "mean_token_accuracy": 0.8267138600349426, "step": 461 }, { "epoch": 1.4528301886792452, "grad_norm": 0.15520182251930237, "learning_rate": 3.876337205666011e-05, "loss": 0.4627, "mean_token_accuracy": 0.825510561466217, "step": 462 }, { "epoch": 1.4559748427672956, "grad_norm": 0.15460386872291565, "learning_rate": 3.875668429039751e-05, "loss": 0.4524, "mean_token_accuracy": 0.8280324935913086, "step": 463 }, { "epoch": 1.459119496855346, "grad_norm": 0.1757625788450241, "learning_rate": 3.8749979135757564e-05, "loss": 0.4476, "mean_token_accuracy": 0.8284898996353149, "step": 464 }, { "epoch": 1.4622641509433962, "grad_norm": 0.17886289954185486, "learning_rate": 3.874325659969819e-05, "loss": 0.4525, "mean_token_accuracy": 0.8280431628227234, "step": 465 }, { "epoch": 1.4654088050314464, "grad_norm": 0.17093369364738464, "learning_rate": 3.873651668919535e-05, "loss": 0.461, "mean_token_accuracy": 0.8263329267501831, "step": 466 }, { "epoch": 1.4685534591194969, "grad_norm": 0.1534358263015747, "learning_rate": 3.872975941124305e-05, "loss": 0.4662, "mean_token_accuracy": 0.8242008090019226, "step": 467 }, { "epoch": 1.4716981132075473, "grad_norm": 0.16316460072994232, "learning_rate": 3.8722984772853276e-05, "loss": 0.4401, "mean_token_accuracy": 0.8306291103363037, "step": 468 }, { "epoch": 1.4748427672955975, "grad_norm": 0.14488621056079865, "learning_rate": 3.8716192781056086e-05, "loss": 0.4529, "mean_token_accuracy": 0.8279455900192261, "step": 469 }, { "epoch": 1.4779874213836477, "grad_norm": 0.15546061098575592, "learning_rate": 3.870938344289951e-05, "loss": 0.4524, "mean_token_accuracy": 0.8272173404693604, "step": 470 }, { "epoch": 1.4811320754716981, "grad_norm": 0.14502915740013123, "learning_rate": 3.8702556765449564e-05, "loss": 0.4365, "mean_token_accuracy": 0.8309008479118347, "step": 471 }, { "epoch": 1.4842767295597485, "grad_norm": 0.14981509745121002, "learning_rate": 3.8695712755790296e-05, "loss": 0.4553, "mean_token_accuracy": 0.8265926241874695, "step": 472 }, { "epoch": 1.4874213836477987, "grad_norm": 0.1419067680835724, "learning_rate": 3.8688851421023724e-05, "loss": 0.4467, "mean_token_accuracy": 0.8291380405426025, "step": 473 }, { "epoch": 1.490566037735849, "grad_norm": 0.14329911768436432, "learning_rate": 3.868197276826983e-05, "loss": 0.4404, "mean_token_accuracy": 0.8298222422599792, "step": 474 }, { "epoch": 1.4937106918238994, "grad_norm": 0.14086921513080597, "learning_rate": 3.8675076804666574e-05, "loss": 0.4517, "mean_token_accuracy": 0.8272147178649902, "step": 475 }, { "epoch": 1.4968553459119498, "grad_norm": 0.13164976239204407, "learning_rate": 3.86681635373699e-05, "loss": 0.4543, "mean_token_accuracy": 0.8258587121963501, "step": 476 }, { "epoch": 1.5, "grad_norm": 0.14340752363204956, "learning_rate": 3.866123297355368e-05, "loss": 0.4547, "mean_token_accuracy": 0.8274818062782288, "step": 477 }, { "epoch": 1.5031446540880502, "grad_norm": 0.15563100576400757, "learning_rate": 3.865428512040975e-05, "loss": 0.4546, "mean_token_accuracy": 0.8245983123779297, "step": 478 }, { "epoch": 1.5062893081761006, "grad_norm": 0.15186692774295807, "learning_rate": 3.864731998514788e-05, "loss": 0.4471, "mean_token_accuracy": 0.8284518718719482, "step": 479 }, { "epoch": 1.509433962264151, "grad_norm": 0.1551363319158554, "learning_rate": 3.864033757499578e-05, "loss": 0.4506, "mean_token_accuracy": 0.8288803696632385, "step": 480 }, { "epoch": 1.5125786163522013, "grad_norm": 0.19744457304477692, "learning_rate": 3.863333789719908e-05, "loss": 0.4459, "mean_token_accuracy": 0.8297019600868225, "step": 481 }, { "epoch": 1.5157232704402515, "grad_norm": 0.1606452614068985, "learning_rate": 3.8626320959021336e-05, "loss": 0.4574, "mean_token_accuracy": 0.8262441754341125, "step": 482 }, { "epoch": 1.5188679245283019, "grad_norm": 0.14951243996620178, "learning_rate": 3.8619286767744e-05, "loss": 0.4503, "mean_token_accuracy": 0.8295007348060608, "step": 483 }, { "epoch": 1.5220125786163523, "grad_norm": 0.1706402450799942, "learning_rate": 3.8612235330666455e-05, "loss": 0.4522, "mean_token_accuracy": 0.8276010751724243, "step": 484 }, { "epoch": 1.5251572327044025, "grad_norm": 0.18848121166229248, "learning_rate": 3.860516665510595e-05, "loss": 0.4478, "mean_token_accuracy": 0.8272507190704346, "step": 485 }, { "epoch": 1.5283018867924527, "grad_norm": 0.18110370635986328, "learning_rate": 3.859808074839764e-05, "loss": 0.4633, "mean_token_accuracy": 0.8253755569458008, "step": 486 }, { "epoch": 1.5314465408805031, "grad_norm": 0.15408791601657867, "learning_rate": 3.859097761789455e-05, "loss": 0.4608, "mean_token_accuracy": 0.8244016766548157, "step": 487 }, { "epoch": 1.5345911949685536, "grad_norm": 0.209492027759552, "learning_rate": 3.858385727096759e-05, "loss": 0.4536, "mean_token_accuracy": 0.8256571292877197, "step": 488 }, { "epoch": 1.5377358490566038, "grad_norm": 0.13831724226474762, "learning_rate": 3.8576719715005534e-05, "loss": 0.4471, "mean_token_accuracy": 0.8289450407028198, "step": 489 }, { "epoch": 1.540880503144654, "grad_norm": 0.1807355135679245, "learning_rate": 3.856956495741501e-05, "loss": 0.457, "mean_token_accuracy": 0.8249189853668213, "step": 490 }, { "epoch": 1.5440251572327044, "grad_norm": 0.14947283267974854, "learning_rate": 3.856239300562047e-05, "loss": 0.4517, "mean_token_accuracy": 0.827312171459198, "step": 491 }, { "epoch": 1.5471698113207548, "grad_norm": 0.1528450846672058, "learning_rate": 3.855520386706427e-05, "loss": 0.4469, "mean_token_accuracy": 0.8281989097595215, "step": 492 }, { "epoch": 1.550314465408805, "grad_norm": 0.1903686374425888, "learning_rate": 3.854799754920654e-05, "loss": 0.4517, "mean_token_accuracy": 0.8267768025398254, "step": 493 }, { "epoch": 1.5534591194968552, "grad_norm": 0.14876671135425568, "learning_rate": 3.854077405952527e-05, "loss": 0.4471, "mean_token_accuracy": 0.8272234797477722, "step": 494 }, { "epoch": 1.5566037735849056, "grad_norm": 0.15654216706752777, "learning_rate": 3.853353340551626e-05, "loss": 0.4426, "mean_token_accuracy": 0.8292421698570251, "step": 495 }, { "epoch": 1.559748427672956, "grad_norm": 0.15252597630023956, "learning_rate": 3.852627559469313e-05, "loss": 0.4466, "mean_token_accuracy": 0.828421413898468, "step": 496 }, { "epoch": 1.5628930817610063, "grad_norm": 0.15407709777355194, "learning_rate": 3.8519000634587274e-05, "loss": 0.4522, "mean_token_accuracy": 0.8288334608078003, "step": 497 }, { "epoch": 1.5660377358490565, "grad_norm": 0.15139752626419067, "learning_rate": 3.851170853274793e-05, "loss": 0.4591, "mean_token_accuracy": 0.8242106437683105, "step": 498 }, { "epoch": 1.569182389937107, "grad_norm": 0.16278305649757385, "learning_rate": 3.8504399296742076e-05, "loss": 0.4536, "mean_token_accuracy": 0.8283582329750061, "step": 499 }, { "epoch": 1.5723270440251573, "grad_norm": 0.21534794569015503, "learning_rate": 3.84970729341545e-05, "loss": 0.4464, "mean_token_accuracy": 0.8293619155883789, "step": 500 }, { "epoch": 1.5754716981132075, "grad_norm": 0.16945937275886536, "learning_rate": 3.848972945258776e-05, "loss": 0.4366, "mean_token_accuracy": 0.8301461338996887, "step": 501 }, { "epoch": 1.5786163522012577, "grad_norm": 0.19320879876613617, "learning_rate": 3.8482368859662156e-05, "loss": 0.4546, "mean_token_accuracy": 0.8255838751792908, "step": 502 }, { "epoch": 1.5817610062893082, "grad_norm": 0.16316726803779602, "learning_rate": 3.847499116301577e-05, "loss": 0.4535, "mean_token_accuracy": 0.8292089104652405, "step": 503 }, { "epoch": 1.5849056603773586, "grad_norm": 0.20331525802612305, "learning_rate": 3.846759637030443e-05, "loss": 0.4455, "mean_token_accuracy": 0.8304234743118286, "step": 504 }, { "epoch": 1.5880503144654088, "grad_norm": 0.14327698945999146, "learning_rate": 3.846018448920168e-05, "loss": 0.4388, "mean_token_accuracy": 0.8320124745368958, "step": 505 }, { "epoch": 1.591194968553459, "grad_norm": 0.1536184698343277, "learning_rate": 3.845275552739883e-05, "loss": 0.4611, "mean_token_accuracy": 0.8233290910720825, "step": 506 }, { "epoch": 1.5943396226415094, "grad_norm": 0.17109455168247223, "learning_rate": 3.844530949260489e-05, "loss": 0.4581, "mean_token_accuracy": 0.8267108798027039, "step": 507 }, { "epoch": 1.5974842767295598, "grad_norm": 0.1569495052099228, "learning_rate": 3.8437846392546603e-05, "loss": 0.454, "mean_token_accuracy": 0.8257220387458801, "step": 508 }, { "epoch": 1.60062893081761, "grad_norm": 0.15163108706474304, "learning_rate": 3.84303662349684e-05, "loss": 0.4459, "mean_token_accuracy": 0.8274716734886169, "step": 509 }, { "epoch": 1.6037735849056602, "grad_norm": 0.15992477536201477, "learning_rate": 3.842286902763245e-05, "loss": 0.4506, "mean_token_accuracy": 0.8289050459861755, "step": 510 }, { "epoch": 1.6069182389937107, "grad_norm": 0.16461218893527985, "learning_rate": 3.841535477831855e-05, "loss": 0.4523, "mean_token_accuracy": 0.8275653123855591, "step": 511 }, { "epoch": 1.610062893081761, "grad_norm": 0.15126198530197144, "learning_rate": 3.840782349482426e-05, "loss": 0.4442, "mean_token_accuracy": 0.8301685452461243, "step": 512 }, { "epoch": 1.6132075471698113, "grad_norm": 0.19573630392551422, "learning_rate": 3.840027518496475e-05, "loss": 0.4575, "mean_token_accuracy": 0.8253160715103149, "step": 513 }, { "epoch": 1.6163522012578615, "grad_norm": 0.16733072698116302, "learning_rate": 3.8392709856572904e-05, "loss": 0.4481, "mean_token_accuracy": 0.8281794190406799, "step": 514 }, { "epoch": 1.619496855345912, "grad_norm": 0.1560571938753128, "learning_rate": 3.838512751749924e-05, "loss": 0.4472, "mean_token_accuracy": 0.8281599879264832, "step": 515 }, { "epoch": 1.6226415094339623, "grad_norm": 0.17163477838039398, "learning_rate": 3.837752817561194e-05, "loss": 0.45, "mean_token_accuracy": 0.8275222778320312, "step": 516 }, { "epoch": 1.6257861635220126, "grad_norm": 0.16268257796764374, "learning_rate": 3.8369911838796816e-05, "loss": 0.4483, "mean_token_accuracy": 0.8267813920974731, "step": 517 }, { "epoch": 1.6289308176100628, "grad_norm": 0.16738560795783997, "learning_rate": 3.8362278514957336e-05, "loss": 0.4375, "mean_token_accuracy": 0.8297300934791565, "step": 518 }, { "epoch": 1.6320754716981132, "grad_norm": 0.16072408854961395, "learning_rate": 3.8354628212014587e-05, "loss": 0.4417, "mean_token_accuracy": 0.8280481100082397, "step": 519 }, { "epoch": 1.6352201257861636, "grad_norm": 0.17322075366973877, "learning_rate": 3.8346960937907264e-05, "loss": 0.4569, "mean_token_accuracy": 0.8253117799758911, "step": 520 }, { "epoch": 1.6383647798742138, "grad_norm": 0.1540379822254181, "learning_rate": 3.833927670059168e-05, "loss": 0.4352, "mean_token_accuracy": 0.8304820656776428, "step": 521 }, { "epoch": 1.641509433962264, "grad_norm": 0.20093458890914917, "learning_rate": 3.833157550804176e-05, "loss": 0.4502, "mean_token_accuracy": 0.8276907801628113, "step": 522 }, { "epoch": 1.6446540880503144, "grad_norm": 0.17706024646759033, "learning_rate": 3.8323857368249014e-05, "loss": 0.4472, "mean_token_accuracy": 0.8285207748413086, "step": 523 }, { "epoch": 1.6477987421383649, "grad_norm": 0.2173389196395874, "learning_rate": 3.8316122289222535e-05, "loss": 0.4482, "mean_token_accuracy": 0.8268563747406006, "step": 524 }, { "epoch": 1.650943396226415, "grad_norm": 0.14905473589897156, "learning_rate": 3.8308370278989e-05, "loss": 0.445, "mean_token_accuracy": 0.8302363157272339, "step": 525 }, { "epoch": 1.6540880503144653, "grad_norm": 0.1895207017660141, "learning_rate": 3.8300601345592675e-05, "loss": 0.455, "mean_token_accuracy": 0.8297334909439087, "step": 526 }, { "epoch": 1.6572327044025157, "grad_norm": 0.190410777926445, "learning_rate": 3.829281549709533e-05, "loss": 0.4456, "mean_token_accuracy": 0.8273657560348511, "step": 527 }, { "epoch": 1.6603773584905661, "grad_norm": 0.15508361160755157, "learning_rate": 3.828501274157635e-05, "loss": 0.4444, "mean_token_accuracy": 0.8293198347091675, "step": 528 }, { "epoch": 1.6635220125786163, "grad_norm": 0.17064864933490753, "learning_rate": 3.8277193087132634e-05, "loss": 0.4534, "mean_token_accuracy": 0.8277645707130432, "step": 529 }, { "epoch": 1.6666666666666665, "grad_norm": 0.18299253284931183, "learning_rate": 3.826935654187861e-05, "loss": 0.4529, "mean_token_accuracy": 0.8259576559066772, "step": 530 }, { "epoch": 1.669811320754717, "grad_norm": 0.14436650276184082, "learning_rate": 3.826150311394627e-05, "loss": 0.4424, "mean_token_accuracy": 0.8301630020141602, "step": 531 }, { "epoch": 1.6729559748427674, "grad_norm": 0.16234935820102692, "learning_rate": 3.825363281148507e-05, "loss": 0.4414, "mean_token_accuracy": 0.8282803297042847, "step": 532 }, { "epoch": 1.6761006289308176, "grad_norm": 0.1424507051706314, "learning_rate": 3.8245745642662025e-05, "loss": 0.45, "mean_token_accuracy": 0.8285747170448303, "step": 533 }, { "epoch": 1.6792452830188678, "grad_norm": 0.17243961989879608, "learning_rate": 3.8237841615661636e-05, "loss": 0.4519, "mean_token_accuracy": 0.8274285197257996, "step": 534 }, { "epoch": 1.6823899371069182, "grad_norm": 0.15060077607631683, "learning_rate": 3.8229920738685886e-05, "loss": 0.4485, "mean_token_accuracy": 0.827806293964386, "step": 535 }, { "epoch": 1.6855345911949686, "grad_norm": 0.16962994635105133, "learning_rate": 3.8221983019954254e-05, "loss": 0.4526, "mean_token_accuracy": 0.8264437317848206, "step": 536 }, { "epoch": 1.6886792452830188, "grad_norm": 0.16717560589313507, "learning_rate": 3.82140284677037e-05, "loss": 0.4453, "mean_token_accuracy": 0.8285322785377502, "step": 537 }, { "epoch": 1.691823899371069, "grad_norm": 0.15772481262683868, "learning_rate": 3.820605709018865e-05, "loss": 0.4506, "mean_token_accuracy": 0.8271482586860657, "step": 538 }, { "epoch": 1.6949685534591195, "grad_norm": 0.17806661128997803, "learning_rate": 3.819806889568098e-05, "loss": 0.4482, "mean_token_accuracy": 0.8278267979621887, "step": 539 }, { "epoch": 1.6981132075471699, "grad_norm": 0.18837103247642517, "learning_rate": 3.819006389247002e-05, "loss": 0.4491, "mean_token_accuracy": 0.8294506669044495, "step": 540 }, { "epoch": 1.70125786163522, "grad_norm": 0.18263404071331024, "learning_rate": 3.8182042088862555e-05, "loss": 0.4549, "mean_token_accuracy": 0.8264862895011902, "step": 541 }, { "epoch": 1.7044025157232703, "grad_norm": 0.17201568186283112, "learning_rate": 3.8174003493182784e-05, "loss": 0.4479, "mean_token_accuracy": 0.826499879360199, "step": 542 }, { "epoch": 1.7075471698113207, "grad_norm": 0.2106884866952896, "learning_rate": 3.816594811377235e-05, "loss": 0.448, "mean_token_accuracy": 0.8292680382728577, "step": 543 }, { "epoch": 1.7106918238993711, "grad_norm": 0.17805208265781403, "learning_rate": 3.81578759589903e-05, "loss": 0.451, "mean_token_accuracy": 0.8258050680160522, "step": 544 }, { "epoch": 1.7138364779874213, "grad_norm": 0.20148631930351257, "learning_rate": 3.814978703721309e-05, "loss": 0.4491, "mean_token_accuracy": 0.8281079530715942, "step": 545 }, { "epoch": 1.7169811320754715, "grad_norm": 0.2121734768152237, "learning_rate": 3.8141681356834587e-05, "loss": 0.4503, "mean_token_accuracy": 0.8274951577186584, "step": 546 }, { "epoch": 1.720125786163522, "grad_norm": 0.16927511990070343, "learning_rate": 3.813355892626603e-05, "loss": 0.4358, "mean_token_accuracy": 0.8312507271766663, "step": 547 }, { "epoch": 1.7232704402515724, "grad_norm": 0.2374507337808609, "learning_rate": 3.8125419753936055e-05, "loss": 0.4433, "mean_token_accuracy": 0.8288567662239075, "step": 548 }, { "epoch": 1.7264150943396226, "grad_norm": 0.14739219844341278, "learning_rate": 3.8117263848290656e-05, "loss": 0.4432, "mean_token_accuracy": 0.8277235627174377, "step": 549 }, { "epoch": 1.7295597484276728, "grad_norm": 0.22958379983901978, "learning_rate": 3.810909121779321e-05, "loss": 0.4522, "mean_token_accuracy": 0.8266805410385132, "step": 550 }, { "epoch": 1.7327044025157232, "grad_norm": 0.1478036642074585, "learning_rate": 3.810090187092443e-05, "loss": 0.4362, "mean_token_accuracy": 0.8309704065322876, "step": 551 }, { "epoch": 1.7358490566037736, "grad_norm": 0.2060670554637909, "learning_rate": 3.809269581618239e-05, "loss": 0.4463, "mean_token_accuracy": 0.8310072422027588, "step": 552 }, { "epoch": 1.7389937106918238, "grad_norm": 0.157624289393425, "learning_rate": 3.80844730620825e-05, "loss": 0.4489, "mean_token_accuracy": 0.82773357629776, "step": 553 }, { "epoch": 1.742138364779874, "grad_norm": 0.18612465262413025, "learning_rate": 3.8076233617157486e-05, "loss": 0.4506, "mean_token_accuracy": 0.8275358080863953, "step": 554 }, { "epoch": 1.7452830188679245, "grad_norm": 0.15709009766578674, "learning_rate": 3.806797748995741e-05, "loss": 0.452, "mean_token_accuracy": 0.829462468624115, "step": 555 }, { "epoch": 1.748427672955975, "grad_norm": 0.17328409850597382, "learning_rate": 3.805970468904964e-05, "loss": 0.4439, "mean_token_accuracy": 0.8290309906005859, "step": 556 }, { "epoch": 1.751572327044025, "grad_norm": 0.17372116446495056, "learning_rate": 3.805141522301884e-05, "loss": 0.4524, "mean_token_accuracy": 0.8271963000297546, "step": 557 }, { "epoch": 1.7547169811320755, "grad_norm": 0.161929190158844, "learning_rate": 3.804310910046697e-05, "loss": 0.4399, "mean_token_accuracy": 0.8296563029289246, "step": 558 }, { "epoch": 1.757861635220126, "grad_norm": 0.1848653256893158, "learning_rate": 3.803478633001328e-05, "loss": 0.4576, "mean_token_accuracy": 0.8277556896209717, "step": 559 }, { "epoch": 1.7610062893081762, "grad_norm": 0.1677953600883484, "learning_rate": 3.8026446920294295e-05, "loss": 0.4554, "mean_token_accuracy": 0.8266874551773071, "step": 560 }, { "epoch": 1.7641509433962264, "grad_norm": 0.15664350986480713, "learning_rate": 3.80180908799638e-05, "loss": 0.4499, "mean_token_accuracy": 0.8271031379699707, "step": 561 }, { "epoch": 1.7672955974842768, "grad_norm": 0.1716042309999466, "learning_rate": 3.800971821769284e-05, "loss": 0.4477, "mean_token_accuracy": 0.8271938562393188, "step": 562 }, { "epoch": 1.7704402515723272, "grad_norm": 0.15244047343730927, "learning_rate": 3.800132894216971e-05, "loss": 0.4373, "mean_token_accuracy": 0.8322244882583618, "step": 563 }, { "epoch": 1.7735849056603774, "grad_norm": 0.1561097949743271, "learning_rate": 3.799292306209995e-05, "loss": 0.4342, "mean_token_accuracy": 0.8296236395835876, "step": 564 }, { "epoch": 1.7767295597484276, "grad_norm": 0.13890111446380615, "learning_rate": 3.7984500586206324e-05, "loss": 0.4552, "mean_token_accuracy": 0.825447678565979, "step": 565 }, { "epoch": 1.779874213836478, "grad_norm": 0.16650985181331635, "learning_rate": 3.79760615232288e-05, "loss": 0.4474, "mean_token_accuracy": 0.8286757469177246, "step": 566 }, { "epoch": 1.7830188679245285, "grad_norm": 0.14276254177093506, "learning_rate": 3.7967605881924605e-05, "loss": 0.4608, "mean_token_accuracy": 0.8258764147758484, "step": 567 }, { "epoch": 1.7861635220125787, "grad_norm": 0.13229092955589294, "learning_rate": 3.795913367106812e-05, "loss": 0.4489, "mean_token_accuracy": 0.8294503688812256, "step": 568 }, { "epoch": 1.7893081761006289, "grad_norm": 0.15833866596221924, "learning_rate": 3.795064489945095e-05, "loss": 0.4442, "mean_token_accuracy": 0.829393744468689, "step": 569 }, { "epoch": 1.7924528301886793, "grad_norm": 0.14272266626358032, "learning_rate": 3.7942139575881875e-05, "loss": 0.4488, "mean_token_accuracy": 0.8297043442726135, "step": 570 }, { "epoch": 1.7955974842767297, "grad_norm": 0.15338532626628876, "learning_rate": 3.7933617709186845e-05, "loss": 0.4637, "mean_token_accuracy": 0.8241843581199646, "step": 571 }, { "epoch": 1.79874213836478, "grad_norm": 0.16218815743923187, "learning_rate": 3.7925079308209e-05, "loss": 0.4581, "mean_token_accuracy": 0.8253941535949707, "step": 572 }, { "epoch": 1.8018867924528301, "grad_norm": 0.1723911464214325, "learning_rate": 3.7916524381808606e-05, "loss": 0.4485, "mean_token_accuracy": 0.8298059701919556, "step": 573 }, { "epoch": 1.8050314465408805, "grad_norm": 0.1315835565328598, "learning_rate": 3.7907952938863095e-05, "loss": 0.4409, "mean_token_accuracy": 0.8307161331176758, "step": 574 }, { "epoch": 1.808176100628931, "grad_norm": 0.1815989911556244, "learning_rate": 3.7899364988267045e-05, "loss": 0.4533, "mean_token_accuracy": 0.8261933326721191, "step": 575 }, { "epoch": 1.8113207547169812, "grad_norm": 0.13026003539562225, "learning_rate": 3.789076053893214e-05, "loss": 0.4425, "mean_token_accuracy": 0.8303501009941101, "step": 576 }, { "epoch": 1.8144654088050314, "grad_norm": 0.15672063827514648, "learning_rate": 3.788213959978722e-05, "loss": 0.4492, "mean_token_accuracy": 0.8289505839347839, "step": 577 }, { "epoch": 1.8176100628930818, "grad_norm": 0.1376017928123474, "learning_rate": 3.7873502179778204e-05, "loss": 0.4531, "mean_token_accuracy": 0.8275585174560547, "step": 578 }, { "epoch": 1.8207547169811322, "grad_norm": 0.16440635919570923, "learning_rate": 3.786484828786812e-05, "loss": 0.4357, "mean_token_accuracy": 0.8302841782569885, "step": 579 }, { "epoch": 1.8238993710691824, "grad_norm": 0.15636101365089417, "learning_rate": 3.78561779330371e-05, "loss": 0.4612, "mean_token_accuracy": 0.8249790072441101, "step": 580 }, { "epoch": 1.8270440251572326, "grad_norm": 0.14285589754581451, "learning_rate": 3.7847491124282354e-05, "loss": 0.4456, "mean_token_accuracy": 0.828575611114502, "step": 581 }, { "epoch": 1.830188679245283, "grad_norm": 0.16128715872764587, "learning_rate": 3.783878787061817e-05, "loss": 0.4499, "mean_token_accuracy": 0.8283258676528931, "step": 582 }, { "epoch": 1.8333333333333335, "grad_norm": 0.14683718979358673, "learning_rate": 3.783006818107589e-05, "loss": 0.4493, "mean_token_accuracy": 0.8294415473937988, "step": 583 }, { "epoch": 1.8364779874213837, "grad_norm": 0.17366930842399597, "learning_rate": 3.782133206470392e-05, "loss": 0.4569, "mean_token_accuracy": 0.8255544900894165, "step": 584 }, { "epoch": 1.8396226415094339, "grad_norm": 0.18530210852622986, "learning_rate": 3.781257953056771e-05, "loss": 0.4469, "mean_token_accuracy": 0.8293200731277466, "step": 585 }, { "epoch": 1.8427672955974843, "grad_norm": 0.1758362352848053, "learning_rate": 3.780381058774975e-05, "loss": 0.4444, "mean_token_accuracy": 0.8289125561714172, "step": 586 }, { "epoch": 1.8459119496855347, "grad_norm": 0.17143899202346802, "learning_rate": 3.779502524534955e-05, "loss": 0.4437, "mean_token_accuracy": 0.8296340703964233, "step": 587 }, { "epoch": 1.849056603773585, "grad_norm": 0.15893438458442688, "learning_rate": 3.778622351248365e-05, "loss": 0.461, "mean_token_accuracy": 0.8271942734718323, "step": 588 }, { "epoch": 1.8522012578616351, "grad_norm": 0.15634989738464355, "learning_rate": 3.777740539828558e-05, "loss": 0.4646, "mean_token_accuracy": 0.8230259418487549, "step": 589 }, { "epoch": 1.8553459119496856, "grad_norm": 0.15147466957569122, "learning_rate": 3.776857091190588e-05, "loss": 0.4628, "mean_token_accuracy": 0.8259610533714294, "step": 590 }, { "epoch": 1.858490566037736, "grad_norm": 0.1858360767364502, "learning_rate": 3.775972006251209e-05, "loss": 0.4608, "mean_token_accuracy": 0.8260266184806824, "step": 591 }, { "epoch": 1.8616352201257862, "grad_norm": 0.1573314070701599, "learning_rate": 3.77508528592887e-05, "loss": 0.4638, "mean_token_accuracy": 0.8259332180023193, "step": 592 }, { "epoch": 1.8647798742138364, "grad_norm": 0.1616523116827011, "learning_rate": 3.7741969311437203e-05, "loss": 0.4354, "mean_token_accuracy": 0.8312195539474487, "step": 593 }, { "epoch": 1.8679245283018868, "grad_norm": 0.161224365234375, "learning_rate": 3.7733069428176044e-05, "loss": 0.4606, "mean_token_accuracy": 0.8260555267333984, "step": 594 }, { "epoch": 1.8710691823899372, "grad_norm": 0.14999954402446747, "learning_rate": 3.772415321874059e-05, "loss": 0.4526, "mean_token_accuracy": 0.829216480255127, "step": 595 }, { "epoch": 1.8742138364779874, "grad_norm": 0.13627400994300842, "learning_rate": 3.7715220692383206e-05, "loss": 0.4451, "mean_token_accuracy": 0.8285694718360901, "step": 596 }, { "epoch": 1.8773584905660377, "grad_norm": 0.1767715960741043, "learning_rate": 3.770627185837313e-05, "loss": 0.4477, "mean_token_accuracy": 0.8299650549888611, "step": 597 }, { "epoch": 1.880503144654088, "grad_norm": 0.17853030562400818, "learning_rate": 3.769730672599655e-05, "loss": 0.4455, "mean_token_accuracy": 0.8283320665359497, "step": 598 }, { "epoch": 1.8836477987421385, "grad_norm": 0.1425485908985138, "learning_rate": 3.768832530455658e-05, "loss": 0.4537, "mean_token_accuracy": 0.8282998204231262, "step": 599 }, { "epoch": 1.8867924528301887, "grad_norm": 0.14554312825202942, "learning_rate": 3.7679327603373224e-05, "loss": 0.4365, "mean_token_accuracy": 0.830973744392395, "step": 600 }, { "epoch": 1.889937106918239, "grad_norm": 0.14549976587295532, "learning_rate": 3.767031363178336e-05, "loss": 0.4476, "mean_token_accuracy": 0.8286392092704773, "step": 601 }, { "epoch": 1.8930817610062893, "grad_norm": 0.1506306231021881, "learning_rate": 3.766128339914079e-05, "loss": 0.4344, "mean_token_accuracy": 0.8309949636459351, "step": 602 }, { "epoch": 1.8962264150943398, "grad_norm": 0.18175862729549408, "learning_rate": 3.765223691481615e-05, "loss": 0.4414, "mean_token_accuracy": 0.8330844044685364, "step": 603 }, { "epoch": 1.89937106918239, "grad_norm": 0.14774861931800842, "learning_rate": 3.764317418819696e-05, "loss": 0.447, "mean_token_accuracy": 0.8281767964363098, "step": 604 }, { "epoch": 1.9025157232704402, "grad_norm": 0.2164774239063263, "learning_rate": 3.7634095228687606e-05, "loss": 0.449, "mean_token_accuracy": 0.827027440071106, "step": 605 }, { "epoch": 1.9056603773584906, "grad_norm": 0.1532975137233734, "learning_rate": 3.762500004570929e-05, "loss": 0.439, "mean_token_accuracy": 0.8315529823303223, "step": 606 }, { "epoch": 1.908805031446541, "grad_norm": 0.17712901532649994, "learning_rate": 3.761588864870009e-05, "loss": 0.4586, "mean_token_accuracy": 0.8272659182548523, "step": 607 }, { "epoch": 1.9119496855345912, "grad_norm": 0.1497354805469513, "learning_rate": 3.760676104711485e-05, "loss": 0.4487, "mean_token_accuracy": 0.8286811709403992, "step": 608 }, { "epoch": 1.9150943396226414, "grad_norm": 0.16662414371967316, "learning_rate": 3.759761725042529e-05, "loss": 0.4597, "mean_token_accuracy": 0.8276802897453308, "step": 609 }, { "epoch": 1.9182389937106918, "grad_norm": 0.17103828489780426, "learning_rate": 3.75884572681199e-05, "loss": 0.4465, "mean_token_accuracy": 0.8278771638870239, "step": 610 }, { "epoch": 1.9213836477987423, "grad_norm": 0.18412946164608002, "learning_rate": 3.7579281109703976e-05, "loss": 0.4396, "mean_token_accuracy": 0.8283287882804871, "step": 611 }, { "epoch": 1.9245283018867925, "grad_norm": 0.21794135868549347, "learning_rate": 3.757008878469959e-05, "loss": 0.4531, "mean_token_accuracy": 0.8257644772529602, "step": 612 }, { "epoch": 1.9276729559748427, "grad_norm": 0.16859892010688782, "learning_rate": 3.756088030264562e-05, "loss": 0.4477, "mean_token_accuracy": 0.8286134600639343, "step": 613 }, { "epoch": 1.930817610062893, "grad_norm": 0.16001489758491516, "learning_rate": 3.7551655673097664e-05, "loss": 0.4477, "mean_token_accuracy": 0.8258501887321472, "step": 614 }, { "epoch": 1.9339622641509435, "grad_norm": 0.16016210615634918, "learning_rate": 3.7542414905628125e-05, "loss": 0.4464, "mean_token_accuracy": 0.8278330564498901, "step": 615 }, { "epoch": 1.9371069182389937, "grad_norm": 0.16711488366127014, "learning_rate": 3.753315800982611e-05, "loss": 0.4529, "mean_token_accuracy": 0.8275772333145142, "step": 616 }, { "epoch": 1.940251572327044, "grad_norm": 0.14468665421009064, "learning_rate": 3.75238849952975e-05, "loss": 0.4498, "mean_token_accuracy": 0.8276384472846985, "step": 617 }, { "epoch": 1.9433962264150944, "grad_norm": 0.16502094268798828, "learning_rate": 3.751459587166486e-05, "loss": 0.4418, "mean_token_accuracy": 0.8288312554359436, "step": 618 }, { "epoch": 1.9465408805031448, "grad_norm": 0.15773820877075195, "learning_rate": 3.750529064856752e-05, "loss": 0.4467, "mean_token_accuracy": 0.8294534683227539, "step": 619 }, { "epoch": 1.949685534591195, "grad_norm": 0.15135358273983002, "learning_rate": 3.749596933566146e-05, "loss": 0.4547, "mean_token_accuracy": 0.826626181602478, "step": 620 }, { "epoch": 1.9528301886792452, "grad_norm": 0.14081326127052307, "learning_rate": 3.748663194261942e-05, "loss": 0.4491, "mean_token_accuracy": 0.8284742832183838, "step": 621 }, { "epoch": 1.9559748427672956, "grad_norm": 0.14700694382190704, "learning_rate": 3.7477278479130774e-05, "loss": 0.4505, "mean_token_accuracy": 0.828678548336029, "step": 622 }, { "epoch": 1.959119496855346, "grad_norm": 0.14334739744663239, "learning_rate": 3.7467908954901607e-05, "loss": 0.4515, "mean_token_accuracy": 0.8276963829994202, "step": 623 }, { "epoch": 1.9622641509433962, "grad_norm": 0.13543659448623657, "learning_rate": 3.745852337965463e-05, "loss": 0.4372, "mean_token_accuracy": 0.8290265202522278, "step": 624 }, { "epoch": 1.9654088050314464, "grad_norm": 0.13767287135124207, "learning_rate": 3.744912176312926e-05, "loss": 0.4536, "mean_token_accuracy": 0.829569935798645, "step": 625 }, { "epoch": 1.9685534591194969, "grad_norm": 0.14214669167995453, "learning_rate": 3.743970411508153e-05, "loss": 0.4492, "mean_token_accuracy": 0.8299821019172668, "step": 626 }, { "epoch": 1.9716981132075473, "grad_norm": 0.1446889191865921, "learning_rate": 3.7430270445284113e-05, "loss": 0.4507, "mean_token_accuracy": 0.8302611112594604, "step": 627 }, { "epoch": 1.9748427672955975, "grad_norm": 0.1604827344417572, "learning_rate": 3.74208207635263e-05, "loss": 0.4503, "mean_token_accuracy": 0.8277794122695923, "step": 628 }, { "epoch": 1.9779874213836477, "grad_norm": 0.15117383003234863, "learning_rate": 3.741135507961402e-05, "loss": 0.4459, "mean_token_accuracy": 0.8283612132072449, "step": 629 }, { "epoch": 1.9811320754716981, "grad_norm": 0.152221217751503, "learning_rate": 3.740187340336978e-05, "loss": 0.4501, "mean_token_accuracy": 0.8274828791618347, "step": 630 }, { "epoch": 1.9842767295597485, "grad_norm": 0.15416257083415985, "learning_rate": 3.7392375744632705e-05, "loss": 0.4314, "mean_token_accuracy": 0.8337308764457703, "step": 631 }, { "epoch": 1.9874213836477987, "grad_norm": 0.15964354574680328, "learning_rate": 3.73828621132585e-05, "loss": 0.4477, "mean_token_accuracy": 0.8281906843185425, "step": 632 }, { "epoch": 1.990566037735849, "grad_norm": 0.176462784409523, "learning_rate": 3.737333251911943e-05, "loss": 0.458, "mean_token_accuracy": 0.8277231454849243, "step": 633 }, { "epoch": 1.9937106918238994, "grad_norm": 0.14236986637115479, "learning_rate": 3.736378697210435e-05, "loss": 0.4433, "mean_token_accuracy": 0.830397367477417, "step": 634 }, { "epoch": 1.9968553459119498, "grad_norm": 0.20474541187286377, "learning_rate": 3.735422548211865e-05, "loss": 0.4443, "mean_token_accuracy": 0.8304517269134521, "step": 635 }, { "epoch": 2.0, "grad_norm": 0.1430058777332306, "learning_rate": 3.7344648059084254e-05, "loss": 0.4409, "mean_token_accuracy": 0.8248613476753235, "step": 636 }, { "epoch": 2.00314465408805, "grad_norm": 0.2259976714849472, "learning_rate": 3.733505471293965e-05, "loss": 0.3875, "mean_token_accuracy": 0.8429651260375977, "step": 637 }, { "epoch": 2.006289308176101, "grad_norm": 0.7415010929107666, "learning_rate": 3.732544545363983e-05, "loss": 0.396, "mean_token_accuracy": 0.8418735861778259, "step": 638 }, { "epoch": 2.009433962264151, "grad_norm": 0.20141229033470154, "learning_rate": 3.73158202911563e-05, "loss": 0.3975, "mean_token_accuracy": 0.8419850468635559, "step": 639 }, { "epoch": 2.0125786163522013, "grad_norm": 0.29176196455955505, "learning_rate": 3.730617923547708e-05, "loss": 0.38, "mean_token_accuracy": 0.8438968062400818, "step": 640 }, { "epoch": 2.0157232704402515, "grad_norm": 0.19412219524383545, "learning_rate": 3.7296522296606663e-05, "loss": 0.3857, "mean_token_accuracy": 0.843908965587616, "step": 641 }, { "epoch": 2.018867924528302, "grad_norm": 0.33550146222114563, "learning_rate": 3.7286849484566046e-05, "loss": 0.3931, "mean_token_accuracy": 0.8432913422584534, "step": 642 }, { "epoch": 2.0220125786163523, "grad_norm": 0.2596316635608673, "learning_rate": 3.727716080939268e-05, "loss": 0.3962, "mean_token_accuracy": 0.841339111328125, "step": 643 }, { "epoch": 2.0251572327044025, "grad_norm": 0.2917231321334839, "learning_rate": 3.726745628114048e-05, "loss": 0.3957, "mean_token_accuracy": 0.8408085703849792, "step": 644 }, { "epoch": 2.0283018867924527, "grad_norm": 0.22424006462097168, "learning_rate": 3.725773590987982e-05, "loss": 0.3824, "mean_token_accuracy": 0.8433759808540344, "step": 645 }, { "epoch": 2.0314465408805034, "grad_norm": 0.31289535760879517, "learning_rate": 3.7247999705697515e-05, "loss": 0.3867, "mean_token_accuracy": 0.8449568152427673, "step": 646 }, { "epoch": 2.0345911949685536, "grad_norm": 0.28442052006721497, "learning_rate": 3.723824767869679e-05, "loss": 0.4006, "mean_token_accuracy": 0.8402043581008911, "step": 647 }, { "epoch": 2.0377358490566038, "grad_norm": 0.282670795917511, "learning_rate": 3.722847983899732e-05, "loss": 0.388, "mean_token_accuracy": 0.8409184217453003, "step": 648 }, { "epoch": 2.040880503144654, "grad_norm": 0.26842135190963745, "learning_rate": 3.7218696196735165e-05, "loss": 0.3722, "mean_token_accuracy": 0.8447781801223755, "step": 649 }, { "epoch": 2.0440251572327046, "grad_norm": 0.22568707168102264, "learning_rate": 3.720889676206279e-05, "loss": 0.3783, "mean_token_accuracy": 0.8424438834190369, "step": 650 }, { "epoch": 2.047169811320755, "grad_norm": 0.23483210802078247, "learning_rate": 3.719908154514904e-05, "loss": 0.4051, "mean_token_accuracy": 0.8413841128349304, "step": 651 }, { "epoch": 2.050314465408805, "grad_norm": 0.19200192391872406, "learning_rate": 3.7189250556179156e-05, "loss": 0.391, "mean_token_accuracy": 0.8443261384963989, "step": 652 }, { "epoch": 2.0534591194968552, "grad_norm": 0.2197944074869156, "learning_rate": 3.717940380535474e-05, "loss": 0.3733, "mean_token_accuracy": 0.8451419472694397, "step": 653 }, { "epoch": 2.056603773584906, "grad_norm": 0.15395909547805786, "learning_rate": 3.716954130289374e-05, "loss": 0.396, "mean_token_accuracy": 0.8405272364616394, "step": 654 }, { "epoch": 2.059748427672956, "grad_norm": 0.2504603862762451, "learning_rate": 3.7159663059030446e-05, "loss": 0.3819, "mean_token_accuracy": 0.8421579599380493, "step": 655 }, { "epoch": 2.0628930817610063, "grad_norm": 0.16238710284233093, "learning_rate": 3.7149769084015514e-05, "loss": 0.3874, "mean_token_accuracy": 0.8433874249458313, "step": 656 }, { "epoch": 2.0660377358490565, "grad_norm": 0.25545817613601685, "learning_rate": 3.713985938811588e-05, "loss": 0.3916, "mean_token_accuracy": 0.841235876083374, "step": 657 }, { "epoch": 2.069182389937107, "grad_norm": 0.2000405341386795, "learning_rate": 3.7129933981614816e-05, "loss": 0.3717, "mean_token_accuracy": 0.8454606533050537, "step": 658 }, { "epoch": 2.0723270440251573, "grad_norm": 0.2362697422504425, "learning_rate": 3.711999287481191e-05, "loss": 0.3962, "mean_token_accuracy": 0.840667724609375, "step": 659 }, { "epoch": 2.0754716981132075, "grad_norm": 0.21723434329032898, "learning_rate": 3.7110036078023024e-05, "loss": 0.3906, "mean_token_accuracy": 0.8431429266929626, "step": 660 }, { "epoch": 2.0786163522012577, "grad_norm": 0.17892718315124512, "learning_rate": 3.71000636015803e-05, "loss": 0.3848, "mean_token_accuracy": 0.8448073863983154, "step": 661 }, { "epoch": 2.0817610062893084, "grad_norm": 0.18332098424434662, "learning_rate": 3.709007545583217e-05, "loss": 0.393, "mean_token_accuracy": 0.8400642275810242, "step": 662 }, { "epoch": 2.0849056603773586, "grad_norm": 0.1824181079864502, "learning_rate": 3.708007165114329e-05, "loss": 0.398, "mean_token_accuracy": 0.8406155705451965, "step": 663 }, { "epoch": 2.088050314465409, "grad_norm": 0.18380111455917358, "learning_rate": 3.70700521978946e-05, "loss": 0.3918, "mean_token_accuracy": 0.8429200649261475, "step": 664 }, { "epoch": 2.091194968553459, "grad_norm": 0.1803388148546219, "learning_rate": 3.706001710648327e-05, "loss": 0.4006, "mean_token_accuracy": 0.8405814170837402, "step": 665 }, { "epoch": 2.0943396226415096, "grad_norm": 0.17774419486522675, "learning_rate": 3.7049966387322694e-05, "loss": 0.3817, "mean_token_accuracy": 0.842662513256073, "step": 666 }, { "epoch": 2.09748427672956, "grad_norm": 0.19682133197784424, "learning_rate": 3.703990005084248e-05, "loss": 0.3837, "mean_token_accuracy": 0.8443858027458191, "step": 667 }, { "epoch": 2.10062893081761, "grad_norm": 0.19912737607955933, "learning_rate": 3.702981810748844e-05, "loss": 0.3818, "mean_token_accuracy": 0.8457018136978149, "step": 668 }, { "epoch": 2.1037735849056602, "grad_norm": 0.17018529772758484, "learning_rate": 3.7019720567722595e-05, "loss": 0.375, "mean_token_accuracy": 0.8442913293838501, "step": 669 }, { "epoch": 2.106918238993711, "grad_norm": 0.18541787564754486, "learning_rate": 3.700960744202313e-05, "loss": 0.3678, "mean_token_accuracy": 0.8443611264228821, "step": 670 }, { "epoch": 2.110062893081761, "grad_norm": 0.15378877520561218, "learning_rate": 3.6999478740884435e-05, "loss": 0.3852, "mean_token_accuracy": 0.8438538312911987, "step": 671 }, { "epoch": 2.1132075471698113, "grad_norm": 0.1910746991634369, "learning_rate": 3.6989334474817026e-05, "loss": 0.3812, "mean_token_accuracy": 0.8418902158737183, "step": 672 }, { "epoch": 2.1163522012578615, "grad_norm": 0.15623098611831665, "learning_rate": 3.697917465434759e-05, "loss": 0.3934, "mean_token_accuracy": 0.8424828052520752, "step": 673 }, { "epoch": 2.119496855345912, "grad_norm": 0.17167270183563232, "learning_rate": 3.6968999290018953e-05, "loss": 0.3905, "mean_token_accuracy": 0.8438811898231506, "step": 674 }, { "epoch": 2.1226415094339623, "grad_norm": 0.1417340338230133, "learning_rate": 3.695880839239007e-05, "loss": 0.3953, "mean_token_accuracy": 0.8393605947494507, "step": 675 }, { "epoch": 2.1257861635220126, "grad_norm": 0.19678044319152832, "learning_rate": 3.694860197203602e-05, "loss": 0.3838, "mean_token_accuracy": 0.8427770137786865, "step": 676 }, { "epoch": 2.1289308176100628, "grad_norm": 0.16106431186199188, "learning_rate": 3.693838003954798e-05, "loss": 0.397, "mean_token_accuracy": 0.8402181267738342, "step": 677 }, { "epoch": 2.1320754716981134, "grad_norm": 0.15794852375984192, "learning_rate": 3.692814260553323e-05, "loss": 0.3899, "mean_token_accuracy": 0.8398270606994629, "step": 678 }, { "epoch": 2.1352201257861636, "grad_norm": 0.1485368013381958, "learning_rate": 3.691788968061512e-05, "loss": 0.3834, "mean_token_accuracy": 0.8433291912078857, "step": 679 }, { "epoch": 2.138364779874214, "grad_norm": 0.16451844573020935, "learning_rate": 3.690762127543312e-05, "loss": 0.388, "mean_token_accuracy": 0.8428193926811218, "step": 680 }, { "epoch": 2.141509433962264, "grad_norm": 0.1383827030658722, "learning_rate": 3.6897337400642706e-05, "loss": 0.3863, "mean_token_accuracy": 0.844539225101471, "step": 681 }, { "epoch": 2.1446540880503147, "grad_norm": 0.16487407684326172, "learning_rate": 3.688703806691545e-05, "loss": 0.3806, "mean_token_accuracy": 0.8434612154960632, "step": 682 }, { "epoch": 2.147798742138365, "grad_norm": 0.16573506593704224, "learning_rate": 3.6876723284938944e-05, "loss": 0.3753, "mean_token_accuracy": 0.8452996611595154, "step": 683 }, { "epoch": 2.150943396226415, "grad_norm": 0.16341538727283478, "learning_rate": 3.686639306541681e-05, "loss": 0.3854, "mean_token_accuracy": 0.8425849676132202, "step": 684 }, { "epoch": 2.1540880503144653, "grad_norm": 0.17833516001701355, "learning_rate": 3.685604741906871e-05, "loss": 0.3863, "mean_token_accuracy": 0.8424407243728638, "step": 685 }, { "epoch": 2.157232704402516, "grad_norm": 0.15904675424098969, "learning_rate": 3.684568635663029e-05, "loss": 0.3786, "mean_token_accuracy": 0.8449070453643799, "step": 686 }, { "epoch": 2.160377358490566, "grad_norm": 0.14655998349189758, "learning_rate": 3.683530988885321e-05, "loss": 0.3935, "mean_token_accuracy": 0.8412083387374878, "step": 687 }, { "epoch": 2.1635220125786163, "grad_norm": 0.16055339574813843, "learning_rate": 3.6824918026505094e-05, "loss": 0.3858, "mean_token_accuracy": 0.8450068831443787, "step": 688 }, { "epoch": 2.1666666666666665, "grad_norm": 0.1480380743741989, "learning_rate": 3.681451078036957e-05, "loss": 0.3874, "mean_token_accuracy": 0.8429787158966064, "step": 689 }, { "epoch": 2.169811320754717, "grad_norm": 0.1642938256263733, "learning_rate": 3.680408816124622e-05, "loss": 0.4035, "mean_token_accuracy": 0.8393082022666931, "step": 690 }, { "epoch": 2.1729559748427674, "grad_norm": 0.15204781293869019, "learning_rate": 3.6793650179950574e-05, "loss": 0.369, "mean_token_accuracy": 0.8482940793037415, "step": 691 }, { "epoch": 2.1761006289308176, "grad_norm": 0.18579435348510742, "learning_rate": 3.67831968473141e-05, "loss": 0.3856, "mean_token_accuracy": 0.8452662229537964, "step": 692 }, { "epoch": 2.1792452830188678, "grad_norm": 0.16767072677612305, "learning_rate": 3.6772728174184185e-05, "loss": 0.3887, "mean_token_accuracy": 0.8428248167037964, "step": 693 }, { "epoch": 2.1823899371069184, "grad_norm": 0.1605173796415329, "learning_rate": 3.676224417142417e-05, "loss": 0.3735, "mean_token_accuracy": 0.8427917957305908, "step": 694 }, { "epoch": 2.1855345911949686, "grad_norm": 0.15939317643642426, "learning_rate": 3.675174484991328e-05, "loss": 0.3782, "mean_token_accuracy": 0.8429858088493347, "step": 695 }, { "epoch": 2.188679245283019, "grad_norm": 0.17312584817409515, "learning_rate": 3.674123022054664e-05, "loss": 0.3846, "mean_token_accuracy": 0.843677818775177, "step": 696 }, { "epoch": 2.191823899371069, "grad_norm": 0.14854241907596588, "learning_rate": 3.6730700294235255e-05, "loss": 0.401, "mean_token_accuracy": 0.8407949209213257, "step": 697 }, { "epoch": 2.1949685534591197, "grad_norm": 0.15429075062274933, "learning_rate": 3.6720155081906004e-05, "loss": 0.383, "mean_token_accuracy": 0.8440051078796387, "step": 698 }, { "epoch": 2.19811320754717, "grad_norm": 0.19384504854679108, "learning_rate": 3.670959459450164e-05, "loss": 0.3876, "mean_token_accuracy": 0.8426343202590942, "step": 699 }, { "epoch": 2.20125786163522, "grad_norm": 0.17279024422168732, "learning_rate": 3.669901884298077e-05, "loss": 0.3933, "mean_token_accuracy": 0.8408414721488953, "step": 700 }, { "epoch": 2.2044025157232703, "grad_norm": 0.14874425530433655, "learning_rate": 3.6688427838317805e-05, "loss": 0.3851, "mean_token_accuracy": 0.843673586845398, "step": 701 }, { "epoch": 2.207547169811321, "grad_norm": 0.1604325771331787, "learning_rate": 3.667782159150302e-05, "loss": 0.3934, "mean_token_accuracy": 0.840538740158081, "step": 702 }, { "epoch": 2.210691823899371, "grad_norm": 0.1530817151069641, "learning_rate": 3.666720011354251e-05, "loss": 0.3947, "mean_token_accuracy": 0.8400828838348389, "step": 703 }, { "epoch": 2.2138364779874213, "grad_norm": 0.14883075654506683, "learning_rate": 3.6656563415458144e-05, "loss": 0.3965, "mean_token_accuracy": 0.8402729034423828, "step": 704 }, { "epoch": 2.2169811320754715, "grad_norm": 0.13267891108989716, "learning_rate": 3.66459115082876e-05, "loss": 0.3857, "mean_token_accuracy": 0.8434920310974121, "step": 705 }, { "epoch": 2.220125786163522, "grad_norm": 0.17050376534461975, "learning_rate": 3.663524440308436e-05, "loss": 0.3995, "mean_token_accuracy": 0.8391640186309814, "step": 706 }, { "epoch": 2.2232704402515724, "grad_norm": 0.16033872961997986, "learning_rate": 3.6624562110917634e-05, "loss": 0.3791, "mean_token_accuracy": 0.8431466817855835, "step": 707 }, { "epoch": 2.2264150943396226, "grad_norm": 0.16110140085220337, "learning_rate": 3.6613864642872433e-05, "loss": 0.3815, "mean_token_accuracy": 0.84393709897995, "step": 708 }, { "epoch": 2.229559748427673, "grad_norm": 0.1578216254711151, "learning_rate": 3.660315201004949e-05, "loss": 0.3901, "mean_token_accuracy": 0.8419655561447144, "step": 709 }, { "epoch": 2.2327044025157234, "grad_norm": 0.15616242587566376, "learning_rate": 3.659242422356528e-05, "loss": 0.3755, "mean_token_accuracy": 0.847722589969635, "step": 710 }, { "epoch": 2.2358490566037736, "grad_norm": 0.16292689740657806, "learning_rate": 3.658168129455201e-05, "loss": 0.3778, "mean_token_accuracy": 0.8432061076164246, "step": 711 }, { "epoch": 2.238993710691824, "grad_norm": 0.14753198623657227, "learning_rate": 3.657092323415759e-05, "loss": 0.3908, "mean_token_accuracy": 0.8414838314056396, "step": 712 }, { "epoch": 2.242138364779874, "grad_norm": 0.14525368809700012, "learning_rate": 3.656015005354565e-05, "loss": 0.3864, "mean_token_accuracy": 0.8425192832946777, "step": 713 }, { "epoch": 2.2452830188679247, "grad_norm": 0.1643352061510086, "learning_rate": 3.654936176389548e-05, "loss": 0.3937, "mean_token_accuracy": 0.8406062722206116, "step": 714 }, { "epoch": 2.248427672955975, "grad_norm": 0.1485619693994522, "learning_rate": 3.653855837640208e-05, "loss": 0.3949, "mean_token_accuracy": 0.8385857939720154, "step": 715 }, { "epoch": 2.251572327044025, "grad_norm": 0.17640046775341034, "learning_rate": 3.65277399022761e-05, "loss": 0.3931, "mean_token_accuracy": 0.84043288230896, "step": 716 }, { "epoch": 2.2547169811320753, "grad_norm": 0.13140366971492767, "learning_rate": 3.651690635274385e-05, "loss": 0.382, "mean_token_accuracy": 0.8446291089057922, "step": 717 }, { "epoch": 2.257861635220126, "grad_norm": 0.1530969738960266, "learning_rate": 3.650605773904728e-05, "loss": 0.373, "mean_token_accuracy": 0.8450080156326294, "step": 718 }, { "epoch": 2.261006289308176, "grad_norm": 0.13690687716007233, "learning_rate": 3.649519407244397e-05, "loss": 0.3972, "mean_token_accuracy": 0.8407341241836548, "step": 719 }, { "epoch": 2.2641509433962264, "grad_norm": 0.1574098914861679, "learning_rate": 3.648431536420713e-05, "loss": 0.3798, "mean_token_accuracy": 0.8445830345153809, "step": 720 }, { "epoch": 2.2672955974842766, "grad_norm": 0.1320675164461136, "learning_rate": 3.6473421625625575e-05, "loss": 0.3939, "mean_token_accuracy": 0.8414525389671326, "step": 721 }, { "epoch": 2.270440251572327, "grad_norm": 0.16731448471546173, "learning_rate": 3.646251286800371e-05, "loss": 0.3884, "mean_token_accuracy": 0.8414167165756226, "step": 722 }, { "epoch": 2.2735849056603774, "grad_norm": 0.15549425780773163, "learning_rate": 3.645158910266154e-05, "loss": 0.3905, "mean_token_accuracy": 0.8417775630950928, "step": 723 }, { "epoch": 2.2767295597484276, "grad_norm": 0.14840513467788696, "learning_rate": 3.6440650340934625e-05, "loss": 0.4033, "mean_token_accuracy": 0.8384340405464172, "step": 724 }, { "epoch": 2.279874213836478, "grad_norm": 0.1481187343597412, "learning_rate": 3.64296965941741e-05, "loss": 0.3943, "mean_token_accuracy": 0.8407248854637146, "step": 725 }, { "epoch": 2.2830188679245285, "grad_norm": 0.16942381858825684, "learning_rate": 3.641872787374664e-05, "loss": 0.3954, "mean_token_accuracy": 0.841485857963562, "step": 726 }, { "epoch": 2.2861635220125787, "grad_norm": 0.1288381665945053, "learning_rate": 3.640774419103448e-05, "loss": 0.3898, "mean_token_accuracy": 0.8435741662979126, "step": 727 }, { "epoch": 2.289308176100629, "grad_norm": 0.1845909059047699, "learning_rate": 3.6396745557435344e-05, "loss": 0.3959, "mean_token_accuracy": 0.839992105960846, "step": 728 }, { "epoch": 2.292452830188679, "grad_norm": 0.13671965897083282, "learning_rate": 3.6385731984362505e-05, "loss": 0.3952, "mean_token_accuracy": 0.8399907946586609, "step": 729 }, { "epoch": 2.2955974842767297, "grad_norm": 0.18472205102443695, "learning_rate": 3.637470348324473e-05, "loss": 0.3913, "mean_token_accuracy": 0.8435697555541992, "step": 730 }, { "epoch": 2.29874213836478, "grad_norm": 0.136347234249115, "learning_rate": 3.6363660065526255e-05, "loss": 0.3984, "mean_token_accuracy": 0.8397014737129211, "step": 731 }, { "epoch": 2.30188679245283, "grad_norm": 0.17823082208633423, "learning_rate": 3.635260174266682e-05, "loss": 0.3909, "mean_token_accuracy": 0.8430346846580505, "step": 732 }, { "epoch": 2.3050314465408803, "grad_norm": 0.143354594707489, "learning_rate": 3.634152852614163e-05, "loss": 0.3936, "mean_token_accuracy": 0.8386049270629883, "step": 733 }, { "epoch": 2.308176100628931, "grad_norm": 0.14964474737644196, "learning_rate": 3.633044042744134e-05, "loss": 0.3879, "mean_token_accuracy": 0.8436481952667236, "step": 734 }, { "epoch": 2.311320754716981, "grad_norm": 0.14824624359607697, "learning_rate": 3.631933745807204e-05, "loss": 0.4019, "mean_token_accuracy": 0.8401293754577637, "step": 735 }, { "epoch": 2.3144654088050314, "grad_norm": 0.14450529217720032, "learning_rate": 3.6308219629555264e-05, "loss": 0.4031, "mean_token_accuracy": 0.8403022289276123, "step": 736 }, { "epoch": 2.3176100628930816, "grad_norm": 0.16056504845619202, "learning_rate": 3.629708695342795e-05, "loss": 0.3936, "mean_token_accuracy": 0.8434531688690186, "step": 737 }, { "epoch": 2.3207547169811322, "grad_norm": 0.1542007476091385, "learning_rate": 3.628593944124247e-05, "loss": 0.3994, "mean_token_accuracy": 0.8412461280822754, "step": 738 }, { "epoch": 2.3238993710691824, "grad_norm": 0.15933917462825775, "learning_rate": 3.627477710456657e-05, "loss": 0.3968, "mean_token_accuracy": 0.8403854966163635, "step": 739 }, { "epoch": 2.3270440251572326, "grad_norm": 0.15049493312835693, "learning_rate": 3.626359995498337e-05, "loss": 0.3845, "mean_token_accuracy": 0.8435242176055908, "step": 740 }, { "epoch": 2.330188679245283, "grad_norm": 0.15058250725269318, "learning_rate": 3.625240800409139e-05, "loss": 0.3976, "mean_token_accuracy": 0.8395102024078369, "step": 741 }, { "epoch": 2.3333333333333335, "grad_norm": 0.14439113438129425, "learning_rate": 3.624120126350449e-05, "loss": 0.3873, "mean_token_accuracy": 0.8440976738929749, "step": 742 }, { "epoch": 2.3364779874213837, "grad_norm": 0.14690877497196198, "learning_rate": 3.6229979744851886e-05, "loss": 0.3769, "mean_token_accuracy": 0.8444263935089111, "step": 743 }, { "epoch": 2.339622641509434, "grad_norm": 0.17405840754508972, "learning_rate": 3.621874345977813e-05, "loss": 0.4031, "mean_token_accuracy": 0.8406171798706055, "step": 744 }, { "epoch": 2.342767295597484, "grad_norm": 0.15319286286830902, "learning_rate": 3.6207492419943065e-05, "loss": 0.3899, "mean_token_accuracy": 0.8424206972122192, "step": 745 }, { "epoch": 2.3459119496855347, "grad_norm": 0.16251526772975922, "learning_rate": 3.61962266370219e-05, "loss": 0.3898, "mean_token_accuracy": 0.8415105938911438, "step": 746 }, { "epoch": 2.349056603773585, "grad_norm": 0.1394706815481186, "learning_rate": 3.618494612270511e-05, "loss": 0.3919, "mean_token_accuracy": 0.8406856656074524, "step": 747 }, { "epoch": 2.352201257861635, "grad_norm": 0.15003950893878937, "learning_rate": 3.6173650888698456e-05, "loss": 0.3984, "mean_token_accuracy": 0.8404019474983215, "step": 748 }, { "epoch": 2.3553459119496853, "grad_norm": 0.1499018669128418, "learning_rate": 3.616234094672298e-05, "loss": 0.3947, "mean_token_accuracy": 0.8411343693733215, "step": 749 }, { "epoch": 2.358490566037736, "grad_norm": 0.15295803546905518, "learning_rate": 3.615101630851499e-05, "loss": 0.3919, "mean_token_accuracy": 0.8424092531204224, "step": 750 }, { "epoch": 2.361635220125786, "grad_norm": 0.15068519115447998, "learning_rate": 3.6139676985826035e-05, "loss": 0.3825, "mean_token_accuracy": 0.845098078250885, "step": 751 }, { "epoch": 2.3647798742138364, "grad_norm": 0.17812252044677734, "learning_rate": 3.6128322990422924e-05, "loss": 0.3781, "mean_token_accuracy": 0.8452845811843872, "step": 752 }, { "epoch": 2.3679245283018866, "grad_norm": 0.1486268937587738, "learning_rate": 3.6116954334087644e-05, "loss": 0.3904, "mean_token_accuracy": 0.8416242003440857, "step": 753 }, { "epoch": 2.3710691823899372, "grad_norm": 0.20920655131340027, "learning_rate": 3.6105571028617445e-05, "loss": 0.3882, "mean_token_accuracy": 0.8425738215446472, "step": 754 }, { "epoch": 2.3742138364779874, "grad_norm": 0.14427687227725983, "learning_rate": 3.609417308582477e-05, "loss": 0.3859, "mean_token_accuracy": 0.841761589050293, "step": 755 }, { "epoch": 2.3773584905660377, "grad_norm": 0.1912706047296524, "learning_rate": 3.608276051753722e-05, "loss": 0.3996, "mean_token_accuracy": 0.8410661816596985, "step": 756 }, { "epoch": 2.380503144654088, "grad_norm": 0.15891054272651672, "learning_rate": 3.607133333559761e-05, "loss": 0.3871, "mean_token_accuracy": 0.8439091444015503, "step": 757 }, { "epoch": 2.3836477987421385, "grad_norm": 0.18664152920246124, "learning_rate": 3.605989155186389e-05, "loss": 0.3946, "mean_token_accuracy": 0.8415076732635498, "step": 758 }, { "epoch": 2.3867924528301887, "grad_norm": 0.19155050814151764, "learning_rate": 3.6048435178209194e-05, "loss": 0.3951, "mean_token_accuracy": 0.8402332067489624, "step": 759 }, { "epoch": 2.389937106918239, "grad_norm": 0.14341680705547333, "learning_rate": 3.603696422652176e-05, "loss": 0.3858, "mean_token_accuracy": 0.8424145579338074, "step": 760 }, { "epoch": 2.3930817610062896, "grad_norm": 0.1860472857952118, "learning_rate": 3.602547870870498e-05, "loss": 0.3829, "mean_token_accuracy": 0.8446792960166931, "step": 761 }, { "epoch": 2.3962264150943398, "grad_norm": 0.16511857509613037, "learning_rate": 3.6013978636677354e-05, "loss": 0.3883, "mean_token_accuracy": 0.8434211611747742, "step": 762 }, { "epoch": 2.39937106918239, "grad_norm": 0.1752832978963852, "learning_rate": 3.600246402237248e-05, "loss": 0.3923, "mean_token_accuracy": 0.840692400932312, "step": 763 }, { "epoch": 2.40251572327044, "grad_norm": 0.1690099984407425, "learning_rate": 3.5990934877739045e-05, "loss": 0.3969, "mean_token_accuracy": 0.8391318917274475, "step": 764 }, { "epoch": 2.4056603773584904, "grad_norm": 0.13980698585510254, "learning_rate": 3.597939121474082e-05, "loss": 0.3978, "mean_token_accuracy": 0.8390938639640808, "step": 765 }, { "epoch": 2.408805031446541, "grad_norm": 0.1642495095729828, "learning_rate": 3.5967833045356664e-05, "loss": 0.3863, "mean_token_accuracy": 0.8447932600975037, "step": 766 }, { "epoch": 2.411949685534591, "grad_norm": 0.1437988579273224, "learning_rate": 3.595626038158043e-05, "loss": 0.4029, "mean_token_accuracy": 0.8407385945320129, "step": 767 }, { "epoch": 2.4150943396226414, "grad_norm": 0.1538083404302597, "learning_rate": 3.594467323542107e-05, "loss": 0.3918, "mean_token_accuracy": 0.8416243195533752, "step": 768 }, { "epoch": 2.418238993710692, "grad_norm": 0.16916736960411072, "learning_rate": 3.593307161890254e-05, "loss": 0.3855, "mean_token_accuracy": 0.8438270092010498, "step": 769 }, { "epoch": 2.4213836477987423, "grad_norm": 0.15521636605262756, "learning_rate": 3.592145554406381e-05, "loss": 0.4015, "mean_token_accuracy": 0.8377931118011475, "step": 770 }, { "epoch": 2.4245283018867925, "grad_norm": 0.15567384660243988, "learning_rate": 3.5909825022958857e-05, "loss": 0.3918, "mean_token_accuracy": 0.841164767742157, "step": 771 }, { "epoch": 2.4276729559748427, "grad_norm": 0.15030360221862793, "learning_rate": 3.5898180067656655e-05, "loss": 0.3968, "mean_token_accuracy": 0.8410426378250122, "step": 772 }, { "epoch": 2.430817610062893, "grad_norm": 0.14496302604675293, "learning_rate": 3.5886520690241136e-05, "loss": 0.3939, "mean_token_accuracy": 0.8415027260780334, "step": 773 }, { "epoch": 2.4339622641509435, "grad_norm": 0.16270099580287933, "learning_rate": 3.587484690281123e-05, "loss": 0.392, "mean_token_accuracy": 0.8409454822540283, "step": 774 }, { "epoch": 2.4371069182389937, "grad_norm": 0.1545172780752182, "learning_rate": 3.5863158717480794e-05, "loss": 0.4027, "mean_token_accuracy": 0.8391482830047607, "step": 775 }, { "epoch": 2.440251572327044, "grad_norm": 0.17777201533317566, "learning_rate": 3.585145614637864e-05, "loss": 0.3875, "mean_token_accuracy": 0.845243513584137, "step": 776 }, { "epoch": 2.4433962264150946, "grad_norm": 0.13722971081733704, "learning_rate": 3.583973920164849e-05, "loss": 0.3907, "mean_token_accuracy": 0.8422651290893555, "step": 777 }, { "epoch": 2.4465408805031448, "grad_norm": 0.16192865371704102, "learning_rate": 3.5828007895449e-05, "loss": 0.3844, "mean_token_accuracy": 0.8450747728347778, "step": 778 }, { "epoch": 2.449685534591195, "grad_norm": 0.15951241552829742, "learning_rate": 3.581626223995372e-05, "loss": 0.3847, "mean_token_accuracy": 0.8426473736763, "step": 779 }, { "epoch": 2.452830188679245, "grad_norm": 0.1600176841020584, "learning_rate": 3.580450224735111e-05, "loss": 0.3868, "mean_token_accuracy": 0.8432039618492126, "step": 780 }, { "epoch": 2.4559748427672954, "grad_norm": 0.1509629637002945, "learning_rate": 3.579272792984447e-05, "loss": 0.3837, "mean_token_accuracy": 0.843054473400116, "step": 781 }, { "epoch": 2.459119496855346, "grad_norm": 0.15704569220542908, "learning_rate": 3.5780939299651995e-05, "loss": 0.4006, "mean_token_accuracy": 0.8408331274986267, "step": 782 }, { "epoch": 2.4622641509433962, "grad_norm": 0.1845240294933319, "learning_rate": 3.576913636900672e-05, "loss": 0.4115, "mean_token_accuracy": 0.8372202515602112, "step": 783 }, { "epoch": 2.4654088050314464, "grad_norm": 0.13821645081043243, "learning_rate": 3.5757319150156515e-05, "loss": 0.3848, "mean_token_accuracy": 0.840967059135437, "step": 784 }, { "epoch": 2.468553459119497, "grad_norm": 0.18777751922607422, "learning_rate": 3.57454876553641e-05, "loss": 0.3991, "mean_token_accuracy": 0.8414704203605652, "step": 785 }, { "epoch": 2.4716981132075473, "grad_norm": 0.14253944158554077, "learning_rate": 3.573364189690699e-05, "loss": 0.401, "mean_token_accuracy": 0.8404497504234314, "step": 786 }, { "epoch": 2.4748427672955975, "grad_norm": 0.17456071078777313, "learning_rate": 3.572178188707749e-05, "loss": 0.3815, "mean_token_accuracy": 0.8429886102676392, "step": 787 }, { "epoch": 2.4779874213836477, "grad_norm": 0.14869949221611023, "learning_rate": 3.5709907638182725e-05, "loss": 0.4001, "mean_token_accuracy": 0.8371372818946838, "step": 788 }, { "epoch": 2.481132075471698, "grad_norm": 0.1542949378490448, "learning_rate": 3.569801916254457e-05, "loss": 0.3738, "mean_token_accuracy": 0.8442022800445557, "step": 789 }, { "epoch": 2.4842767295597485, "grad_norm": 0.1838357299566269, "learning_rate": 3.5686116472499665e-05, "loss": 0.3944, "mean_token_accuracy": 0.8391475677490234, "step": 790 }, { "epoch": 2.4874213836477987, "grad_norm": 0.1430555135011673, "learning_rate": 3.5674199580399425e-05, "loss": 0.3959, "mean_token_accuracy": 0.8409162163734436, "step": 791 }, { "epoch": 2.490566037735849, "grad_norm": 0.17979463934898376, "learning_rate": 3.566226849860997e-05, "loss": 0.3928, "mean_token_accuracy": 0.8403165936470032, "step": 792 }, { "epoch": 2.4937106918238996, "grad_norm": 0.137798473238945, "learning_rate": 3.5650323239512175e-05, "loss": 0.4061, "mean_token_accuracy": 0.8378630876541138, "step": 793 }, { "epoch": 2.49685534591195, "grad_norm": 0.1714894026517868, "learning_rate": 3.563836381550159e-05, "loss": 0.394, "mean_token_accuracy": 0.8412221074104309, "step": 794 }, { "epoch": 2.5, "grad_norm": 0.15192954242229462, "learning_rate": 3.5626390238988504e-05, "loss": 0.4004, "mean_token_accuracy": 0.8397027254104614, "step": 795 }, { "epoch": 2.50314465408805, "grad_norm": 0.16279102861881256, "learning_rate": 3.561440252239787e-05, "loss": 0.3799, "mean_token_accuracy": 0.8437523245811462, "step": 796 }, { "epoch": 2.5062893081761004, "grad_norm": 0.15632732212543488, "learning_rate": 3.56024006781693e-05, "loss": 0.3938, "mean_token_accuracy": 0.8418835997581482, "step": 797 }, { "epoch": 2.509433962264151, "grad_norm": 0.1601596623659134, "learning_rate": 3.559038471875711e-05, "loss": 0.3757, "mean_token_accuracy": 0.8448925614356995, "step": 798 }, { "epoch": 2.5125786163522013, "grad_norm": 0.15498454868793488, "learning_rate": 3.557835465663021e-05, "loss": 0.3966, "mean_token_accuracy": 0.83966064453125, "step": 799 }, { "epoch": 2.5157232704402515, "grad_norm": 0.15797755122184753, "learning_rate": 3.5566310504272194e-05, "loss": 0.3916, "mean_token_accuracy": 0.8428415656089783, "step": 800 }, { "epoch": 2.518867924528302, "grad_norm": 0.17878195643424988, "learning_rate": 3.5554252274181246e-05, "loss": 0.3922, "mean_token_accuracy": 0.8414608240127563, "step": 801 }, { "epoch": 2.5220125786163523, "grad_norm": 0.1354987770318985, "learning_rate": 3.554217997887016e-05, "loss": 0.3874, "mean_token_accuracy": 0.8432824611663818, "step": 802 }, { "epoch": 2.5251572327044025, "grad_norm": 0.17350763082504272, "learning_rate": 3.553009363086634e-05, "loss": 0.399, "mean_token_accuracy": 0.838498055934906, "step": 803 }, { "epoch": 2.5283018867924527, "grad_norm": 0.1341233104467392, "learning_rate": 3.551799324271176e-05, "loss": 0.4094, "mean_token_accuracy": 0.8359872698783875, "step": 804 }, { "epoch": 2.531446540880503, "grad_norm": 0.18266914784908295, "learning_rate": 3.550587882696297e-05, "loss": 0.3996, "mean_token_accuracy": 0.8383893966674805, "step": 805 }, { "epoch": 2.5345911949685536, "grad_norm": 0.14570754766464233, "learning_rate": 3.549375039619109e-05, "loss": 0.393, "mean_token_accuracy": 0.8403996229171753, "step": 806 }, { "epoch": 2.5377358490566038, "grad_norm": 0.1772826611995697, "learning_rate": 3.5481607962981744e-05, "loss": 0.4042, "mean_token_accuracy": 0.8374120593070984, "step": 807 }, { "epoch": 2.540880503144654, "grad_norm": 0.13402216136455536, "learning_rate": 3.546945153993512e-05, "loss": 0.3969, "mean_token_accuracy": 0.8391050100326538, "step": 808 }, { "epoch": 2.5440251572327046, "grad_norm": 0.15814465284347534, "learning_rate": 3.5457281139665906e-05, "loss": 0.3778, "mean_token_accuracy": 0.8457577228546143, "step": 809 }, { "epoch": 2.547169811320755, "grad_norm": 0.13784445822238922, "learning_rate": 3.544509677480332e-05, "loss": 0.4, "mean_token_accuracy": 0.8372910022735596, "step": 810 }, { "epoch": 2.550314465408805, "grad_norm": 0.15171018242835999, "learning_rate": 3.543289845799104e-05, "loss": 0.3939, "mean_token_accuracy": 0.8409681916236877, "step": 811 }, { "epoch": 2.5534591194968552, "grad_norm": 0.14729034900665283, "learning_rate": 3.542068620188723e-05, "loss": 0.3979, "mean_token_accuracy": 0.8405581116676331, "step": 812 }, { "epoch": 2.5566037735849054, "grad_norm": 0.16989730298519135, "learning_rate": 3.540846001916454e-05, "loss": 0.3912, "mean_token_accuracy": 0.8416076898574829, "step": 813 }, { "epoch": 2.559748427672956, "grad_norm": 0.16093933582305908, "learning_rate": 3.5396219922510044e-05, "loss": 0.4019, "mean_token_accuracy": 0.8399268984794617, "step": 814 }, { "epoch": 2.5628930817610063, "grad_norm": 0.1573500782251358, "learning_rate": 3.538396592462527e-05, "loss": 0.3914, "mean_token_accuracy": 0.8417745232582092, "step": 815 }, { "epoch": 2.5660377358490565, "grad_norm": 0.1340954303741455, "learning_rate": 3.5371698038226156e-05, "loss": 0.397, "mean_token_accuracy": 0.8433490991592407, "step": 816 }, { "epoch": 2.569182389937107, "grad_norm": 0.14694054424762726, "learning_rate": 3.535941627604307e-05, "loss": 0.3905, "mean_token_accuracy": 0.8411130309104919, "step": 817 }, { "epoch": 2.5723270440251573, "grad_norm": 0.1424230933189392, "learning_rate": 3.534712065082078e-05, "loss": 0.3997, "mean_token_accuracy": 0.8407908082008362, "step": 818 }, { "epoch": 2.5754716981132075, "grad_norm": 0.14012737572193146, "learning_rate": 3.533481117531842e-05, "loss": 0.3798, "mean_token_accuracy": 0.8445196151733398, "step": 819 }, { "epoch": 2.5786163522012577, "grad_norm": 0.14719845354557037, "learning_rate": 3.5322487862309516e-05, "loss": 0.4068, "mean_token_accuracy": 0.8381505608558655, "step": 820 }, { "epoch": 2.581761006289308, "grad_norm": 0.14293119311332703, "learning_rate": 3.531015072458193e-05, "loss": 0.387, "mean_token_accuracy": 0.8414595723152161, "step": 821 }, { "epoch": 2.5849056603773586, "grad_norm": 0.15876775979995728, "learning_rate": 3.5297799774937904e-05, "loss": 0.3968, "mean_token_accuracy": 0.8399572968482971, "step": 822 }, { "epoch": 2.588050314465409, "grad_norm": 0.1583719551563263, "learning_rate": 3.528543502619398e-05, "loss": 0.3837, "mean_token_accuracy": 0.8397992253303528, "step": 823 }, { "epoch": 2.591194968553459, "grad_norm": 0.16118523478507996, "learning_rate": 3.527305649118104e-05, "loss": 0.3875, "mean_token_accuracy": 0.8445963263511658, "step": 824 }, { "epoch": 2.5943396226415096, "grad_norm": 0.1711784452199936, "learning_rate": 3.526066418274427e-05, "loss": 0.4029, "mean_token_accuracy": 0.8400377631187439, "step": 825 }, { "epoch": 2.59748427672956, "grad_norm": 0.14684435725212097, "learning_rate": 3.5248258113743126e-05, "loss": 0.3746, "mean_token_accuracy": 0.8455667495727539, "step": 826 }, { "epoch": 2.60062893081761, "grad_norm": 0.16698506474494934, "learning_rate": 3.523583829705137e-05, "loss": 0.3929, "mean_token_accuracy": 0.8404534459114075, "step": 827 }, { "epoch": 2.6037735849056602, "grad_norm": 0.15083381533622742, "learning_rate": 3.5223404745557014e-05, "loss": 0.3964, "mean_token_accuracy": 0.8417296409606934, "step": 828 }, { "epoch": 2.6069182389937104, "grad_norm": 0.15367716550827026, "learning_rate": 3.521095747216235e-05, "loss": 0.3921, "mean_token_accuracy": 0.839659571647644, "step": 829 }, { "epoch": 2.610062893081761, "grad_norm": 0.16619056463241577, "learning_rate": 3.519849648978387e-05, "loss": 0.3885, "mean_token_accuracy": 0.840570330619812, "step": 830 }, { "epoch": 2.6132075471698113, "grad_norm": 0.13816650211811066, "learning_rate": 3.51860218113523e-05, "loss": 0.4104, "mean_token_accuracy": 0.8387232422828674, "step": 831 }, { "epoch": 2.6163522012578615, "grad_norm": 0.15900948643684387, "learning_rate": 3.5173533449812627e-05, "loss": 0.3917, "mean_token_accuracy": 0.8438796401023865, "step": 832 }, { "epoch": 2.619496855345912, "grad_norm": 0.14279142022132874, "learning_rate": 3.516103141812396e-05, "loss": 0.3886, "mean_token_accuracy": 0.8426015973091125, "step": 833 }, { "epoch": 2.6226415094339623, "grad_norm": 0.1456068903207779, "learning_rate": 3.5148515729259655e-05, "loss": 0.4118, "mean_token_accuracy": 0.8380022644996643, "step": 834 }, { "epoch": 2.6257861635220126, "grad_norm": 0.17821946740150452, "learning_rate": 3.51359863962072e-05, "loss": 0.3919, "mean_token_accuracy": 0.8446857929229736, "step": 835 }, { "epoch": 2.6289308176100628, "grad_norm": 0.13951314985752106, "learning_rate": 3.512344343196827e-05, "loss": 0.3948, "mean_token_accuracy": 0.841708779335022, "step": 836 }, { "epoch": 2.632075471698113, "grad_norm": 0.1878071278333664, "learning_rate": 3.511088684955868e-05, "loss": 0.3835, "mean_token_accuracy": 0.843018114566803, "step": 837 }, { "epoch": 2.6352201257861636, "grad_norm": 0.14683803915977478, "learning_rate": 3.509831666200835e-05, "loss": 0.3926, "mean_token_accuracy": 0.8428792357444763, "step": 838 }, { "epoch": 2.638364779874214, "grad_norm": 0.19433486461639404, "learning_rate": 3.508573288236135e-05, "loss": 0.3948, "mean_token_accuracy": 0.8401256799697876, "step": 839 }, { "epoch": 2.641509433962264, "grad_norm": 0.12940803170204163, "learning_rate": 3.5073135523675853e-05, "loss": 0.3959, "mean_token_accuracy": 0.8402566313743591, "step": 840 }, { "epoch": 2.6446540880503147, "grad_norm": 0.16293668746948242, "learning_rate": 3.506052459902409e-05, "loss": 0.3869, "mean_token_accuracy": 0.841245174407959, "step": 841 }, { "epoch": 2.647798742138365, "grad_norm": 0.153008833527565, "learning_rate": 3.504790012149241e-05, "loss": 0.3911, "mean_token_accuracy": 0.840588390827179, "step": 842 }, { "epoch": 2.650943396226415, "grad_norm": 0.14682938158512115, "learning_rate": 3.503526210418119e-05, "loss": 0.3967, "mean_token_accuracy": 0.8388072848320007, "step": 843 }, { "epoch": 2.6540880503144653, "grad_norm": 0.15707483887672424, "learning_rate": 3.50226105602049e-05, "loss": 0.3982, "mean_token_accuracy": 0.8398618102073669, "step": 844 }, { "epoch": 2.6572327044025155, "grad_norm": 0.15881039202213287, "learning_rate": 3.5009945502692e-05, "loss": 0.3933, "mean_token_accuracy": 0.841876745223999, "step": 845 }, { "epoch": 2.660377358490566, "grad_norm": 0.1584368199110031, "learning_rate": 3.4997266944785e-05, "loss": 0.4016, "mean_token_accuracy": 0.8401806950569153, "step": 846 }, { "epoch": 2.6635220125786163, "grad_norm": 0.16472209990024567, "learning_rate": 3.4984574899640415e-05, "loss": 0.4028, "mean_token_accuracy": 0.8396847248077393, "step": 847 }, { "epoch": 2.6666666666666665, "grad_norm": 0.15707650780677795, "learning_rate": 3.497186938042876e-05, "loss": 0.4055, "mean_token_accuracy": 0.8379293084144592, "step": 848 }, { "epoch": 2.669811320754717, "grad_norm": 0.1491553783416748, "learning_rate": 3.4959150400334516e-05, "loss": 0.3789, "mean_token_accuracy": 0.8430595397949219, "step": 849 }, { "epoch": 2.6729559748427674, "grad_norm": 0.1792389154434204, "learning_rate": 3.494641797255616e-05, "loss": 0.389, "mean_token_accuracy": 0.8386589884757996, "step": 850 }, { "epoch": 2.6761006289308176, "grad_norm": 0.15633267164230347, "learning_rate": 3.493367211030609e-05, "loss": 0.3971, "mean_token_accuracy": 0.8409385681152344, "step": 851 }, { "epoch": 2.6792452830188678, "grad_norm": 0.18488091230392456, "learning_rate": 3.492091282681066e-05, "loss": 0.3912, "mean_token_accuracy": 0.8436446785926819, "step": 852 }, { "epoch": 2.682389937106918, "grad_norm": 0.17898158729076385, "learning_rate": 3.490814013531017e-05, "loss": 0.3944, "mean_token_accuracy": 0.8414387702941895, "step": 853 }, { "epoch": 2.6855345911949686, "grad_norm": 0.1576376110315323, "learning_rate": 3.489535404905879e-05, "loss": 0.4077, "mean_token_accuracy": 0.8369157314300537, "step": 854 }, { "epoch": 2.688679245283019, "grad_norm": 0.19907242059707642, "learning_rate": 3.4882554581324646e-05, "loss": 0.3961, "mean_token_accuracy": 0.8421619534492493, "step": 855 }, { "epoch": 2.691823899371069, "grad_norm": 0.14955250918865204, "learning_rate": 3.48697417453897e-05, "loss": 0.4018, "mean_token_accuracy": 0.8398721814155579, "step": 856 }, { "epoch": 2.6949685534591197, "grad_norm": 0.1245059221982956, "learning_rate": 3.485691555454982e-05, "loss": 0.4034, "mean_token_accuracy": 0.8386282324790955, "step": 857 }, { "epoch": 2.69811320754717, "grad_norm": 0.16106462478637695, "learning_rate": 3.48440760221147e-05, "loss": 0.3885, "mean_token_accuracy": 0.8441129922866821, "step": 858 }, { "epoch": 2.70125786163522, "grad_norm": 0.1262756884098053, "learning_rate": 3.4831223161407904e-05, "loss": 0.4013, "mean_token_accuracy": 0.8384125828742981, "step": 859 }, { "epoch": 2.7044025157232703, "grad_norm": 0.16710178554058075, "learning_rate": 3.4818356985766825e-05, "loss": 0.3906, "mean_token_accuracy": 0.8418945670127869, "step": 860 }, { "epoch": 2.7075471698113205, "grad_norm": 0.14853724837303162, "learning_rate": 3.4805477508542665e-05, "loss": 0.3895, "mean_token_accuracy": 0.8417914509773254, "step": 861 }, { "epoch": 2.710691823899371, "grad_norm": 0.1702132523059845, "learning_rate": 3.4792584743100425e-05, "loss": 0.4086, "mean_token_accuracy": 0.8393948674201965, "step": 862 }, { "epoch": 2.7138364779874213, "grad_norm": 0.15771856904029846, "learning_rate": 3.4779678702818915e-05, "loss": 0.403, "mean_token_accuracy": 0.8413280844688416, "step": 863 }, { "epoch": 2.7169811320754715, "grad_norm": 0.1526855230331421, "learning_rate": 3.4766759401090695e-05, "loss": 0.395, "mean_token_accuracy": 0.8401949405670166, "step": 864 }, { "epoch": 2.720125786163522, "grad_norm": 0.15966515243053436, "learning_rate": 3.47538268513221e-05, "loss": 0.3947, "mean_token_accuracy": 0.8407111763954163, "step": 865 }, { "epoch": 2.7232704402515724, "grad_norm": 0.15073886513710022, "learning_rate": 3.474088106693323e-05, "loss": 0.3969, "mean_token_accuracy": 0.8384842276573181, "step": 866 }, { "epoch": 2.7264150943396226, "grad_norm": 0.15170443058013916, "learning_rate": 3.472792206135786e-05, "loss": 0.3938, "mean_token_accuracy": 0.8373732566833496, "step": 867 }, { "epoch": 2.729559748427673, "grad_norm": 0.1322529911994934, "learning_rate": 3.471494984804355e-05, "loss": 0.4004, "mean_token_accuracy": 0.8394856452941895, "step": 868 }, { "epoch": 2.732704402515723, "grad_norm": 0.14727897942066193, "learning_rate": 3.4701964440451545e-05, "loss": 0.3904, "mean_token_accuracy": 0.8422083258628845, "step": 869 }, { "epoch": 2.7358490566037736, "grad_norm": 0.14118586480617523, "learning_rate": 3.468896585205676e-05, "loss": 0.3974, "mean_token_accuracy": 0.8416357040405273, "step": 870 }, { "epoch": 2.738993710691824, "grad_norm": 0.1370769441127777, "learning_rate": 3.467595409634781e-05, "loss": 0.3972, "mean_token_accuracy": 0.8423879146575928, "step": 871 }, { "epoch": 2.742138364779874, "grad_norm": 0.12729352712631226, "learning_rate": 3.466292918682696e-05, "loss": 0.3891, "mean_token_accuracy": 0.8427572250366211, "step": 872 }, { "epoch": 2.7452830188679247, "grad_norm": 0.1347734034061432, "learning_rate": 3.464989113701016e-05, "loss": 0.3784, "mean_token_accuracy": 0.8443403244018555, "step": 873 }, { "epoch": 2.748427672955975, "grad_norm": 0.1276370882987976, "learning_rate": 3.4636839960426935e-05, "loss": 0.3969, "mean_token_accuracy": 0.8401352763175964, "step": 874 }, { "epoch": 2.751572327044025, "grad_norm": 0.12423428148031235, "learning_rate": 3.462377567062048e-05, "loss": 0.3887, "mean_token_accuracy": 0.8414220213890076, "step": 875 }, { "epoch": 2.7547169811320753, "grad_norm": 0.15068255364894867, "learning_rate": 3.4610698281147574e-05, "loss": 0.4004, "mean_token_accuracy": 0.8396066427230835, "step": 876 }, { "epoch": 2.757861635220126, "grad_norm": 0.13070818781852722, "learning_rate": 3.4597607805578614e-05, "loss": 0.3903, "mean_token_accuracy": 0.8402306437492371, "step": 877 }, { "epoch": 2.761006289308176, "grad_norm": 0.14346636831760406, "learning_rate": 3.458450425749754e-05, "loss": 0.3924, "mean_token_accuracy": 0.8397766351699829, "step": 878 }, { "epoch": 2.7641509433962264, "grad_norm": 0.14210395514965057, "learning_rate": 3.4571387650501886e-05, "loss": 0.3989, "mean_token_accuracy": 0.8378952145576477, "step": 879 }, { "epoch": 2.767295597484277, "grad_norm": 0.1414806991815567, "learning_rate": 3.455825799820274e-05, "loss": 0.3868, "mean_token_accuracy": 0.8398988246917725, "step": 880 }, { "epoch": 2.770440251572327, "grad_norm": 0.1717570275068283, "learning_rate": 3.45451153142247e-05, "loss": 0.4038, "mean_token_accuracy": 0.8383566737174988, "step": 881 }, { "epoch": 2.7735849056603774, "grad_norm": 0.13245093822479248, "learning_rate": 3.45319596122059e-05, "loss": 0.3847, "mean_token_accuracy": 0.842934787273407, "step": 882 }, { "epoch": 2.7767295597484276, "grad_norm": 0.1757514327764511, "learning_rate": 3.4518790905798004e-05, "loss": 0.3936, "mean_token_accuracy": 0.8390152454376221, "step": 883 }, { "epoch": 2.779874213836478, "grad_norm": 0.12597371637821198, "learning_rate": 3.450560920866613e-05, "loss": 0.4083, "mean_token_accuracy": 0.8398998379707336, "step": 884 }, { "epoch": 2.7830188679245285, "grad_norm": 0.18164481222629547, "learning_rate": 3.4492414534488917e-05, "loss": 0.3949, "mean_token_accuracy": 0.8426663875579834, "step": 885 }, { "epoch": 2.7861635220125787, "grad_norm": 0.12726567685604095, "learning_rate": 3.4479206896958434e-05, "loss": 0.4044, "mean_token_accuracy": 0.8387595415115356, "step": 886 }, { "epoch": 2.789308176100629, "grad_norm": 0.1545555740594864, "learning_rate": 3.446598630978024e-05, "loss": 0.3938, "mean_token_accuracy": 0.8420332670211792, "step": 887 }, { "epoch": 2.7924528301886795, "grad_norm": 0.12999430298805237, "learning_rate": 3.445275278667329e-05, "loss": 0.3945, "mean_token_accuracy": 0.8392206430435181, "step": 888 }, { "epoch": 2.7955974842767297, "grad_norm": 0.1740381270647049, "learning_rate": 3.443950634137e-05, "loss": 0.3868, "mean_token_accuracy": 0.841921865940094, "step": 889 }, { "epoch": 2.79874213836478, "grad_norm": 0.1445263922214508, "learning_rate": 3.4426246987616176e-05, "loss": 0.3815, "mean_token_accuracy": 0.844272792339325, "step": 890 }, { "epoch": 2.80188679245283, "grad_norm": 0.16784286499023438, "learning_rate": 3.4412974739171026e-05, "loss": 0.393, "mean_token_accuracy": 0.841655969619751, "step": 891 }, { "epoch": 2.8050314465408803, "grad_norm": 0.16633418202400208, "learning_rate": 3.439968960980712e-05, "loss": 0.3883, "mean_token_accuracy": 0.8415431380271912, "step": 892 }, { "epoch": 2.808176100628931, "grad_norm": 0.14095844328403473, "learning_rate": 3.438639161331042e-05, "loss": 0.3899, "mean_token_accuracy": 0.8424274325370789, "step": 893 }, { "epoch": 2.811320754716981, "grad_norm": 0.1560608446598053, "learning_rate": 3.437308076348024e-05, "loss": 0.4066, "mean_token_accuracy": 0.839648962020874, "step": 894 }, { "epoch": 2.8144654088050314, "grad_norm": 0.13541994988918304, "learning_rate": 3.4359757074129205e-05, "loss": 0.3911, "mean_token_accuracy": 0.842206597328186, "step": 895 }, { "epoch": 2.817610062893082, "grad_norm": 0.1437758207321167, "learning_rate": 3.4346420559083285e-05, "loss": 0.4054, "mean_token_accuracy": 0.8397225141525269, "step": 896 }, { "epoch": 2.8207547169811322, "grad_norm": 0.11642755568027496, "learning_rate": 3.433307123218176e-05, "loss": 0.3999, "mean_token_accuracy": 0.8443455696105957, "step": 897 }, { "epoch": 2.8238993710691824, "grad_norm": 0.13762716948986053, "learning_rate": 3.43197091072772e-05, "loss": 0.3982, "mean_token_accuracy": 0.8394805192947388, "step": 898 }, { "epoch": 2.8270440251572326, "grad_norm": 0.1345590353012085, "learning_rate": 3.430633419823545e-05, "loss": 0.4009, "mean_token_accuracy": 0.8399420976638794, "step": 899 }, { "epoch": 2.830188679245283, "grad_norm": 0.13669990003108978, "learning_rate": 3.429294651893563e-05, "loss": 0.3908, "mean_token_accuracy": 0.8423181176185608, "step": 900 }, { "epoch": 2.8333333333333335, "grad_norm": 0.14826296269893646, "learning_rate": 3.4279546083270097e-05, "loss": 0.3985, "mean_token_accuracy": 0.8397724628448486, "step": 901 }, { "epoch": 2.8364779874213837, "grad_norm": 0.14717832207679749, "learning_rate": 3.426613290514447e-05, "loss": 0.3775, "mean_token_accuracy": 0.8437859416007996, "step": 902 }, { "epoch": 2.839622641509434, "grad_norm": 0.1572881042957306, "learning_rate": 3.4252706998477574e-05, "loss": 0.3908, "mean_token_accuracy": 0.8429072499275208, "step": 903 }, { "epoch": 2.8427672955974845, "grad_norm": 0.1492244154214859, "learning_rate": 3.423926837720144e-05, "loss": 0.4069, "mean_token_accuracy": 0.8388838768005371, "step": 904 }, { "epoch": 2.8459119496855347, "grad_norm": 0.15537552535533905, "learning_rate": 3.42258170552613e-05, "loss": 0.3888, "mean_token_accuracy": 0.8405810594558716, "step": 905 }, { "epoch": 2.849056603773585, "grad_norm": 0.1573428362607956, "learning_rate": 3.421235304661555e-05, "loss": 0.3889, "mean_token_accuracy": 0.843213677406311, "step": 906 }, { "epoch": 2.852201257861635, "grad_norm": 0.15431787073612213, "learning_rate": 3.41988763652358e-05, "loss": 0.3969, "mean_token_accuracy": 0.8401937484741211, "step": 907 }, { "epoch": 2.8553459119496853, "grad_norm": 0.16958792507648468, "learning_rate": 3.4185387025106745e-05, "loss": 0.3966, "mean_token_accuracy": 0.8413075804710388, "step": 908 }, { "epoch": 2.858490566037736, "grad_norm": 0.15407972037792206, "learning_rate": 3.417188504022625e-05, "loss": 0.3944, "mean_token_accuracy": 0.8398054242134094, "step": 909 }, { "epoch": 2.861635220125786, "grad_norm": 0.16856738924980164, "learning_rate": 3.415837042460531e-05, "loss": 0.3862, "mean_token_accuracy": 0.8428761959075928, "step": 910 }, { "epoch": 2.8647798742138364, "grad_norm": 0.16327671706676483, "learning_rate": 3.414484319226799e-05, "loss": 0.4078, "mean_token_accuracy": 0.8394198417663574, "step": 911 }, { "epoch": 2.867924528301887, "grad_norm": 0.13859853148460388, "learning_rate": 3.4131303357251505e-05, "loss": 0.3979, "mean_token_accuracy": 0.839618980884552, "step": 912 }, { "epoch": 2.8710691823899372, "grad_norm": 0.16593888401985168, "learning_rate": 3.411775093360609e-05, "loss": 0.3959, "mean_token_accuracy": 0.8423987030982971, "step": 913 }, { "epoch": 2.8742138364779874, "grad_norm": 0.12062633782625198, "learning_rate": 3.4104185935395075e-05, "loss": 0.391, "mean_token_accuracy": 0.8417162895202637, "step": 914 }, { "epoch": 2.8773584905660377, "grad_norm": 0.14128541946411133, "learning_rate": 3.409060837669483e-05, "loss": 0.4021, "mean_token_accuracy": 0.840218186378479, "step": 915 }, { "epoch": 2.880503144654088, "grad_norm": 0.12717147171497345, "learning_rate": 3.407701827159476e-05, "loss": 0.4044, "mean_token_accuracy": 0.8357147574424744, "step": 916 }, { "epoch": 2.8836477987421385, "grad_norm": 0.14301805198192596, "learning_rate": 3.406341563419729e-05, "loss": 0.3944, "mean_token_accuracy": 0.8393138647079468, "step": 917 }, { "epoch": 2.8867924528301887, "grad_norm": 0.12959828972816467, "learning_rate": 3.4049800478617844e-05, "loss": 0.3893, "mean_token_accuracy": 0.8411592841148376, "step": 918 }, { "epoch": 2.889937106918239, "grad_norm": 0.1419060230255127, "learning_rate": 3.4036172818984855e-05, "loss": 0.3923, "mean_token_accuracy": 0.8420356512069702, "step": 919 }, { "epoch": 2.8930817610062896, "grad_norm": 0.14367401599884033, "learning_rate": 3.4022532669439706e-05, "loss": 0.406, "mean_token_accuracy": 0.8380276560783386, "step": 920 }, { "epoch": 2.8962264150943398, "grad_norm": 0.13751500844955444, "learning_rate": 3.4008880044136763e-05, "loss": 0.3834, "mean_token_accuracy": 0.8410583734512329, "step": 921 }, { "epoch": 2.89937106918239, "grad_norm": 0.13970538973808289, "learning_rate": 3.399521495724332e-05, "loss": 0.3923, "mean_token_accuracy": 0.842393696308136, "step": 922 }, { "epoch": 2.90251572327044, "grad_norm": 0.1298050582408905, "learning_rate": 3.39815374229396e-05, "loss": 0.3951, "mean_token_accuracy": 0.8414964079856873, "step": 923 }, { "epoch": 2.9056603773584904, "grad_norm": 0.12179820239543915, "learning_rate": 3.396784745541877e-05, "loss": 0.3891, "mean_token_accuracy": 0.8448134064674377, "step": 924 }, { "epoch": 2.908805031446541, "grad_norm": 0.12799564003944397, "learning_rate": 3.3954145068886876e-05, "loss": 0.4021, "mean_token_accuracy": 0.8391764163970947, "step": 925 }, { "epoch": 2.911949685534591, "grad_norm": 0.1219654306769371, "learning_rate": 3.394043027756286e-05, "loss": 0.4057, "mean_token_accuracy": 0.8397082686424255, "step": 926 }, { "epoch": 2.9150943396226414, "grad_norm": 0.1398463398218155, "learning_rate": 3.392670309567852e-05, "loss": 0.3973, "mean_token_accuracy": 0.8409194350242615, "step": 927 }, { "epoch": 2.918238993710692, "grad_norm": 0.13440991938114166, "learning_rate": 3.391296353747854e-05, "loss": 0.3866, "mean_token_accuracy": 0.8404433131217957, "step": 928 }, { "epoch": 2.9213836477987423, "grad_norm": 0.13937310874462128, "learning_rate": 3.389921161722042e-05, "loss": 0.3968, "mean_token_accuracy": 0.8404849171638489, "step": 929 }, { "epoch": 2.9245283018867925, "grad_norm": 0.15991294384002686, "learning_rate": 3.3885447349174514e-05, "loss": 0.3983, "mean_token_accuracy": 0.83951735496521, "step": 930 }, { "epoch": 2.9276729559748427, "grad_norm": 0.14287906885147095, "learning_rate": 3.387167074762396e-05, "loss": 0.4209, "mean_token_accuracy": 0.8352724313735962, "step": 931 }, { "epoch": 2.930817610062893, "grad_norm": 0.1415240466594696, "learning_rate": 3.3857881826864715e-05, "loss": 0.3867, "mean_token_accuracy": 0.8398596048355103, "step": 932 }, { "epoch": 2.9339622641509435, "grad_norm": 0.14125128090381622, "learning_rate": 3.3844080601205516e-05, "loss": 0.4074, "mean_token_accuracy": 0.8381497263908386, "step": 933 }, { "epoch": 2.9371069182389937, "grad_norm": 0.13332779705524445, "learning_rate": 3.383026708496787e-05, "loss": 0.4023, "mean_token_accuracy": 0.8399627804756165, "step": 934 }, { "epoch": 2.940251572327044, "grad_norm": 0.13416799902915955, "learning_rate": 3.381644129248602e-05, "loss": 0.3902, "mean_token_accuracy": 0.8443140983581543, "step": 935 }, { "epoch": 2.9433962264150946, "grad_norm": 0.14479246735572815, "learning_rate": 3.380260323810698e-05, "loss": 0.3884, "mean_token_accuracy": 0.8421072959899902, "step": 936 }, { "epoch": 2.9465408805031448, "grad_norm": 0.1284419447183609, "learning_rate": 3.378875293619047e-05, "loss": 0.392, "mean_token_accuracy": 0.8417386412620544, "step": 937 }, { "epoch": 2.949685534591195, "grad_norm": 0.13157133758068085, "learning_rate": 3.3774890401108915e-05, "loss": 0.4076, "mean_token_accuracy": 0.8378016352653503, "step": 938 }, { "epoch": 2.952830188679245, "grad_norm": 0.13864070177078247, "learning_rate": 3.376101564724743e-05, "loss": 0.3901, "mean_token_accuracy": 0.8419355750083923, "step": 939 }, { "epoch": 2.9559748427672954, "grad_norm": 0.14290863275527954, "learning_rate": 3.374712868900384e-05, "loss": 0.3963, "mean_token_accuracy": 0.8409423828125, "step": 940 }, { "epoch": 2.959119496855346, "grad_norm": 0.12841610610485077, "learning_rate": 3.373322954078859e-05, "loss": 0.3969, "mean_token_accuracy": 0.8391101956367493, "step": 941 }, { "epoch": 2.9622641509433962, "grad_norm": 0.14718987047672272, "learning_rate": 3.371931821702481e-05, "loss": 0.4037, "mean_token_accuracy": 0.8362467288970947, "step": 942 }, { "epoch": 2.9654088050314464, "grad_norm": 0.1477939933538437, "learning_rate": 3.3705394732148264e-05, "loss": 0.4049, "mean_token_accuracy": 0.8374539613723755, "step": 943 }, { "epoch": 2.968553459119497, "grad_norm": 0.13969185948371887, "learning_rate": 3.369145910060731e-05, "loss": 0.3864, "mean_token_accuracy": 0.8428540825843811, "step": 944 }, { "epoch": 2.9716981132075473, "grad_norm": 0.15239839255809784, "learning_rate": 3.3677511336862924e-05, "loss": 0.4012, "mean_token_accuracy": 0.8415625095367432, "step": 945 }, { "epoch": 2.9748427672955975, "grad_norm": 0.1359701305627823, "learning_rate": 3.366355145538867e-05, "loss": 0.4019, "mean_token_accuracy": 0.8378898501396179, "step": 946 }, { "epoch": 2.9779874213836477, "grad_norm": 0.17559880018234253, "learning_rate": 3.3649579470670696e-05, "loss": 0.3981, "mean_token_accuracy": 0.8389723300933838, "step": 947 }, { "epoch": 2.981132075471698, "grad_norm": 0.12837964296340942, "learning_rate": 3.3635595397207705e-05, "loss": 0.4067, "mean_token_accuracy": 0.8383262157440186, "step": 948 }, { "epoch": 2.9842767295597485, "grad_norm": 0.16035127639770508, "learning_rate": 3.3621599249510936e-05, "loss": 0.4018, "mean_token_accuracy": 0.8396176695823669, "step": 949 }, { "epoch": 2.9874213836477987, "grad_norm": 0.13061131536960602, "learning_rate": 3.360759104210417e-05, "loss": 0.4034, "mean_token_accuracy": 0.839031994342804, "step": 950 }, { "epoch": 2.990566037735849, "grad_norm": 0.15194053947925568, "learning_rate": 3.359357078952368e-05, "loss": 0.385, "mean_token_accuracy": 0.842530369758606, "step": 951 }, { "epoch": 2.9937106918238996, "grad_norm": 0.1333847939968109, "learning_rate": 3.3579538506318264e-05, "loss": 0.3998, "mean_token_accuracy": 0.8397628664970398, "step": 952 }, { "epoch": 2.99685534591195, "grad_norm": 0.1507115513086319, "learning_rate": 3.3565494207049194e-05, "loss": 0.4102, "mean_token_accuracy": 0.8390610218048096, "step": 953 }, { "epoch": 3.0, "grad_norm": 0.12280502915382385, "learning_rate": 3.3551437906290207e-05, "loss": 0.3837, "mean_token_accuracy": 0.8393963575363159, "step": 954 }, { "epoch": 3.00314465408805, "grad_norm": 0.2119593769311905, "learning_rate": 3.353736961862751e-05, "loss": 0.3276, "mean_token_accuracy": 0.8589532375335693, "step": 955 }, { "epoch": 3.006289308176101, "grad_norm": 0.5749790668487549, "learning_rate": 3.352328935865972e-05, "loss": 0.3309, "mean_token_accuracy": 0.8566039800643921, "step": 956 }, { "epoch": 3.009433962264151, "grad_norm": 0.30272579193115234, "learning_rate": 3.350919714099791e-05, "loss": 0.3372, "mean_token_accuracy": 0.8549745082855225, "step": 957 }, { "epoch": 3.0125786163522013, "grad_norm": 0.46458861231803894, "learning_rate": 3.3495092980265526e-05, "loss": 0.323, "mean_token_accuracy": 0.8595535755157471, "step": 958 }, { "epoch": 3.0157232704402515, "grad_norm": 0.29047465324401855, "learning_rate": 3.348097689109844e-05, "loss": 0.329, "mean_token_accuracy": 0.859404444694519, "step": 959 }, { "epoch": 3.018867924528302, "grad_norm": 0.1861647367477417, "learning_rate": 3.346684888814489e-05, "loss": 0.3285, "mean_token_accuracy": 0.8579146862030029, "step": 960 }, { "epoch": 3.0220125786163523, "grad_norm": 0.23538491129875183, "learning_rate": 3.345270898606548e-05, "loss": 0.3214, "mean_token_accuracy": 0.8577497005462646, "step": 961 }, { "epoch": 3.0251572327044025, "grad_norm": 0.24326147139072418, "learning_rate": 3.3438557199533143e-05, "loss": 0.3172, "mean_token_accuracy": 0.8618540167808533, "step": 962 }, { "epoch": 3.0283018867924527, "grad_norm": 0.19439604878425598, "learning_rate": 3.342439354323317e-05, "loss": 0.3343, "mean_token_accuracy": 0.8569464087486267, "step": 963 }, { "epoch": 3.0314465408805034, "grad_norm": 0.2556777596473694, "learning_rate": 3.341021803186316e-05, "loss": 0.3229, "mean_token_accuracy": 0.859006404876709, "step": 964 }, { "epoch": 3.0345911949685536, "grad_norm": 0.20313474535942078, "learning_rate": 3.3396030680133e-05, "loss": 0.319, "mean_token_accuracy": 0.8617485165596008, "step": 965 }, { "epoch": 3.0377358490566038, "grad_norm": 0.251817524433136, "learning_rate": 3.3381831502764894e-05, "loss": 0.3193, "mean_token_accuracy": 0.863706111907959, "step": 966 }, { "epoch": 3.040880503144654, "grad_norm": 0.20017312467098236, "learning_rate": 3.3367620514493284e-05, "loss": 0.3238, "mean_token_accuracy": 0.8608281016349792, "step": 967 }, { "epoch": 3.0440251572327046, "grad_norm": 0.27038446068763733, "learning_rate": 3.335339773006489e-05, "loss": 0.3151, "mean_token_accuracy": 0.8628440499305725, "step": 968 }, { "epoch": 3.047169811320755, "grad_norm": 0.2303992211818695, "learning_rate": 3.3339163164238665e-05, "loss": 0.3103, "mean_token_accuracy": 0.8667078018188477, "step": 969 }, { "epoch": 3.050314465408805, "grad_norm": 0.23521584272384644, "learning_rate": 3.33249168317858e-05, "loss": 0.3129, "mean_token_accuracy": 0.8642385005950928, "step": 970 }, { "epoch": 3.0534591194968552, "grad_norm": 0.20210982859134674, "learning_rate": 3.331065874748967e-05, "loss": 0.3125, "mean_token_accuracy": 0.8629567623138428, "step": 971 }, { "epoch": 3.056603773584906, "grad_norm": 0.22222645580768585, "learning_rate": 3.329638892614587e-05, "loss": 0.3041, "mean_token_accuracy": 0.8669219017028809, "step": 972 }, { "epoch": 3.059748427672956, "grad_norm": 0.1965298056602478, "learning_rate": 3.328210738256215e-05, "loss": 0.3232, "mean_token_accuracy": 0.8628347516059875, "step": 973 }, { "epoch": 3.0628930817610063, "grad_norm": 0.19745439291000366, "learning_rate": 3.326781413155846e-05, "loss": 0.3271, "mean_token_accuracy": 0.8626418113708496, "step": 974 }, { "epoch": 3.0660377358490565, "grad_norm": 0.16124515235424042, "learning_rate": 3.325350918796686e-05, "loss": 0.3434, "mean_token_accuracy": 0.8579691648483276, "step": 975 }, { "epoch": 3.069182389937107, "grad_norm": 0.1931421458721161, "learning_rate": 3.323919256663157e-05, "loss": 0.3245, "mean_token_accuracy": 0.8611186146736145, "step": 976 }, { "epoch": 3.0723270440251573, "grad_norm": 0.18141953647136688, "learning_rate": 3.3224864282408925e-05, "loss": 0.3094, "mean_token_accuracy": 0.8668796420097351, "step": 977 }, { "epoch": 3.0754716981132075, "grad_norm": 0.17107489705085754, "learning_rate": 3.3210524350167346e-05, "loss": 0.3047, "mean_token_accuracy": 0.8638178110122681, "step": 978 }, { "epoch": 3.0786163522012577, "grad_norm": 0.18757247924804688, "learning_rate": 3.319617278478735e-05, "loss": 0.3279, "mean_token_accuracy": 0.8593670725822449, "step": 979 }, { "epoch": 3.0817610062893084, "grad_norm": 0.15939109027385712, "learning_rate": 3.3181809601161535e-05, "loss": 0.3341, "mean_token_accuracy": 0.8586524128913879, "step": 980 }, { "epoch": 3.0849056603773586, "grad_norm": 0.17841064929962158, "learning_rate": 3.316743481419454e-05, "loss": 0.3279, "mean_token_accuracy": 0.8618712425231934, "step": 981 }, { "epoch": 3.088050314465409, "grad_norm": 0.15791453421115875, "learning_rate": 3.3153048438803064e-05, "loss": 0.3183, "mean_token_accuracy": 0.8625402450561523, "step": 982 }, { "epoch": 3.091194968553459, "grad_norm": 0.16996054351329803, "learning_rate": 3.3138650489915805e-05, "loss": 0.3459, "mean_token_accuracy": 0.8569095134735107, "step": 983 }, { "epoch": 3.0943396226415096, "grad_norm": 0.14582346379756927, "learning_rate": 3.3124240982473495e-05, "loss": 0.3199, "mean_token_accuracy": 0.8625748157501221, "step": 984 }, { "epoch": 3.09748427672956, "grad_norm": 0.16500362753868103, "learning_rate": 3.3109819931428855e-05, "loss": 0.3309, "mean_token_accuracy": 0.8585406541824341, "step": 985 }, { "epoch": 3.10062893081761, "grad_norm": 0.1490785777568817, "learning_rate": 3.309538735174657e-05, "loss": 0.3192, "mean_token_accuracy": 0.8605947494506836, "step": 986 }, { "epoch": 3.1037735849056602, "grad_norm": 0.1524612009525299, "learning_rate": 3.30809432584033e-05, "loss": 0.3223, "mean_token_accuracy": 0.8592230677604675, "step": 987 }, { "epoch": 3.106918238993711, "grad_norm": 0.1406700164079666, "learning_rate": 3.306648766638765e-05, "loss": 0.3349, "mean_token_accuracy": 0.8577442169189453, "step": 988 }, { "epoch": 3.110062893081761, "grad_norm": 0.15463581681251526, "learning_rate": 3.3052020590700174e-05, "loss": 0.3305, "mean_token_accuracy": 0.8602052927017212, "step": 989 }, { "epoch": 3.1132075471698113, "grad_norm": 0.1442749798297882, "learning_rate": 3.30375420463533e-05, "loss": 0.3205, "mean_token_accuracy": 0.8626676201820374, "step": 990 }, { "epoch": 3.1163522012578615, "grad_norm": 0.14940737187862396, "learning_rate": 3.302305204837141e-05, "loss": 0.3072, "mean_token_accuracy": 0.8642657995223999, "step": 991 }, { "epoch": 3.119496855345912, "grad_norm": 0.1608681082725525, "learning_rate": 3.300855061179074e-05, "loss": 0.3054, "mean_token_accuracy": 0.8632607460021973, "step": 992 }, { "epoch": 3.1226415094339623, "grad_norm": 0.1494629681110382, "learning_rate": 3.2994037751659386e-05, "loss": 0.3224, "mean_token_accuracy": 0.8599094152450562, "step": 993 }, { "epoch": 3.1257861635220126, "grad_norm": 0.1440831571817398, "learning_rate": 3.297951348303733e-05, "loss": 0.3369, "mean_token_accuracy": 0.8589987754821777, "step": 994 }, { "epoch": 3.1289308176100628, "grad_norm": 0.16431039571762085, "learning_rate": 3.296497782099637e-05, "loss": 0.3264, "mean_token_accuracy": 0.8605884313583374, "step": 995 }, { "epoch": 3.1320754716981134, "grad_norm": 0.14332625269889832, "learning_rate": 3.2950430780620134e-05, "loss": 0.3078, "mean_token_accuracy": 0.8648683428764343, "step": 996 }, { "epoch": 3.1352201257861636, "grad_norm": 0.1832369714975357, "learning_rate": 3.293587237700405e-05, "loss": 0.3178, "mean_token_accuracy": 0.8615559339523315, "step": 997 }, { "epoch": 3.138364779874214, "grad_norm": 0.14069117605686188, "learning_rate": 3.292130262525536e-05, "loss": 0.3123, "mean_token_accuracy": 0.8634822368621826, "step": 998 }, { "epoch": 3.141509433962264, "grad_norm": 0.16523166000843048, "learning_rate": 3.290672154049306e-05, "loss": 0.3357, "mean_token_accuracy": 0.8579123616218567, "step": 999 }, { "epoch": 3.1446540880503147, "grad_norm": 0.14075875282287598, "learning_rate": 3.2892129137847886e-05, "loss": 0.3249, "mean_token_accuracy": 0.8633124828338623, "step": 1000 }, { "epoch": 3.147798742138365, "grad_norm": 0.14860154688358307, "learning_rate": 3.287752543246238e-05, "loss": 0.3179, "mean_token_accuracy": 0.8615362048149109, "step": 1001 }, { "epoch": 3.150943396226415, "grad_norm": 0.15627436339855194, "learning_rate": 3.2862910439490764e-05, "loss": 0.3236, "mean_token_accuracy": 0.8606275320053101, "step": 1002 }, { "epoch": 3.1540880503144653, "grad_norm": 0.13712136447429657, "learning_rate": 3.284828417409899e-05, "loss": 0.3133, "mean_token_accuracy": 0.8660383820533752, "step": 1003 }, { "epoch": 3.157232704402516, "grad_norm": 0.16311588883399963, "learning_rate": 3.283364665146469e-05, "loss": 0.3095, "mean_token_accuracy": 0.8647195100784302, "step": 1004 }, { "epoch": 3.160377358490566, "grad_norm": 0.1587604582309723, "learning_rate": 3.2818997886777214e-05, "loss": 0.3333, "mean_token_accuracy": 0.8577799201011658, "step": 1005 }, { "epoch": 3.1635220125786163, "grad_norm": 0.14171631634235382, "learning_rate": 3.280433789523753e-05, "loss": 0.3386, "mean_token_accuracy": 0.8587863445281982, "step": 1006 }, { "epoch": 3.1666666666666665, "grad_norm": 0.14268644154071808, "learning_rate": 3.2789666692058304e-05, "loss": 0.3312, "mean_token_accuracy": 0.8618650436401367, "step": 1007 }, { "epoch": 3.169811320754717, "grad_norm": 0.1492924988269806, "learning_rate": 3.277498429246381e-05, "loss": 0.3347, "mean_token_accuracy": 0.8577045798301697, "step": 1008 }, { "epoch": 3.1729559748427674, "grad_norm": 0.13884948194026947, "learning_rate": 3.276029071168993e-05, "loss": 0.3257, "mean_token_accuracy": 0.8589639067649841, "step": 1009 }, { "epoch": 3.1761006289308176, "grad_norm": 0.17117413878440857, "learning_rate": 3.2745585964984175e-05, "loss": 0.3281, "mean_token_accuracy": 0.8598911166191101, "step": 1010 }, { "epoch": 3.1792452830188678, "grad_norm": 0.15213529765605927, "learning_rate": 3.273087006760563e-05, "loss": 0.3272, "mean_token_accuracy": 0.8612444996833801, "step": 1011 }, { "epoch": 3.1823899371069184, "grad_norm": 0.13560143113136292, "learning_rate": 3.271614303482494e-05, "loss": 0.3371, "mean_token_accuracy": 0.8586222529411316, "step": 1012 }, { "epoch": 3.1855345911949686, "grad_norm": 0.15628263354301453, "learning_rate": 3.270140488192434e-05, "loss": 0.3292, "mean_token_accuracy": 0.8595730662345886, "step": 1013 }, { "epoch": 3.188679245283019, "grad_norm": 0.14781375229358673, "learning_rate": 3.268665562419756e-05, "loss": 0.3286, "mean_token_accuracy": 0.8589670658111572, "step": 1014 }, { "epoch": 3.191823899371069, "grad_norm": 0.15790531039237976, "learning_rate": 3.267189527694989e-05, "loss": 0.3274, "mean_token_accuracy": 0.8619564175605774, "step": 1015 }, { "epoch": 3.1949685534591197, "grad_norm": 0.14986778795719147, "learning_rate": 3.26571238554981e-05, "loss": 0.3204, "mean_token_accuracy": 0.8628863096237183, "step": 1016 }, { "epoch": 3.19811320754717, "grad_norm": 0.16138505935668945, "learning_rate": 3.264234137517047e-05, "loss": 0.3368, "mean_token_accuracy": 0.8585647344589233, "step": 1017 }, { "epoch": 3.20125786163522, "grad_norm": 0.1323172003030777, "learning_rate": 3.262754785130676e-05, "loss": 0.3126, "mean_token_accuracy": 0.8637034893035889, "step": 1018 }, { "epoch": 3.2044025157232703, "grad_norm": 0.15144748985767365, "learning_rate": 3.261274329925817e-05, "loss": 0.3142, "mean_token_accuracy": 0.863036036491394, "step": 1019 }, { "epoch": 3.207547169811321, "grad_norm": 0.15019363164901733, "learning_rate": 3.259792773438734e-05, "loss": 0.3355, "mean_token_accuracy": 0.8558260202407837, "step": 1020 }, { "epoch": 3.210691823899371, "grad_norm": 0.14695622026920319, "learning_rate": 3.2583101172068366e-05, "loss": 0.3308, "mean_token_accuracy": 0.8598349690437317, "step": 1021 }, { "epoch": 3.2138364779874213, "grad_norm": 0.15687057375907898, "learning_rate": 3.256826362768675e-05, "loss": 0.3112, "mean_token_accuracy": 0.8649905323982239, "step": 1022 }, { "epoch": 3.2169811320754715, "grad_norm": 0.15487264096736908, "learning_rate": 3.255341511663937e-05, "loss": 0.3235, "mean_token_accuracy": 0.8581811785697937, "step": 1023 }, { "epoch": 3.220125786163522, "grad_norm": 0.15453389286994934, "learning_rate": 3.253855565433449e-05, "loss": 0.3157, "mean_token_accuracy": 0.860891580581665, "step": 1024 }, { "epoch": 3.2232704402515724, "grad_norm": 0.1621147245168686, "learning_rate": 3.2523685256191756e-05, "loss": 0.3167, "mean_token_accuracy": 0.8601864576339722, "step": 1025 }, { "epoch": 3.2264150943396226, "grad_norm": 0.14345519244670868, "learning_rate": 3.250880393764215e-05, "loss": 0.314, "mean_token_accuracy": 0.861411988735199, "step": 1026 }, { "epoch": 3.229559748427673, "grad_norm": 0.1536128968000412, "learning_rate": 3.249391171412798e-05, "loss": 0.3488, "mean_token_accuracy": 0.8550587296485901, "step": 1027 }, { "epoch": 3.2327044025157234, "grad_norm": 0.15043888986110687, "learning_rate": 3.247900860110288e-05, "loss": 0.3356, "mean_token_accuracy": 0.8564497232437134, "step": 1028 }, { "epoch": 3.2358490566037736, "grad_norm": 0.1707436740398407, "learning_rate": 3.246409461403178e-05, "loss": 0.3344, "mean_token_accuracy": 0.8586674928665161, "step": 1029 }, { "epoch": 3.238993710691824, "grad_norm": 0.152404323220253, "learning_rate": 3.244916976839089e-05, "loss": 0.3287, "mean_token_accuracy": 0.8581715822219849, "step": 1030 }, { "epoch": 3.242138364779874, "grad_norm": 0.1631406843662262, "learning_rate": 3.243423407966769e-05, "loss": 0.3234, "mean_token_accuracy": 0.8623092770576477, "step": 1031 }, { "epoch": 3.2452830188679247, "grad_norm": 0.15967321395874023, "learning_rate": 3.241928756336093e-05, "loss": 0.3226, "mean_token_accuracy": 0.8575816750526428, "step": 1032 }, { "epoch": 3.248427672955975, "grad_norm": 0.14411385357379913, "learning_rate": 3.240433023498056e-05, "loss": 0.3309, "mean_token_accuracy": 0.8565492630004883, "step": 1033 }, { "epoch": 3.251572327044025, "grad_norm": 0.16305826604366302, "learning_rate": 3.238936211004779e-05, "loss": 0.3238, "mean_token_accuracy": 0.8587154150009155, "step": 1034 }, { "epoch": 3.2547169811320753, "grad_norm": 0.14919938147068024, "learning_rate": 3.237438320409499e-05, "loss": 0.3415, "mean_token_accuracy": 0.8566460609436035, "step": 1035 }, { "epoch": 3.257861635220126, "grad_norm": 0.15982939302921295, "learning_rate": 3.2359393532665744e-05, "loss": 0.3393, "mean_token_accuracy": 0.8567012548446655, "step": 1036 }, { "epoch": 3.261006289308176, "grad_norm": 0.15051038563251495, "learning_rate": 3.234439311131483e-05, "loss": 0.3311, "mean_token_accuracy": 0.8606399297714233, "step": 1037 }, { "epoch": 3.2641509433962264, "grad_norm": 0.15478473901748657, "learning_rate": 3.232938195560812e-05, "loss": 0.3029, "mean_token_accuracy": 0.8636201620101929, "step": 1038 }, { "epoch": 3.2672955974842766, "grad_norm": 0.16665510833263397, "learning_rate": 3.231436008112268e-05, "loss": 0.3252, "mean_token_accuracy": 0.8577843904495239, "step": 1039 }, { "epoch": 3.270440251572327, "grad_norm": 0.14956386387348175, "learning_rate": 3.2299327503446675e-05, "loss": 0.3244, "mean_token_accuracy": 0.8613575100898743, "step": 1040 }, { "epoch": 3.2735849056603774, "grad_norm": 0.15692093968391418, "learning_rate": 3.2284284238179385e-05, "loss": 0.3207, "mean_token_accuracy": 0.8585643172264099, "step": 1041 }, { "epoch": 3.2767295597484276, "grad_norm": 0.14993791282176971, "learning_rate": 3.226923030093117e-05, "loss": 0.3244, "mean_token_accuracy": 0.8617245554924011, "step": 1042 }, { "epoch": 3.279874213836478, "grad_norm": 0.14964789152145386, "learning_rate": 3.225416570732346e-05, "loss": 0.3282, "mean_token_accuracy": 0.8592199087142944, "step": 1043 }, { "epoch": 3.2830188679245285, "grad_norm": 0.1455584019422531, "learning_rate": 3.2239090472988775e-05, "loss": 0.323, "mean_token_accuracy": 0.8582662343978882, "step": 1044 }, { "epoch": 3.2861635220125787, "grad_norm": 0.14278201758861542, "learning_rate": 3.222400461357064e-05, "loss": 0.3016, "mean_token_accuracy": 0.8661797046661377, "step": 1045 }, { "epoch": 3.289308176100629, "grad_norm": 0.13362888991832733, "learning_rate": 3.220890814472363e-05, "loss": 0.3325, "mean_token_accuracy": 0.8578916788101196, "step": 1046 }, { "epoch": 3.292452830188679, "grad_norm": 0.14459533989429474, "learning_rate": 3.219380108211331e-05, "loss": 0.3237, "mean_token_accuracy": 0.8623485565185547, "step": 1047 }, { "epoch": 3.2955974842767297, "grad_norm": 0.13775929808616638, "learning_rate": 3.217868344141625e-05, "loss": 0.3279, "mean_token_accuracy": 0.8596442341804504, "step": 1048 }, { "epoch": 3.29874213836478, "grad_norm": 0.12983611226081848, "learning_rate": 3.2163555238320006e-05, "loss": 0.3241, "mean_token_accuracy": 0.8623194694519043, "step": 1049 }, { "epoch": 3.30188679245283, "grad_norm": 0.13327816128730774, "learning_rate": 3.214841648852308e-05, "loss": 0.3283, "mean_token_accuracy": 0.8586874604225159, "step": 1050 }, { "epoch": 3.3050314465408803, "grad_norm": 0.14740483462810516, "learning_rate": 3.21332672077349e-05, "loss": 0.3289, "mean_token_accuracy": 0.8617115616798401, "step": 1051 }, { "epoch": 3.308176100628931, "grad_norm": 0.1283697932958603, "learning_rate": 3.211810741167588e-05, "loss": 0.3175, "mean_token_accuracy": 0.8588584065437317, "step": 1052 }, { "epoch": 3.311320754716981, "grad_norm": 0.15197114646434784, "learning_rate": 3.210293711607729e-05, "loss": 0.3347, "mean_token_accuracy": 0.8559669852256775, "step": 1053 }, { "epoch": 3.3144654088050314, "grad_norm": 0.13700467348098755, "learning_rate": 3.2087756336681306e-05, "loss": 0.325, "mean_token_accuracy": 0.8605079054832458, "step": 1054 }, { "epoch": 3.3176100628930816, "grad_norm": 0.13200706243515015, "learning_rate": 3.207256508924101e-05, "loss": 0.3317, "mean_token_accuracy": 0.8585512638092041, "step": 1055 }, { "epoch": 3.3207547169811322, "grad_norm": 0.1488047093153, "learning_rate": 3.2057363389520326e-05, "loss": 0.3381, "mean_token_accuracy": 0.8565067052841187, "step": 1056 }, { "epoch": 3.3238993710691824, "grad_norm": 0.1520201414823532, "learning_rate": 3.204215125329402e-05, "loss": 0.3342, "mean_token_accuracy": 0.8572053909301758, "step": 1057 }, { "epoch": 3.3270440251572326, "grad_norm": 0.17158624529838562, "learning_rate": 3.20269286963477e-05, "loss": 0.3316, "mean_token_accuracy": 0.858790934085846, "step": 1058 }, { "epoch": 3.330188679245283, "grad_norm": 0.14773574471473694, "learning_rate": 3.2011695734477776e-05, "loss": 0.3207, "mean_token_accuracy": 0.8638202548027039, "step": 1059 }, { "epoch": 3.3333333333333335, "grad_norm": 0.17465537786483765, "learning_rate": 3.199645238349146e-05, "loss": 0.321, "mean_token_accuracy": 0.8613391518592834, "step": 1060 }, { "epoch": 3.3364779874213837, "grad_norm": 0.13760405778884888, "learning_rate": 3.198119865920677e-05, "loss": 0.3217, "mean_token_accuracy": 0.8583642244338989, "step": 1061 }, { "epoch": 3.339622641509434, "grad_norm": 0.15301382541656494, "learning_rate": 3.196593457745243e-05, "loss": 0.3197, "mean_token_accuracy": 0.8593172430992126, "step": 1062 }, { "epoch": 3.342767295597484, "grad_norm": 0.1372508555650711, "learning_rate": 3.195066015406797e-05, "loss": 0.3318, "mean_token_accuracy": 0.8575015664100647, "step": 1063 }, { "epoch": 3.3459119496855347, "grad_norm": 0.14857687056064606, "learning_rate": 3.193537540490363e-05, "loss": 0.3316, "mean_token_accuracy": 0.8603220582008362, "step": 1064 }, { "epoch": 3.349056603773585, "grad_norm": 0.12988995015621185, "learning_rate": 3.192008034582034e-05, "loss": 0.3467, "mean_token_accuracy": 0.8531395792961121, "step": 1065 }, { "epoch": 3.352201257861635, "grad_norm": 0.15077495574951172, "learning_rate": 3.190477499268978e-05, "loss": 0.3352, "mean_token_accuracy": 0.8567934632301331, "step": 1066 }, { "epoch": 3.3553459119496853, "grad_norm": 0.14799143373966217, "learning_rate": 3.1889459361394266e-05, "loss": 0.2899, "mean_token_accuracy": 0.8670781850814819, "step": 1067 }, { "epoch": 3.358490566037736, "grad_norm": 0.1631443202495575, "learning_rate": 3.1874133467826804e-05, "loss": 0.3139, "mean_token_accuracy": 0.8630610108375549, "step": 1068 }, { "epoch": 3.361635220125786, "grad_norm": 0.1495402604341507, "learning_rate": 3.1858797327891034e-05, "loss": 0.3071, "mean_token_accuracy": 0.8628964424133301, "step": 1069 }, { "epoch": 3.3647798742138364, "grad_norm": 0.1613866537809372, "learning_rate": 3.1843450957501254e-05, "loss": 0.3187, "mean_token_accuracy": 0.8615692257881165, "step": 1070 }, { "epoch": 3.3679245283018866, "grad_norm": 0.15920579433441162, "learning_rate": 3.182809437258235e-05, "loss": 0.3149, "mean_token_accuracy": 0.862976610660553, "step": 1071 }, { "epoch": 3.3710691823899372, "grad_norm": 0.15265892446041107, "learning_rate": 3.181272758906982e-05, "loss": 0.3171, "mean_token_accuracy": 0.8599149584770203, "step": 1072 }, { "epoch": 3.3742138364779874, "grad_norm": 0.15581786632537842, "learning_rate": 3.179735062290974e-05, "loss": 0.3255, "mean_token_accuracy": 0.8590546250343323, "step": 1073 }, { "epoch": 3.3773584905660377, "grad_norm": 0.13360513746738434, "learning_rate": 3.178196349005877e-05, "loss": 0.3146, "mean_token_accuracy": 0.8623788952827454, "step": 1074 }, { "epoch": 3.380503144654088, "grad_norm": 0.1339465230703354, "learning_rate": 3.17665662064841e-05, "loss": 0.3545, "mean_token_accuracy": 0.852672278881073, "step": 1075 }, { "epoch": 3.3836477987421385, "grad_norm": 0.1387154757976532, "learning_rate": 3.175115878816346e-05, "loss": 0.3286, "mean_token_accuracy": 0.8580860495567322, "step": 1076 }, { "epoch": 3.3867924528301887, "grad_norm": 0.14403823018074036, "learning_rate": 3.173574125108508e-05, "loss": 0.3441, "mean_token_accuracy": 0.8584855794906616, "step": 1077 }, { "epoch": 3.389937106918239, "grad_norm": 0.15723450481891632, "learning_rate": 3.172031361124774e-05, "loss": 0.3252, "mean_token_accuracy": 0.8592239022254944, "step": 1078 }, { "epoch": 3.3930817610062896, "grad_norm": 0.13823583722114563, "learning_rate": 3.1704875884660645e-05, "loss": 0.3227, "mean_token_accuracy": 0.860748291015625, "step": 1079 }, { "epoch": 3.3962264150943398, "grad_norm": 0.17623189091682434, "learning_rate": 3.16894280873435e-05, "loss": 0.3334, "mean_token_accuracy": 0.8578466773033142, "step": 1080 }, { "epoch": 3.39937106918239, "grad_norm": 0.15371590852737427, "learning_rate": 3.1673970235326454e-05, "loss": 0.3171, "mean_token_accuracy": 0.8577912449836731, "step": 1081 }, { "epoch": 3.40251572327044, "grad_norm": 0.16374284029006958, "learning_rate": 3.165850234465009e-05, "loss": 0.3333, "mean_token_accuracy": 0.8590589761734009, "step": 1082 }, { "epoch": 3.4056603773584904, "grad_norm": 0.1451641023159027, "learning_rate": 3.16430244313654e-05, "loss": 0.3259, "mean_token_accuracy": 0.8558355569839478, "step": 1083 }, { "epoch": 3.408805031446541, "grad_norm": 0.1551799774169922, "learning_rate": 3.1627536511533795e-05, "loss": 0.3339, "mean_token_accuracy": 0.8557202219963074, "step": 1084 }, { "epoch": 3.411949685534591, "grad_norm": 0.12902113795280457, "learning_rate": 3.1612038601227054e-05, "loss": 0.3279, "mean_token_accuracy": 0.8582240343093872, "step": 1085 }, { "epoch": 3.4150943396226414, "grad_norm": 0.15876270830631256, "learning_rate": 3.159653071652732e-05, "loss": 0.3307, "mean_token_accuracy": 0.8565781712532043, "step": 1086 }, { "epoch": 3.418238993710692, "grad_norm": 0.12267336994409561, "learning_rate": 3.1581012873527095e-05, "loss": 0.327, "mean_token_accuracy": 0.8610835671424866, "step": 1087 }, { "epoch": 3.4213836477987423, "grad_norm": 0.15954187512397766, "learning_rate": 3.156548508832922e-05, "loss": 0.3304, "mean_token_accuracy": 0.8586665391921997, "step": 1088 }, { "epoch": 3.4245283018867925, "grad_norm": 0.1352037936449051, "learning_rate": 3.154994737704684e-05, "loss": 0.3317, "mean_token_accuracy": 0.8586193919181824, "step": 1089 }, { "epoch": 3.4276729559748427, "grad_norm": 0.15149874985218048, "learning_rate": 3.153439975580341e-05, "loss": 0.3375, "mean_token_accuracy": 0.854519248008728, "step": 1090 }, { "epoch": 3.430817610062893, "grad_norm": 0.13168424367904663, "learning_rate": 3.151884224073267e-05, "loss": 0.332, "mean_token_accuracy": 0.8577566742897034, "step": 1091 }, { "epoch": 3.4339622641509435, "grad_norm": 0.13555015623569489, "learning_rate": 3.150327484797861e-05, "loss": 0.3236, "mean_token_accuracy": 0.8604238033294678, "step": 1092 }, { "epoch": 3.4371069182389937, "grad_norm": 0.132098987698555, "learning_rate": 3.1487697593695495e-05, "loss": 0.3232, "mean_token_accuracy": 0.8571335077285767, "step": 1093 }, { "epoch": 3.440251572327044, "grad_norm": 0.12883129715919495, "learning_rate": 3.147211049404779e-05, "loss": 0.3271, "mean_token_accuracy": 0.8585851192474365, "step": 1094 }, { "epoch": 3.4433962264150946, "grad_norm": 0.12580397725105286, "learning_rate": 3.145651356521022e-05, "loss": 0.3376, "mean_token_accuracy": 0.8587803840637207, "step": 1095 }, { "epoch": 3.4465408805031448, "grad_norm": 0.13099335134029388, "learning_rate": 3.1440906823367676e-05, "loss": 0.3191, "mean_token_accuracy": 0.8609188795089722, "step": 1096 }, { "epoch": 3.449685534591195, "grad_norm": 0.1349143236875534, "learning_rate": 3.142529028471525e-05, "loss": 0.3198, "mean_token_accuracy": 0.8595296144485474, "step": 1097 }, { "epoch": 3.452830188679245, "grad_norm": 0.14235574007034302, "learning_rate": 3.140966396545817e-05, "loss": 0.3402, "mean_token_accuracy": 0.854846179485321, "step": 1098 }, { "epoch": 3.4559748427672954, "grad_norm": 0.12889491021633148, "learning_rate": 3.139402788181186e-05, "loss": 0.3474, "mean_token_accuracy": 0.8538563847541809, "step": 1099 }, { "epoch": 3.459119496855346, "grad_norm": 0.15164689719676971, "learning_rate": 3.137838205000185e-05, "loss": 0.3369, "mean_token_accuracy": 0.8573994040489197, "step": 1100 }, { "epoch": 3.4622641509433962, "grad_norm": 0.1371922492980957, "learning_rate": 3.136272648626377e-05, "loss": 0.3334, "mean_token_accuracy": 0.8574711084365845, "step": 1101 }, { "epoch": 3.4654088050314464, "grad_norm": 0.1531357765197754, "learning_rate": 3.1347061206843376e-05, "loss": 0.329, "mean_token_accuracy": 0.8560079336166382, "step": 1102 }, { "epoch": 3.468553459119497, "grad_norm": 0.15599913895130157, "learning_rate": 3.133138622799651e-05, "loss": 0.3245, "mean_token_accuracy": 0.8592775464057922, "step": 1103 }, { "epoch": 3.4716981132075473, "grad_norm": 0.13518376648426056, "learning_rate": 3.131570156598905e-05, "loss": 0.325, "mean_token_accuracy": 0.8589106798171997, "step": 1104 }, { "epoch": 3.4748427672955975, "grad_norm": 0.15266206860542297, "learning_rate": 3.1300007237096944e-05, "loss": 0.3346, "mean_token_accuracy": 0.857874870300293, "step": 1105 }, { "epoch": 3.4779874213836477, "grad_norm": 0.1420784741640091, "learning_rate": 3.128430325760616e-05, "loss": 0.3296, "mean_token_accuracy": 0.8598121404647827, "step": 1106 }, { "epoch": 3.481132075471698, "grad_norm": 0.15022684633731842, "learning_rate": 3.126858964381269e-05, "loss": 0.3211, "mean_token_accuracy": 0.8595227003097534, "step": 1107 }, { "epoch": 3.4842767295597485, "grad_norm": 0.15471023321151733, "learning_rate": 3.125286641202252e-05, "loss": 0.3259, "mean_token_accuracy": 0.8603160381317139, "step": 1108 }, { "epoch": 3.4874213836477987, "grad_norm": 0.1372515708208084, "learning_rate": 3.123713357855163e-05, "loss": 0.3256, "mean_token_accuracy": 0.8592529296875, "step": 1109 }, { "epoch": 3.490566037735849, "grad_norm": 0.14333589375019073, "learning_rate": 3.122139115972591e-05, "loss": 0.3216, "mean_token_accuracy": 0.8592953681945801, "step": 1110 }, { "epoch": 3.4937106918238996, "grad_norm": 0.13496816158294678, "learning_rate": 3.120563917188127e-05, "loss": 0.3319, "mean_token_accuracy": 0.8571884632110596, "step": 1111 }, { "epoch": 3.49685534591195, "grad_norm": 0.13933950662612915, "learning_rate": 3.1189877631363514e-05, "loss": 0.3262, "mean_token_accuracy": 0.8581831455230713, "step": 1112 }, { "epoch": 3.5, "grad_norm": 0.13966146111488342, "learning_rate": 3.117410655452835e-05, "loss": 0.3428, "mean_token_accuracy": 0.8562673926353455, "step": 1113 }, { "epoch": 3.50314465408805, "grad_norm": 0.13303886353969574, "learning_rate": 3.115832595774139e-05, "loss": 0.3212, "mean_token_accuracy": 0.8587968349456787, "step": 1114 }, { "epoch": 3.5062893081761004, "grad_norm": 0.14135101437568665, "learning_rate": 3.114253585737813e-05, "loss": 0.3278, "mean_token_accuracy": 0.8600554466247559, "step": 1115 }, { "epoch": 3.509433962264151, "grad_norm": 0.1329166740179062, "learning_rate": 3.112673626982394e-05, "loss": 0.3426, "mean_token_accuracy": 0.853922963142395, "step": 1116 }, { "epoch": 3.5125786163522013, "grad_norm": 0.1364695280790329, "learning_rate": 3.1110927211474e-05, "loss": 0.343, "mean_token_accuracy": 0.8539784550666809, "step": 1117 }, { "epoch": 3.5157232704402515, "grad_norm": 0.1294035166501999, "learning_rate": 3.109510869873335e-05, "loss": 0.3456, "mean_token_accuracy": 0.8523107767105103, "step": 1118 }, { "epoch": 3.518867924528302, "grad_norm": 0.1287209391593933, "learning_rate": 3.107928074801682e-05, "loss": 0.3331, "mean_token_accuracy": 0.8596871495246887, "step": 1119 }, { "epoch": 3.5220125786163523, "grad_norm": 0.12785553932189941, "learning_rate": 3.106344337574904e-05, "loss": 0.3308, "mean_token_accuracy": 0.8588385581970215, "step": 1120 }, { "epoch": 3.5251572327044025, "grad_norm": 0.14281029999256134, "learning_rate": 3.1047596598364436e-05, "loss": 0.3362, "mean_token_accuracy": 0.8574225902557373, "step": 1121 }, { "epoch": 3.5283018867924527, "grad_norm": 0.13183027505874634, "learning_rate": 3.1031740432307164e-05, "loss": 0.3127, "mean_token_accuracy": 0.8617753386497498, "step": 1122 }, { "epoch": 3.531446540880503, "grad_norm": 0.13874752819538116, "learning_rate": 3.1015874894031144e-05, "loss": 0.3334, "mean_token_accuracy": 0.8580701351165771, "step": 1123 }, { "epoch": 3.5345911949685536, "grad_norm": 0.12882445752620697, "learning_rate": 3.1e-05, "loss": 0.3373, "mean_token_accuracy": 0.8580555319786072, "step": 1124 }, { "epoch": 3.5377358490566038, "grad_norm": 0.14017626643180847, "learning_rate": 3.0984115766687096e-05, "loss": 0.3346, "mean_token_accuracy": 0.8570591807365417, "step": 1125 }, { "epoch": 3.540880503144654, "grad_norm": 0.12685616314411163, "learning_rate": 3.0968222210575446e-05, "loss": 0.3221, "mean_token_accuracy": 0.8617547154426575, "step": 1126 }, { "epoch": 3.5440251572327046, "grad_norm": 0.1380213052034378, "learning_rate": 3.095231934815777e-05, "loss": 0.3239, "mean_token_accuracy": 0.8619950413703918, "step": 1127 }, { "epoch": 3.547169811320755, "grad_norm": 0.136666402220726, "learning_rate": 3.0936407195936444e-05, "loss": 0.329, "mean_token_accuracy": 0.8563064336776733, "step": 1128 }, { "epoch": 3.550314465408805, "grad_norm": 0.1525544971227646, "learning_rate": 3.092048577042347e-05, "loss": 0.3268, "mean_token_accuracy": 0.8579536080360413, "step": 1129 }, { "epoch": 3.5534591194968552, "grad_norm": 0.14374618232250214, "learning_rate": 3.090455508814047e-05, "loss": 0.3191, "mean_token_accuracy": 0.861701250076294, "step": 1130 }, { "epoch": 3.5566037735849054, "grad_norm": 0.1475425511598587, "learning_rate": 3.088861516561871e-05, "loss": 0.3168, "mean_token_accuracy": 0.8611730933189392, "step": 1131 }, { "epoch": 3.559748427672956, "grad_norm": 0.1540241688489914, "learning_rate": 3.087266601939898e-05, "loss": 0.3125, "mean_token_accuracy": 0.8643029928207397, "step": 1132 }, { "epoch": 3.5628930817610063, "grad_norm": 0.14442327618598938, "learning_rate": 3.0856707666031694e-05, "loss": 0.3327, "mean_token_accuracy": 0.8581700921058655, "step": 1133 }, { "epoch": 3.5660377358490565, "grad_norm": 0.15339228510856628, "learning_rate": 3.084074012207681e-05, "loss": 0.3034, "mean_token_accuracy": 0.8633737564086914, "step": 1134 }, { "epoch": 3.569182389937107, "grad_norm": 0.14664144814014435, "learning_rate": 3.0824763404103795e-05, "loss": 0.3392, "mean_token_accuracy": 0.8558675646781921, "step": 1135 }, { "epoch": 3.5723270440251573, "grad_norm": 0.16255716979503632, "learning_rate": 3.080877752869168e-05, "loss": 0.3353, "mean_token_accuracy": 0.8562329411506653, "step": 1136 }, { "epoch": 3.5754716981132075, "grad_norm": 0.1473674178123474, "learning_rate": 3.079278251242896e-05, "loss": 0.336, "mean_token_accuracy": 0.8567706942558289, "step": 1137 }, { "epoch": 3.5786163522012577, "grad_norm": 0.1516093611717224, "learning_rate": 3.0776778371913634e-05, "loss": 0.305, "mean_token_accuracy": 0.8626529574394226, "step": 1138 }, { "epoch": 3.581761006289308, "grad_norm": 0.1458694338798523, "learning_rate": 3.076076512375317e-05, "loss": 0.3435, "mean_token_accuracy": 0.8568443059921265, "step": 1139 }, { "epoch": 3.5849056603773586, "grad_norm": 0.14711114764213562, "learning_rate": 3.0744742784564476e-05, "loss": 0.3441, "mean_token_accuracy": 0.8580071926116943, "step": 1140 }, { "epoch": 3.588050314465409, "grad_norm": 0.1530224233865738, "learning_rate": 3.0728711370973915e-05, "loss": 0.3328, "mean_token_accuracy": 0.8571670055389404, "step": 1141 }, { "epoch": 3.591194968553459, "grad_norm": 0.1374947428703308, "learning_rate": 3.071267089961724e-05, "loss": 0.3153, "mean_token_accuracy": 0.8622951507568359, "step": 1142 }, { "epoch": 3.5943396226415096, "grad_norm": 0.15225058794021606, "learning_rate": 3.069662138713962e-05, "loss": 0.3218, "mean_token_accuracy": 0.8582719564437866, "step": 1143 }, { "epoch": 3.59748427672956, "grad_norm": 0.13568712770938873, "learning_rate": 3.0680562850195594e-05, "loss": 0.3311, "mean_token_accuracy": 0.8576697707176208, "step": 1144 }, { "epoch": 3.60062893081761, "grad_norm": 0.16287925839424133, "learning_rate": 3.0664495305449084e-05, "loss": 0.3225, "mean_token_accuracy": 0.8594067692756653, "step": 1145 }, { "epoch": 3.6037735849056602, "grad_norm": 0.14078298211097717, "learning_rate": 3.064841876957335e-05, "loss": 0.3139, "mean_token_accuracy": 0.8635094165802002, "step": 1146 }, { "epoch": 3.6069182389937104, "grad_norm": 0.1636054962873459, "learning_rate": 3.0632333259250966e-05, "loss": 0.3261, "mean_token_accuracy": 0.8589130640029907, "step": 1147 }, { "epoch": 3.610062893081761, "grad_norm": 0.1595996916294098, "learning_rate": 3.0616238791173846e-05, "loss": 0.34, "mean_token_accuracy": 0.8539798855781555, "step": 1148 }, { "epoch": 3.6132075471698113, "grad_norm": 0.13862352073192596, "learning_rate": 3.060013538204319e-05, "loss": 0.3185, "mean_token_accuracy": 0.8629903197288513, "step": 1149 }, { "epoch": 3.6163522012578615, "grad_norm": 0.14423629641532898, "learning_rate": 3.058402304856946e-05, "loss": 0.332, "mean_token_accuracy": 0.8597278594970703, "step": 1150 }, { "epoch": 3.619496855345912, "grad_norm": 0.1510828286409378, "learning_rate": 3.0567901807472394e-05, "loss": 0.3137, "mean_token_accuracy": 0.8636487126350403, "step": 1151 }, { "epoch": 3.6226415094339623, "grad_norm": 0.14903636276721954, "learning_rate": 3.055177167548098e-05, "loss": 0.3312, "mean_token_accuracy": 0.8585656881332397, "step": 1152 }, { "epoch": 3.6257861635220126, "grad_norm": 0.1466004103422165, "learning_rate": 3.053563266933343e-05, "loss": 0.3295, "mean_token_accuracy": 0.8588059544563293, "step": 1153 }, { "epoch": 3.6289308176100628, "grad_norm": 0.14094513654708862, "learning_rate": 3.051948480577714e-05, "loss": 0.3323, "mean_token_accuracy": 0.8572471737861633, "step": 1154 }, { "epoch": 3.632075471698113, "grad_norm": 0.13098470866680145, "learning_rate": 3.0503328101568713e-05, "loss": 0.336, "mean_token_accuracy": 0.8568713665008545, "step": 1155 }, { "epoch": 3.6352201257861636, "grad_norm": 0.1380501538515091, "learning_rate": 3.0487162573473944e-05, "loss": 0.3424, "mean_token_accuracy": 0.8545725345611572, "step": 1156 }, { "epoch": 3.638364779874214, "grad_norm": 0.13335196673870087, "learning_rate": 3.047098823826776e-05, "loss": 0.3258, "mean_token_accuracy": 0.8585770130157471, "step": 1157 }, { "epoch": 3.641509433962264, "grad_norm": 0.14133410155773163, "learning_rate": 3.0454805112734243e-05, "loss": 0.3433, "mean_token_accuracy": 0.8555521965026855, "step": 1158 }, { "epoch": 3.6446540880503147, "grad_norm": 0.1298840194940567, "learning_rate": 3.043861321366658e-05, "loss": 0.3353, "mean_token_accuracy": 0.859320342540741, "step": 1159 }, { "epoch": 3.647798742138365, "grad_norm": 0.1415969878435135, "learning_rate": 3.042241255786708e-05, "loss": 0.3397, "mean_token_accuracy": 0.8569203615188599, "step": 1160 }, { "epoch": 3.650943396226415, "grad_norm": 0.1243518739938736, "learning_rate": 3.040620316214713e-05, "loss": 0.3227, "mean_token_accuracy": 0.86089026927948, "step": 1161 }, { "epoch": 3.6540880503144653, "grad_norm": 0.16862568259239197, "learning_rate": 3.0389985043327183e-05, "loss": 0.35, "mean_token_accuracy": 0.8550198674201965, "step": 1162 }, { "epoch": 3.6572327044025155, "grad_norm": 0.1371821016073227, "learning_rate": 3.0373758218236747e-05, "loss": 0.3112, "mean_token_accuracy": 0.8630196452140808, "step": 1163 }, { "epoch": 3.660377358490566, "grad_norm": 0.14441686868667603, "learning_rate": 3.0357522703714365e-05, "loss": 0.3322, "mean_token_accuracy": 0.8588864207267761, "step": 1164 }, { "epoch": 3.6635220125786163, "grad_norm": 0.14246846735477448, "learning_rate": 3.03412785166076e-05, "loss": 0.3236, "mean_token_accuracy": 0.8609263300895691, "step": 1165 }, { "epoch": 3.6666666666666665, "grad_norm": 0.14187541604042053, "learning_rate": 3.032502567377302e-05, "loss": 0.3287, "mean_token_accuracy": 0.8563394546508789, "step": 1166 }, { "epoch": 3.669811320754717, "grad_norm": 0.1399676501750946, "learning_rate": 3.030876419207616e-05, "loss": 0.3193, "mean_token_accuracy": 0.8611453771591187, "step": 1167 }, { "epoch": 3.6729559748427674, "grad_norm": 0.15003567934036255, "learning_rate": 3.0292494088391522e-05, "loss": 0.3436, "mean_token_accuracy": 0.8553731441497803, "step": 1168 }, { "epoch": 3.6761006289308176, "grad_norm": 0.1415679007768631, "learning_rate": 3.027621537960257e-05, "loss": 0.3343, "mean_token_accuracy": 0.8586793541908264, "step": 1169 }, { "epoch": 3.6792452830188678, "grad_norm": 0.13909806311130524, "learning_rate": 3.0259928082601675e-05, "loss": 0.3342, "mean_token_accuracy": 0.8584513664245605, "step": 1170 }, { "epoch": 3.682389937106918, "grad_norm": 0.1442382037639618, "learning_rate": 3.0243632214290137e-05, "loss": 0.3401, "mean_token_accuracy": 0.854636549949646, "step": 1171 }, { "epoch": 3.6855345911949686, "grad_norm": 0.1362895667552948, "learning_rate": 3.022732779157815e-05, "loss": 0.3486, "mean_token_accuracy": 0.8539217710494995, "step": 1172 }, { "epoch": 3.688679245283019, "grad_norm": 0.14030933380126953, "learning_rate": 3.0211014831384773e-05, "loss": 0.3305, "mean_token_accuracy": 0.8588977456092834, "step": 1173 }, { "epoch": 3.691823899371069, "grad_norm": 0.14149655401706696, "learning_rate": 3.0194693350637943e-05, "loss": 0.331, "mean_token_accuracy": 0.8578828573226929, "step": 1174 }, { "epoch": 3.6949685534591197, "grad_norm": 0.1377638727426529, "learning_rate": 3.017836336627441e-05, "loss": 0.3356, "mean_token_accuracy": 0.854766845703125, "step": 1175 }, { "epoch": 3.69811320754717, "grad_norm": 0.1417425572872162, "learning_rate": 3.0162024895239787e-05, "loss": 0.3372, "mean_token_accuracy": 0.8555588126182556, "step": 1176 }, { "epoch": 3.70125786163522, "grad_norm": 0.14174726605415344, "learning_rate": 3.0145677954488457e-05, "loss": 0.3338, "mean_token_accuracy": 0.856699526309967, "step": 1177 }, { "epoch": 3.7044025157232703, "grad_norm": 0.13998551666736603, "learning_rate": 3.0129322560983616e-05, "loss": 0.326, "mean_token_accuracy": 0.8607649803161621, "step": 1178 }, { "epoch": 3.7075471698113205, "grad_norm": 0.1254308521747589, "learning_rate": 3.0112958731697214e-05, "loss": 0.3512, "mean_token_accuracy": 0.8531991243362427, "step": 1179 }, { "epoch": 3.710691823899371, "grad_norm": 0.1437944918870926, "learning_rate": 3.0096586483609966e-05, "loss": 0.3354, "mean_token_accuracy": 0.8561474680900574, "step": 1180 }, { "epoch": 3.7138364779874213, "grad_norm": 0.1321929395198822, "learning_rate": 3.0080205833711325e-05, "loss": 0.3284, "mean_token_accuracy": 0.8593660593032837, "step": 1181 }, { "epoch": 3.7169811320754715, "grad_norm": 0.15401658415794373, "learning_rate": 3.0063816798999456e-05, "loss": 0.3314, "mean_token_accuracy": 0.8591458201408386, "step": 1182 }, { "epoch": 3.720125786163522, "grad_norm": 0.14057710766792297, "learning_rate": 3.0047419396481227e-05, "loss": 0.3475, "mean_token_accuracy": 0.8535549640655518, "step": 1183 }, { "epoch": 3.7232704402515724, "grad_norm": 0.13213631510734558, "learning_rate": 3.0031013643172185e-05, "loss": 0.3159, "mean_token_accuracy": 0.8608952164649963, "step": 1184 }, { "epoch": 3.7264150943396226, "grad_norm": 0.1514212042093277, "learning_rate": 3.0014599556096557e-05, "loss": 0.3384, "mean_token_accuracy": 0.8577061295509338, "step": 1185 }, { "epoch": 3.729559748427673, "grad_norm": 0.14160504937171936, "learning_rate": 2.9998177152287204e-05, "loss": 0.3297, "mean_token_accuracy": 0.8586458563804626, "step": 1186 }, { "epoch": 3.732704402515723, "grad_norm": 0.14224717020988464, "learning_rate": 2.9981746448785615e-05, "loss": 0.3203, "mean_token_accuracy": 0.8611958622932434, "step": 1187 }, { "epoch": 3.7358490566037736, "grad_norm": 0.12940874695777893, "learning_rate": 2.9965307462641906e-05, "loss": 0.3384, "mean_token_accuracy": 0.8546769618988037, "step": 1188 }, { "epoch": 3.738993710691824, "grad_norm": 0.1318395733833313, "learning_rate": 2.9948860210914773e-05, "loss": 0.3339, "mean_token_accuracy": 0.8579802513122559, "step": 1189 }, { "epoch": 3.742138364779874, "grad_norm": 0.14659515023231506, "learning_rate": 2.9932404710671504e-05, "loss": 0.3371, "mean_token_accuracy": 0.8568556308746338, "step": 1190 }, { "epoch": 3.7452830188679247, "grad_norm": 0.1241716668009758, "learning_rate": 2.9915940978987926e-05, "loss": 0.3199, "mean_token_accuracy": 0.8621662855148315, "step": 1191 }, { "epoch": 3.748427672955975, "grad_norm": 0.14975996315479279, "learning_rate": 2.989946903294843e-05, "loss": 0.3447, "mean_token_accuracy": 0.8569864630699158, "step": 1192 }, { "epoch": 3.751572327044025, "grad_norm": 0.12985654175281525, "learning_rate": 2.9882988889645913e-05, "loss": 0.3268, "mean_token_accuracy": 0.8582627773284912, "step": 1193 }, { "epoch": 3.7547169811320753, "grad_norm": 0.14152680337429047, "learning_rate": 2.9866500566181802e-05, "loss": 0.3121, "mean_token_accuracy": 0.8632294535636902, "step": 1194 }, { "epoch": 3.757861635220126, "grad_norm": 0.14638929069042206, "learning_rate": 2.985000407966598e-05, "loss": 0.3317, "mean_token_accuracy": 0.8586151003837585, "step": 1195 }, { "epoch": 3.761006289308176, "grad_norm": 0.1430143564939499, "learning_rate": 2.9833499447216824e-05, "loss": 0.3387, "mean_token_accuracy": 0.8545314073562622, "step": 1196 }, { "epoch": 3.7641509433962264, "grad_norm": 0.14943763613700867, "learning_rate": 2.9816986685961156e-05, "loss": 0.3322, "mean_token_accuracy": 0.858390748500824, "step": 1197 }, { "epoch": 3.767295597484277, "grad_norm": 0.14059577882289886, "learning_rate": 2.9800465813034242e-05, "loss": 0.3294, "mean_token_accuracy": 0.8604702949523926, "step": 1198 }, { "epoch": 3.770440251572327, "grad_norm": 0.15087758004665375, "learning_rate": 2.9783936845579747e-05, "loss": 0.3381, "mean_token_accuracy": 0.8561389446258545, "step": 1199 }, { "epoch": 3.7735849056603774, "grad_norm": 0.14814791083335876, "learning_rate": 2.9767399800749755e-05, "loss": 0.3311, "mean_token_accuracy": 0.8588146567344666, "step": 1200 }, { "epoch": 3.7767295597484276, "grad_norm": 0.13886821269989014, "learning_rate": 2.9750854695704715e-05, "loss": 0.332, "mean_token_accuracy": 0.8596969842910767, "step": 1201 }, { "epoch": 3.779874213836478, "grad_norm": 0.15242241322994232, "learning_rate": 2.9734301547613465e-05, "loss": 0.3277, "mean_token_accuracy": 0.8580199480056763, "step": 1202 }, { "epoch": 3.7830188679245285, "grad_norm": 0.13634704053401947, "learning_rate": 2.971774037365316e-05, "loss": 0.3357, "mean_token_accuracy": 0.8593721985816956, "step": 1203 }, { "epoch": 3.7861635220125787, "grad_norm": 0.14197935163974762, "learning_rate": 2.9701171191009304e-05, "loss": 0.3378, "mean_token_accuracy": 0.8571075797080994, "step": 1204 }, { "epoch": 3.789308176100629, "grad_norm": 0.1357307732105255, "learning_rate": 2.96845940168757e-05, "loss": 0.3152, "mean_token_accuracy": 0.8626152276992798, "step": 1205 }, { "epoch": 3.7924528301886795, "grad_norm": 0.1317468136548996, "learning_rate": 2.966800886845445e-05, "loss": 0.3392, "mean_token_accuracy": 0.8555001616477966, "step": 1206 }, { "epoch": 3.7955974842767297, "grad_norm": 0.14835762977600098, "learning_rate": 2.9651415762955925e-05, "loss": 0.344, "mean_token_accuracy": 0.8575615882873535, "step": 1207 }, { "epoch": 3.79874213836478, "grad_norm": 0.14677360653877258, "learning_rate": 2.9634814717598762e-05, "loss": 0.3227, "mean_token_accuracy": 0.8616207838058472, "step": 1208 }, { "epoch": 3.80188679245283, "grad_norm": 0.13944582641124725, "learning_rate": 2.961820574960982e-05, "loss": 0.3467, "mean_token_accuracy": 0.8558523058891296, "step": 1209 }, { "epoch": 3.8050314465408803, "grad_norm": 0.1433691531419754, "learning_rate": 2.9601588876224215e-05, "loss": 0.3417, "mean_token_accuracy": 0.8568807244300842, "step": 1210 }, { "epoch": 3.808176100628931, "grad_norm": 0.14389808475971222, "learning_rate": 2.958496411468522e-05, "loss": 0.3118, "mean_token_accuracy": 0.8629230260848999, "step": 1211 }, { "epoch": 3.811320754716981, "grad_norm": 0.14503751695156097, "learning_rate": 2.956833148224433e-05, "loss": 0.3369, "mean_token_accuracy": 0.8592314720153809, "step": 1212 }, { "epoch": 3.8144654088050314, "grad_norm": 0.14210133254528046, "learning_rate": 2.955169099616119e-05, "loss": 0.3301, "mean_token_accuracy": 0.8592085242271423, "step": 1213 }, { "epoch": 3.817610062893082, "grad_norm": 0.12544848024845123, "learning_rate": 2.95350426737036e-05, "loss": 0.3211, "mean_token_accuracy": 0.8602308034896851, "step": 1214 }, { "epoch": 3.8207547169811322, "grad_norm": 0.15146970748901367, "learning_rate": 2.9518386532147487e-05, "loss": 0.3319, "mean_token_accuracy": 0.857859194278717, "step": 1215 }, { "epoch": 3.8238993710691824, "grad_norm": 0.13156718015670776, "learning_rate": 2.95017225887769e-05, "loss": 0.3218, "mean_token_accuracy": 0.8625040650367737, "step": 1216 }, { "epoch": 3.8270440251572326, "grad_norm": 0.14363285899162292, "learning_rate": 2.948505086088397e-05, "loss": 0.3457, "mean_token_accuracy": 0.8564530611038208, "step": 1217 }, { "epoch": 3.830188679245283, "grad_norm": 0.12839969992637634, "learning_rate": 2.9468371365768926e-05, "loss": 0.3099, "mean_token_accuracy": 0.8643601536750793, "step": 1218 }, { "epoch": 3.8333333333333335, "grad_norm": 0.12974990904331207, "learning_rate": 2.9451684120740038e-05, "loss": 0.3274, "mean_token_accuracy": 0.8582882285118103, "step": 1219 }, { "epoch": 3.8364779874213837, "grad_norm": 0.14015363156795502, "learning_rate": 2.943498914311364e-05, "loss": 0.3368, "mean_token_accuracy": 0.8556147217750549, "step": 1220 }, { "epoch": 3.839622641509434, "grad_norm": 0.13860003650188446, "learning_rate": 2.941828645021406e-05, "loss": 0.3375, "mean_token_accuracy": 0.8543303608894348, "step": 1221 }, { "epoch": 3.8427672955974845, "grad_norm": 0.14258892834186554, "learning_rate": 2.9401576059373656e-05, "loss": 0.3259, "mean_token_accuracy": 0.8594518303871155, "step": 1222 }, { "epoch": 3.8459119496855347, "grad_norm": 0.13030867278575897, "learning_rate": 2.9384857987932768e-05, "loss": 0.3162, "mean_token_accuracy": 0.8623016476631165, "step": 1223 }, { "epoch": 3.849056603773585, "grad_norm": 0.13517674803733826, "learning_rate": 2.9368132253239702e-05, "loss": 0.3254, "mean_token_accuracy": 0.8589973449707031, "step": 1224 }, { "epoch": 3.852201257861635, "grad_norm": 0.12886177003383636, "learning_rate": 2.935139887265072e-05, "loss": 0.3354, "mean_token_accuracy": 0.8530575037002563, "step": 1225 }, { "epoch": 3.8553459119496853, "grad_norm": 0.14796893298625946, "learning_rate": 2.9334657863530016e-05, "loss": 0.3325, "mean_token_accuracy": 0.8576770424842834, "step": 1226 }, { "epoch": 3.858490566037736, "grad_norm": 0.1297404170036316, "learning_rate": 2.9317909243249706e-05, "loss": 0.3436, "mean_token_accuracy": 0.8564417958259583, "step": 1227 }, { "epoch": 3.861635220125786, "grad_norm": 0.1457653045654297, "learning_rate": 2.9301153029189794e-05, "loss": 0.3387, "mean_token_accuracy": 0.8541455268859863, "step": 1228 }, { "epoch": 3.8647798742138364, "grad_norm": 0.135748028755188, "learning_rate": 2.928438923873816e-05, "loss": 0.3194, "mean_token_accuracy": 0.8600180745124817, "step": 1229 }, { "epoch": 3.867924528301887, "grad_norm": 0.14161410927772522, "learning_rate": 2.926761788929058e-05, "loss": 0.3369, "mean_token_accuracy": 0.8556387424468994, "step": 1230 }, { "epoch": 3.8710691823899372, "grad_norm": 0.13314643502235413, "learning_rate": 2.9250838998250638e-05, "loss": 0.3317, "mean_token_accuracy": 0.8579115867614746, "step": 1231 }, { "epoch": 3.8742138364779874, "grad_norm": 0.14150525629520416, "learning_rate": 2.9234052583029746e-05, "loss": 0.3222, "mean_token_accuracy": 0.8584760427474976, "step": 1232 }, { "epoch": 3.8773584905660377, "grad_norm": 0.13233336806297302, "learning_rate": 2.9217258661047142e-05, "loss": 0.3517, "mean_token_accuracy": 0.852990448474884, "step": 1233 }, { "epoch": 3.880503144654088, "grad_norm": 0.15107811987400055, "learning_rate": 2.920045724972985e-05, "loss": 0.3305, "mean_token_accuracy": 0.8559724688529968, "step": 1234 }, { "epoch": 3.8836477987421385, "grad_norm": 0.16093167662620544, "learning_rate": 2.9183648366512648e-05, "loss": 0.3315, "mean_token_accuracy": 0.8561708927154541, "step": 1235 }, { "epoch": 3.8867924528301887, "grad_norm": 0.14102764427661896, "learning_rate": 2.9166832028838085e-05, "loss": 0.332, "mean_token_accuracy": 0.856883704662323, "step": 1236 }, { "epoch": 3.889937106918239, "grad_norm": 0.15067261457443237, "learning_rate": 2.915000825415644e-05, "loss": 0.3338, "mean_token_accuracy": 0.858379065990448, "step": 1237 }, { "epoch": 3.8930817610062896, "grad_norm": 0.1495073437690735, "learning_rate": 2.9133177059925715e-05, "loss": 0.3218, "mean_token_accuracy": 0.8599880933761597, "step": 1238 }, { "epoch": 3.8962264150943398, "grad_norm": 0.14388205111026764, "learning_rate": 2.9116338463611596e-05, "loss": 0.348, "mean_token_accuracy": 0.8538846373558044, "step": 1239 }, { "epoch": 3.89937106918239, "grad_norm": 0.1329399049282074, "learning_rate": 2.9099492482687478e-05, "loss": 0.3267, "mean_token_accuracy": 0.8591561317443848, "step": 1240 }, { "epoch": 3.90251572327044, "grad_norm": 0.14387187361717224, "learning_rate": 2.9082639134634378e-05, "loss": 0.3285, "mean_token_accuracy": 0.8577826023101807, "step": 1241 }, { "epoch": 3.9056603773584904, "grad_norm": 0.1352883279323578, "learning_rate": 2.9065778436941003e-05, "loss": 0.3314, "mean_token_accuracy": 0.8583676218986511, "step": 1242 }, { "epoch": 3.908805031446541, "grad_norm": 0.1337868720293045, "learning_rate": 2.904891040710365e-05, "loss": 0.3267, "mean_token_accuracy": 0.857197105884552, "step": 1243 }, { "epoch": 3.911949685534591, "grad_norm": 0.13829989731311798, "learning_rate": 2.903203506262624e-05, "loss": 0.3357, "mean_token_accuracy": 0.85539710521698, "step": 1244 }, { "epoch": 3.9150943396226414, "grad_norm": 0.1476396769285202, "learning_rate": 2.9015152421020296e-05, "loss": 0.3344, "mean_token_accuracy": 0.857477068901062, "step": 1245 }, { "epoch": 3.918238993710692, "grad_norm": 0.14620055258274078, "learning_rate": 2.899826249980489e-05, "loss": 0.3423, "mean_token_accuracy": 0.8560461401939392, "step": 1246 }, { "epoch": 3.9213836477987423, "grad_norm": 0.12422426789999008, "learning_rate": 2.898136531650666e-05, "loss": 0.3546, "mean_token_accuracy": 0.8542535901069641, "step": 1247 }, { "epoch": 3.9245283018867925, "grad_norm": 0.13504689931869507, "learning_rate": 2.8964460888659786e-05, "loss": 0.3391, "mean_token_accuracy": 0.858195424079895, "step": 1248 }, { "epoch": 3.9276729559748427, "grad_norm": 0.13244280219078064, "learning_rate": 2.8947549233805953e-05, "loss": 0.3121, "mean_token_accuracy": 0.8605561852455139, "step": 1249 }, { "epoch": 3.930817610062893, "grad_norm": 0.13584861159324646, "learning_rate": 2.893063036949435e-05, "loss": 0.3304, "mean_token_accuracy": 0.8578067421913147, "step": 1250 }, { "epoch": 3.9339622641509435, "grad_norm": 0.14337222278118134, "learning_rate": 2.891370431328165e-05, "loss": 0.3347, "mean_token_accuracy": 0.8570818305015564, "step": 1251 }, { "epoch": 3.9371069182389937, "grad_norm": 0.1409681886434555, "learning_rate": 2.8896771082731986e-05, "loss": 0.3374, "mean_token_accuracy": 0.8561692833900452, "step": 1252 }, { "epoch": 3.940251572327044, "grad_norm": 0.13311010599136353, "learning_rate": 2.8879830695416933e-05, "loss": 0.3386, "mean_token_accuracy": 0.8557576537132263, "step": 1253 }, { "epoch": 3.9433962264150946, "grad_norm": 0.13869957625865936, "learning_rate": 2.8862883168915508e-05, "loss": 0.3236, "mean_token_accuracy": 0.8604661822319031, "step": 1254 }, { "epoch": 3.9465408805031448, "grad_norm": 0.15289968252182007, "learning_rate": 2.884592852081412e-05, "loss": 0.3256, "mean_token_accuracy": 0.8590101003646851, "step": 1255 }, { "epoch": 3.949685534591195, "grad_norm": 0.12691056728363037, "learning_rate": 2.882896676870657e-05, "loss": 0.327, "mean_token_accuracy": 0.8594552874565125, "step": 1256 }, { "epoch": 3.952830188679245, "grad_norm": 0.13348102569580078, "learning_rate": 2.8811997930194032e-05, "loss": 0.3335, "mean_token_accuracy": 0.8576066493988037, "step": 1257 }, { "epoch": 3.9559748427672954, "grad_norm": 0.13760992884635925, "learning_rate": 2.8795022022885043e-05, "loss": 0.3247, "mean_token_accuracy": 0.8583594560623169, "step": 1258 }, { "epoch": 3.959119496855346, "grad_norm": 0.12815621495246887, "learning_rate": 2.8778039064395464e-05, "loss": 0.332, "mean_token_accuracy": 0.8571073412895203, "step": 1259 }, { "epoch": 3.9622641509433962, "grad_norm": 0.14194735884666443, "learning_rate": 2.8761049072348478e-05, "loss": 0.3321, "mean_token_accuracy": 0.8575592637062073, "step": 1260 }, { "epoch": 3.9654088050314464, "grad_norm": 0.12652727961540222, "learning_rate": 2.874405206437455e-05, "loss": 0.3126, "mean_token_accuracy": 0.8608962297439575, "step": 1261 }, { "epoch": 3.968553459119497, "grad_norm": 0.13028152287006378, "learning_rate": 2.8727048058111467e-05, "loss": 0.3395, "mean_token_accuracy": 0.8562440872192383, "step": 1262 }, { "epoch": 3.9716981132075473, "grad_norm": 0.1258905678987503, "learning_rate": 2.8710037071204235e-05, "loss": 0.3402, "mean_token_accuracy": 0.8553774356842041, "step": 1263 }, { "epoch": 3.9748427672955975, "grad_norm": 0.1402544230222702, "learning_rate": 2.8693019121305123e-05, "loss": 0.3403, "mean_token_accuracy": 0.8573205471038818, "step": 1264 }, { "epoch": 3.9779874213836477, "grad_norm": 0.13947366178035736, "learning_rate": 2.867599422607363e-05, "loss": 0.3292, "mean_token_accuracy": 0.8590210676193237, "step": 1265 }, { "epoch": 3.981132075471698, "grad_norm": 0.13215205073356628, "learning_rate": 2.865896240317645e-05, "loss": 0.3266, "mean_token_accuracy": 0.8595618605613708, "step": 1266 }, { "epoch": 3.9842767295597485, "grad_norm": 0.1323062777519226, "learning_rate": 2.8641923670287465e-05, "loss": 0.3276, "mean_token_accuracy": 0.8571467995643616, "step": 1267 }, { "epoch": 3.9874213836477987, "grad_norm": 0.12528589367866516, "learning_rate": 2.8624878045087744e-05, "loss": 0.3238, "mean_token_accuracy": 0.8595758080482483, "step": 1268 }, { "epoch": 3.990566037735849, "grad_norm": 0.14454223215579987, "learning_rate": 2.8607825545265492e-05, "loss": 0.3193, "mean_token_accuracy": 0.8578279614448547, "step": 1269 }, { "epoch": 3.9937106918238996, "grad_norm": 0.13367754220962524, "learning_rate": 2.8590766188516047e-05, "loss": 0.324, "mean_token_accuracy": 0.8594267964363098, "step": 1270 }, { "epoch": 3.99685534591195, "grad_norm": 0.14187419414520264, "learning_rate": 2.8573699992541892e-05, "loss": 0.3318, "mean_token_accuracy": 0.8586729168891907, "step": 1271 }, { "epoch": 4.0, "grad_norm": 0.1359594166278839, "learning_rate": 2.8556626975052563e-05, "loss": 0.3187, "mean_token_accuracy": 0.8581359386444092, "step": 1272 }, { "epoch": 4.00314465408805, "grad_norm": 0.21873971819877625, "learning_rate": 2.85395471537647e-05, "loss": 0.249, "mean_token_accuracy": 0.883579432964325, "step": 1273 }, { "epoch": 4.0062893081761, "grad_norm": 0.35204988718032837, "learning_rate": 2.8522460546402002e-05, "loss": 0.2546, "mean_token_accuracy": 0.8850920796394348, "step": 1274 }, { "epoch": 4.009433962264151, "grad_norm": 0.26803094148635864, "learning_rate": 2.8505367170695207e-05, "loss": 0.2628, "mean_token_accuracy": 0.8832299709320068, "step": 1275 }, { "epoch": 4.012578616352202, "grad_norm": 0.3539370894432068, "learning_rate": 2.8488267044382073e-05, "loss": 0.2587, "mean_token_accuracy": 0.886466920375824, "step": 1276 }, { "epoch": 4.015723270440252, "grad_norm": 0.2860783636569977, "learning_rate": 2.847116018520737e-05, "loss": 0.2652, "mean_token_accuracy": 0.8802006840705872, "step": 1277 }, { "epoch": 4.018867924528302, "grad_norm": 0.19191104173660278, "learning_rate": 2.8454046610922847e-05, "loss": 0.2587, "mean_token_accuracy": 0.884539008140564, "step": 1278 }, { "epoch": 4.022012578616352, "grad_norm": 0.2441309690475464, "learning_rate": 2.843692633928723e-05, "loss": 0.2266, "mean_token_accuracy": 0.8915911912918091, "step": 1279 }, { "epoch": 4.0251572327044025, "grad_norm": 0.17421981692314148, "learning_rate": 2.8419799388066182e-05, "loss": 0.2455, "mean_token_accuracy": 0.8853268623352051, "step": 1280 }, { "epoch": 4.028301886792453, "grad_norm": 0.20515094697475433, "learning_rate": 2.840266577503232e-05, "loss": 0.2521, "mean_token_accuracy": 0.886737585067749, "step": 1281 }, { "epoch": 4.031446540880503, "grad_norm": 0.1697227954864502, "learning_rate": 2.8385525517965143e-05, "loss": 0.2502, "mean_token_accuracy": 0.8833692669868469, "step": 1282 }, { "epoch": 4.034591194968553, "grad_norm": 0.19101938605308533, "learning_rate": 2.836837863465107e-05, "loss": 0.2615, "mean_token_accuracy": 0.8790598511695862, "step": 1283 }, { "epoch": 4.037735849056604, "grad_norm": 0.16805845499038696, "learning_rate": 2.8351225142883395e-05, "loss": 0.2596, "mean_token_accuracy": 0.8821934461593628, "step": 1284 }, { "epoch": 4.040880503144654, "grad_norm": 0.18171757459640503, "learning_rate": 2.8334065060462246e-05, "loss": 0.2338, "mean_token_accuracy": 0.8911201357841492, "step": 1285 }, { "epoch": 4.044025157232705, "grad_norm": 0.17769482731819153, "learning_rate": 2.8316898405194617e-05, "loss": 0.2514, "mean_token_accuracy": 0.8844525814056396, "step": 1286 }, { "epoch": 4.047169811320755, "grad_norm": 0.16998951137065887, "learning_rate": 2.829972519489431e-05, "loss": 0.24, "mean_token_accuracy": 0.8891401290893555, "step": 1287 }, { "epoch": 4.050314465408805, "grad_norm": 0.1702141910791397, "learning_rate": 2.828254544738192e-05, "loss": 0.2489, "mean_token_accuracy": 0.8864291310310364, "step": 1288 }, { "epoch": 4.053459119496855, "grad_norm": 0.16874520480632782, "learning_rate": 2.826535918048484e-05, "loss": 0.2406, "mean_token_accuracy": 0.8878934383392334, "step": 1289 }, { "epoch": 4.056603773584905, "grad_norm": 0.15455903112888336, "learning_rate": 2.8248166412037238e-05, "loss": 0.2479, "mean_token_accuracy": 0.8852613568305969, "step": 1290 }, { "epoch": 4.059748427672956, "grad_norm": 0.16695955395698547, "learning_rate": 2.8230967159879997e-05, "loss": 0.23, "mean_token_accuracy": 0.8919079303741455, "step": 1291 }, { "epoch": 4.062893081761007, "grad_norm": 0.16394896805286407, "learning_rate": 2.8213761441860765e-05, "loss": 0.2713, "mean_token_accuracy": 0.8791160583496094, "step": 1292 }, { "epoch": 4.066037735849057, "grad_norm": 0.15975283086299896, "learning_rate": 2.819654927583386e-05, "loss": 0.2384, "mean_token_accuracy": 0.8890205025672913, "step": 1293 }, { "epoch": 4.069182389937107, "grad_norm": 0.17008937895298004, "learning_rate": 2.817933067966033e-05, "loss": 0.2517, "mean_token_accuracy": 0.8825728893280029, "step": 1294 }, { "epoch": 4.072327044025157, "grad_norm": 0.1542217880487442, "learning_rate": 2.816210567120787e-05, "loss": 0.2735, "mean_token_accuracy": 0.879520833492279, "step": 1295 }, { "epoch": 4.0754716981132075, "grad_norm": 0.17298361659049988, "learning_rate": 2.8144874268350827e-05, "loss": 0.2556, "mean_token_accuracy": 0.8827891945838928, "step": 1296 }, { "epoch": 4.078616352201258, "grad_norm": 0.15569692850112915, "learning_rate": 2.8127636488970203e-05, "loss": 0.2269, "mean_token_accuracy": 0.8923097252845764, "step": 1297 }, { "epoch": 4.081761006289308, "grad_norm": 0.1688852161169052, "learning_rate": 2.8110392350953606e-05, "loss": 0.2302, "mean_token_accuracy": 0.8880395293235779, "step": 1298 }, { "epoch": 4.084905660377358, "grad_norm": 0.1484537571668625, "learning_rate": 2.8093141872195246e-05, "loss": 0.2613, "mean_token_accuracy": 0.8827516436576843, "step": 1299 }, { "epoch": 4.088050314465409, "grad_norm": 0.16927222907543182, "learning_rate": 2.8075885070595906e-05, "loss": 0.2536, "mean_token_accuracy": 0.8848833441734314, "step": 1300 }, { "epoch": 4.091194968553459, "grad_norm": 0.14585481584072113, "learning_rate": 2.805862196406293e-05, "loss": 0.2578, "mean_token_accuracy": 0.8819864988327026, "step": 1301 }, { "epoch": 4.09433962264151, "grad_norm": 0.1680736243724823, "learning_rate": 2.8041352570510216e-05, "loss": 0.2529, "mean_token_accuracy": 0.8855777382850647, "step": 1302 }, { "epoch": 4.09748427672956, "grad_norm": 0.14265228807926178, "learning_rate": 2.802407690785818e-05, "loss": 0.2368, "mean_token_accuracy": 0.8872923851013184, "step": 1303 }, { "epoch": 4.10062893081761, "grad_norm": 0.15464213490486145, "learning_rate": 2.8006794994033728e-05, "loss": 0.2616, "mean_token_accuracy": 0.8812229037284851, "step": 1304 }, { "epoch": 4.10377358490566, "grad_norm": 0.14933174848556519, "learning_rate": 2.7989506846970284e-05, "loss": 0.2466, "mean_token_accuracy": 0.8838765621185303, "step": 1305 }, { "epoch": 4.1069182389937104, "grad_norm": 0.1553312987089157, "learning_rate": 2.7972212484607708e-05, "loss": 0.2505, "mean_token_accuracy": 0.8844928741455078, "step": 1306 }, { "epoch": 4.110062893081761, "grad_norm": 0.15248145163059235, "learning_rate": 2.7954911924892344e-05, "loss": 0.2452, "mean_token_accuracy": 0.8878949880599976, "step": 1307 }, { "epoch": 4.113207547169812, "grad_norm": 0.14316493272781372, "learning_rate": 2.793760518577693e-05, "loss": 0.2567, "mean_token_accuracy": 0.8830364942550659, "step": 1308 }, { "epoch": 4.116352201257862, "grad_norm": 0.15453389286994934, "learning_rate": 2.792029228522064e-05, "loss": 0.2372, "mean_token_accuracy": 0.8906042575836182, "step": 1309 }, { "epoch": 4.119496855345912, "grad_norm": 0.14568613469600677, "learning_rate": 2.7902973241189037e-05, "loss": 0.254, "mean_token_accuracy": 0.8863977789878845, "step": 1310 }, { "epoch": 4.122641509433962, "grad_norm": 0.14949272572994232, "learning_rate": 2.788564807165406e-05, "loss": 0.2367, "mean_token_accuracy": 0.8867618441581726, "step": 1311 }, { "epoch": 4.1257861635220126, "grad_norm": 0.15481707453727722, "learning_rate": 2.7868316794594e-05, "loss": 0.2428, "mean_token_accuracy": 0.8883107900619507, "step": 1312 }, { "epoch": 4.128930817610063, "grad_norm": 0.145402193069458, "learning_rate": 2.7850979427993484e-05, "loss": 0.2408, "mean_token_accuracy": 0.887713611125946, "step": 1313 }, { "epoch": 4.132075471698113, "grad_norm": 0.14378874003887177, "learning_rate": 2.7833635989843474e-05, "loss": 0.2601, "mean_token_accuracy": 0.8857198357582092, "step": 1314 }, { "epoch": 4.135220125786163, "grad_norm": 0.15104271471500397, "learning_rate": 2.78162864981412e-05, "loss": 0.252, "mean_token_accuracy": 0.8861721754074097, "step": 1315 }, { "epoch": 4.138364779874214, "grad_norm": 0.13934184610843658, "learning_rate": 2.7798930970890216e-05, "loss": 0.2509, "mean_token_accuracy": 0.8866226077079773, "step": 1316 }, { "epoch": 4.1415094339622645, "grad_norm": 0.14720742404460907, "learning_rate": 2.7781569426100304e-05, "loss": 0.2464, "mean_token_accuracy": 0.8861123919487, "step": 1317 }, { "epoch": 4.144654088050315, "grad_norm": 0.1393972486257553, "learning_rate": 2.7764201881787512e-05, "loss": 0.2655, "mean_token_accuracy": 0.8817521929740906, "step": 1318 }, { "epoch": 4.147798742138365, "grad_norm": 0.1448722630739212, "learning_rate": 2.7746828355974104e-05, "loss": 0.2364, "mean_token_accuracy": 0.8872301578521729, "step": 1319 }, { "epoch": 4.150943396226415, "grad_norm": 0.13423529267311096, "learning_rate": 2.7729448866688544e-05, "loss": 0.2527, "mean_token_accuracy": 0.8843175768852234, "step": 1320 }, { "epoch": 4.154088050314465, "grad_norm": 0.13578058779239655, "learning_rate": 2.771206343196551e-05, "loss": 0.2603, "mean_token_accuracy": 0.8812825083732605, "step": 1321 }, { "epoch": 4.1572327044025155, "grad_norm": 0.1407482773065567, "learning_rate": 2.7694672069845813e-05, "loss": 0.2423, "mean_token_accuracy": 0.8843265771865845, "step": 1322 }, { "epoch": 4.160377358490566, "grad_norm": 0.14086057245731354, "learning_rate": 2.767727479837644e-05, "loss": 0.2347, "mean_token_accuracy": 0.8889259099960327, "step": 1323 }, { "epoch": 4.163522012578617, "grad_norm": 0.14522001147270203, "learning_rate": 2.7659871635610515e-05, "loss": 0.2498, "mean_token_accuracy": 0.8857369422912598, "step": 1324 }, { "epoch": 4.166666666666667, "grad_norm": 0.14944982528686523, "learning_rate": 2.7642462599607252e-05, "loss": 0.2333, "mean_token_accuracy": 0.8869674205780029, "step": 1325 }, { "epoch": 4.169811320754717, "grad_norm": 0.1487564593553543, "learning_rate": 2.7625047708431976e-05, "loss": 0.2352, "mean_token_accuracy": 0.8852526545524597, "step": 1326 }, { "epoch": 4.172955974842767, "grad_norm": 0.1437779664993286, "learning_rate": 2.7607626980156092e-05, "loss": 0.262, "mean_token_accuracy": 0.8827253580093384, "step": 1327 }, { "epoch": 4.176100628930818, "grad_norm": 0.1432497501373291, "learning_rate": 2.7590200432857047e-05, "loss": 0.2567, "mean_token_accuracy": 0.8826262354850769, "step": 1328 }, { "epoch": 4.179245283018868, "grad_norm": 0.14890795946121216, "learning_rate": 2.7572768084618334e-05, "loss": 0.2688, "mean_token_accuracy": 0.8793274760246277, "step": 1329 }, { "epoch": 4.182389937106918, "grad_norm": 0.14550916850566864, "learning_rate": 2.755532995352947e-05, "loss": 0.233, "mean_token_accuracy": 0.8903444409370422, "step": 1330 }, { "epoch": 4.185534591194968, "grad_norm": 0.18591845035552979, "learning_rate": 2.753788605768596e-05, "loss": 0.2596, "mean_token_accuracy": 0.8813364505767822, "step": 1331 }, { "epoch": 4.188679245283019, "grad_norm": 0.13862967491149902, "learning_rate": 2.7520436415189306e-05, "loss": 0.2622, "mean_token_accuracy": 0.8814187049865723, "step": 1332 }, { "epoch": 4.1918238993710695, "grad_norm": 0.16407757997512817, "learning_rate": 2.7502981044146963e-05, "loss": 0.2579, "mean_token_accuracy": 0.8837925791740417, "step": 1333 }, { "epoch": 4.19496855345912, "grad_norm": 0.14519527554512024, "learning_rate": 2.7485519962672337e-05, "loss": 0.2695, "mean_token_accuracy": 0.8818954825401306, "step": 1334 }, { "epoch": 4.19811320754717, "grad_norm": 0.14345420897006989, "learning_rate": 2.746805318888476e-05, "loss": 0.2494, "mean_token_accuracy": 0.8830834627151489, "step": 1335 }, { "epoch": 4.20125786163522, "grad_norm": 0.1446036845445633, "learning_rate": 2.7450580740909463e-05, "loss": 0.2498, "mean_token_accuracy": 0.8848795890808105, "step": 1336 }, { "epoch": 4.20440251572327, "grad_norm": 0.145370215177536, "learning_rate": 2.743310263687757e-05, "loss": 0.268, "mean_token_accuracy": 0.8806397914886475, "step": 1337 }, { "epoch": 4.2075471698113205, "grad_norm": 0.13711602985858917, "learning_rate": 2.7415618894926072e-05, "loss": 0.23, "mean_token_accuracy": 0.8897081613540649, "step": 1338 }, { "epoch": 4.210691823899371, "grad_norm": 0.15641474723815918, "learning_rate": 2.739812953319782e-05, "loss": 0.2406, "mean_token_accuracy": 0.8866468071937561, "step": 1339 }, { "epoch": 4.213836477987422, "grad_norm": 0.1482914835214615, "learning_rate": 2.738063456984148e-05, "loss": 0.2472, "mean_token_accuracy": 0.885341227054596, "step": 1340 }, { "epoch": 4.216981132075472, "grad_norm": 0.14941436052322388, "learning_rate": 2.736313402301155e-05, "loss": 0.2558, "mean_token_accuracy": 0.8828262090682983, "step": 1341 }, { "epoch": 4.220125786163522, "grad_norm": 0.14204715192317963, "learning_rate": 2.73456279108683e-05, "loss": 0.2591, "mean_token_accuracy": 0.8831762671470642, "step": 1342 }, { "epoch": 4.223270440251572, "grad_norm": 0.14757885038852692, "learning_rate": 2.73281162515778e-05, "loss": 0.2582, "mean_token_accuracy": 0.8833774328231812, "step": 1343 }, { "epoch": 4.226415094339623, "grad_norm": 0.14690524339675903, "learning_rate": 2.7310599063311857e-05, "loss": 0.258, "mean_token_accuracy": 0.8827973008155823, "step": 1344 }, { "epoch": 4.229559748427673, "grad_norm": 0.14617246389389038, "learning_rate": 2.7293076364248015e-05, "loss": 0.2412, "mean_token_accuracy": 0.886563241481781, "step": 1345 }, { "epoch": 4.232704402515723, "grad_norm": 0.13870076835155487, "learning_rate": 2.7275548172569557e-05, "loss": 0.2321, "mean_token_accuracy": 0.8864620923995972, "step": 1346 }, { "epoch": 4.235849056603773, "grad_norm": 0.15120095014572144, "learning_rate": 2.7258014506465442e-05, "loss": 0.2448, "mean_token_accuracy": 0.8861029744148254, "step": 1347 }, { "epoch": 4.238993710691824, "grad_norm": 0.15668298304080963, "learning_rate": 2.724047538413033e-05, "loss": 0.2577, "mean_token_accuracy": 0.8828524947166443, "step": 1348 }, { "epoch": 4.2421383647798745, "grad_norm": 0.13791413605213165, "learning_rate": 2.7222930823764517e-05, "loss": 0.2473, "mean_token_accuracy": 0.8867437243461609, "step": 1349 }, { "epoch": 4.245283018867925, "grad_norm": 0.15054844319820404, "learning_rate": 2.7205380843573967e-05, "loss": 0.2549, "mean_token_accuracy": 0.8830904364585876, "step": 1350 }, { "epoch": 4.248427672955975, "grad_norm": 0.14027497172355652, "learning_rate": 2.718782546177026e-05, "loss": 0.2551, "mean_token_accuracy": 0.8864468336105347, "step": 1351 }, { "epoch": 4.251572327044025, "grad_norm": 0.15007853507995605, "learning_rate": 2.7170264696570586e-05, "loss": 0.2437, "mean_token_accuracy": 0.8871759176254272, "step": 1352 }, { "epoch": 4.254716981132075, "grad_norm": 0.1608761101961136, "learning_rate": 2.7152698566197703e-05, "loss": 0.2557, "mean_token_accuracy": 0.8846498131752014, "step": 1353 }, { "epoch": 4.2578616352201255, "grad_norm": 0.13730347156524658, "learning_rate": 2.7135127088879947e-05, "loss": 0.2541, "mean_token_accuracy": 0.8821074366569519, "step": 1354 }, { "epoch": 4.261006289308176, "grad_norm": 0.14933264255523682, "learning_rate": 2.7117550282851222e-05, "loss": 0.2569, "mean_token_accuracy": 0.8827639818191528, "step": 1355 }, { "epoch": 4.264150943396227, "grad_norm": 0.15967227518558502, "learning_rate": 2.709996816635093e-05, "loss": 0.2457, "mean_token_accuracy": 0.8853583335876465, "step": 1356 }, { "epoch": 4.267295597484277, "grad_norm": 0.130757138133049, "learning_rate": 2.7082380757623997e-05, "loss": 0.2624, "mean_token_accuracy": 0.8816043734550476, "step": 1357 }, { "epoch": 4.270440251572327, "grad_norm": 0.17749525606632233, "learning_rate": 2.7064788074920853e-05, "loss": 0.2297, "mean_token_accuracy": 0.8903690576553345, "step": 1358 }, { "epoch": 4.273584905660377, "grad_norm": 0.14349985122680664, "learning_rate": 2.7047190136497374e-05, "loss": 0.2644, "mean_token_accuracy": 0.8817373514175415, "step": 1359 }, { "epoch": 4.276729559748428, "grad_norm": 0.15383297204971313, "learning_rate": 2.702958696061492e-05, "loss": 0.2536, "mean_token_accuracy": 0.8856379985809326, "step": 1360 }, { "epoch": 4.279874213836478, "grad_norm": 0.1427260786294937, "learning_rate": 2.7011978565540258e-05, "loss": 0.2431, "mean_token_accuracy": 0.8847337365150452, "step": 1361 }, { "epoch": 4.283018867924528, "grad_norm": 0.15544083714485168, "learning_rate": 2.6994364969545596e-05, "loss": 0.2447, "mean_token_accuracy": 0.8849520087242126, "step": 1362 }, { "epoch": 4.286163522012578, "grad_norm": 0.15651412308216095, "learning_rate": 2.697674619090852e-05, "loss": 0.2366, "mean_token_accuracy": 0.8874123692512512, "step": 1363 }, { "epoch": 4.289308176100629, "grad_norm": 0.15618132054805756, "learning_rate": 2.6959122247911996e-05, "loss": 0.255, "mean_token_accuracy": 0.8844594359397888, "step": 1364 }, { "epoch": 4.2924528301886795, "grad_norm": 0.14532870054244995, "learning_rate": 2.694149315884436e-05, "loss": 0.2429, "mean_token_accuracy": 0.8856728076934814, "step": 1365 }, { "epoch": 4.29559748427673, "grad_norm": 0.15094998478889465, "learning_rate": 2.692385894199929e-05, "loss": 0.257, "mean_token_accuracy": 0.8823837637901306, "step": 1366 }, { "epoch": 4.29874213836478, "grad_norm": 0.1429101526737213, "learning_rate": 2.6906219615675756e-05, "loss": 0.246, "mean_token_accuracy": 0.8854039311408997, "step": 1367 }, { "epoch": 4.30188679245283, "grad_norm": 0.1493634283542633, "learning_rate": 2.6888575198178073e-05, "loss": 0.2523, "mean_token_accuracy": 0.8816664814949036, "step": 1368 }, { "epoch": 4.30503144654088, "grad_norm": 0.1594749540090561, "learning_rate": 2.6870925707815807e-05, "loss": 0.2433, "mean_token_accuracy": 0.8858900666236877, "step": 1369 }, { "epoch": 4.3081761006289305, "grad_norm": 0.16246452927589417, "learning_rate": 2.6853271162903792e-05, "loss": 0.28, "mean_token_accuracy": 0.8772223591804504, "step": 1370 }, { "epoch": 4.311320754716981, "grad_norm": 0.15938788652420044, "learning_rate": 2.6835611581762135e-05, "loss": 0.2522, "mean_token_accuracy": 0.8834171295166016, "step": 1371 }, { "epoch": 4.314465408805032, "grad_norm": 0.1860852986574173, "learning_rate": 2.6817946982716125e-05, "loss": 0.2239, "mean_token_accuracy": 0.8932704925537109, "step": 1372 }, { "epoch": 4.317610062893082, "grad_norm": 0.1534428745508194, "learning_rate": 2.6800277384096287e-05, "loss": 0.2503, "mean_token_accuracy": 0.8848916292190552, "step": 1373 }, { "epoch": 4.320754716981132, "grad_norm": 0.19438529014587402, "learning_rate": 2.6782602804238328e-05, "loss": 0.2566, "mean_token_accuracy": 0.8817089200019836, "step": 1374 }, { "epoch": 4.323899371069182, "grad_norm": 0.1576819121837616, "learning_rate": 2.6764923261483135e-05, "loss": 0.252, "mean_token_accuracy": 0.8846593499183655, "step": 1375 }, { "epoch": 4.327044025157233, "grad_norm": 0.18220695853233337, "learning_rate": 2.6747238774176717e-05, "loss": 0.2499, "mean_token_accuracy": 0.8850845098495483, "step": 1376 }, { "epoch": 4.330188679245283, "grad_norm": 0.15748025476932526, "learning_rate": 2.6729549360670244e-05, "loss": 0.2523, "mean_token_accuracy": 0.8865378499031067, "step": 1377 }, { "epoch": 4.333333333333333, "grad_norm": 0.15679998695850372, "learning_rate": 2.6711855039319972e-05, "loss": 0.2525, "mean_token_accuracy": 0.8859933614730835, "step": 1378 }, { "epoch": 4.336477987421383, "grad_norm": 0.1435849666595459, "learning_rate": 2.6694155828487272e-05, "loss": 0.2553, "mean_token_accuracy": 0.882286787033081, "step": 1379 }, { "epoch": 4.339622641509434, "grad_norm": 0.14357006549835205, "learning_rate": 2.6676451746538577e-05, "loss": 0.2519, "mean_token_accuracy": 0.8833568692207336, "step": 1380 }, { "epoch": 4.3427672955974845, "grad_norm": 0.14986519515514374, "learning_rate": 2.6658742811845377e-05, "loss": 0.2494, "mean_token_accuracy": 0.8818812370300293, "step": 1381 }, { "epoch": 4.345911949685535, "grad_norm": 0.14738526940345764, "learning_rate": 2.6641029042784197e-05, "loss": 0.2504, "mean_token_accuracy": 0.8836377859115601, "step": 1382 }, { "epoch": 4.349056603773585, "grad_norm": 0.15075050294399261, "learning_rate": 2.6623310457736575e-05, "loss": 0.2638, "mean_token_accuracy": 0.8816152811050415, "step": 1383 }, { "epoch": 4.352201257861635, "grad_norm": 0.14811988174915314, "learning_rate": 2.660558707508906e-05, "loss": 0.2639, "mean_token_accuracy": 0.8767325282096863, "step": 1384 }, { "epoch": 4.355345911949685, "grad_norm": 0.13753147423267365, "learning_rate": 2.6587858913233168e-05, "loss": 0.2412, "mean_token_accuracy": 0.8852753639221191, "step": 1385 }, { "epoch": 4.3584905660377355, "grad_norm": 0.1575927436351776, "learning_rate": 2.657012599056536e-05, "loss": 0.252, "mean_token_accuracy": 0.8817105293273926, "step": 1386 }, { "epoch": 4.361635220125786, "grad_norm": 0.14534930884838104, "learning_rate": 2.6552388325487078e-05, "loss": 0.2562, "mean_token_accuracy": 0.8825299143791199, "step": 1387 }, { "epoch": 4.364779874213837, "grad_norm": 0.1528378576040268, "learning_rate": 2.6534645936404655e-05, "loss": 0.2606, "mean_token_accuracy": 0.8781949877738953, "step": 1388 }, { "epoch": 4.367924528301887, "grad_norm": 0.14020651578903198, "learning_rate": 2.6516898841729323e-05, "loss": 0.257, "mean_token_accuracy": 0.8830549716949463, "step": 1389 }, { "epoch": 4.371069182389937, "grad_norm": 0.13903018832206726, "learning_rate": 2.6499147059877213e-05, "loss": 0.2522, "mean_token_accuracy": 0.8827590346336365, "step": 1390 }, { "epoch": 4.3742138364779874, "grad_norm": 0.14772510528564453, "learning_rate": 2.6481390609269308e-05, "loss": 0.2625, "mean_token_accuracy": 0.8802635073661804, "step": 1391 }, { "epoch": 4.377358490566038, "grad_norm": 0.1616673320531845, "learning_rate": 2.646362950833145e-05, "loss": 0.2425, "mean_token_accuracy": 0.8878819942474365, "step": 1392 }, { "epoch": 4.380503144654088, "grad_norm": 0.13991199433803558, "learning_rate": 2.6445863775494282e-05, "loss": 0.2522, "mean_token_accuracy": 0.8848397135734558, "step": 1393 }, { "epoch": 4.383647798742138, "grad_norm": 0.1424325704574585, "learning_rate": 2.6428093429193288e-05, "loss": 0.2689, "mean_token_accuracy": 0.8799664974212646, "step": 1394 }, { "epoch": 4.386792452830189, "grad_norm": 0.1396665871143341, "learning_rate": 2.6410318487868707e-05, "loss": 0.2548, "mean_token_accuracy": 0.8830381631851196, "step": 1395 }, { "epoch": 4.389937106918239, "grad_norm": 0.14341147243976593, "learning_rate": 2.6392538969965565e-05, "loss": 0.243, "mean_token_accuracy": 0.8879396319389343, "step": 1396 }, { "epoch": 4.3930817610062896, "grad_norm": 0.1465020775794983, "learning_rate": 2.637475489393363e-05, "loss": 0.2391, "mean_token_accuracy": 0.8867096900939941, "step": 1397 }, { "epoch": 4.39622641509434, "grad_norm": 0.1537989377975464, "learning_rate": 2.63569662782274e-05, "loss": 0.2578, "mean_token_accuracy": 0.8815436959266663, "step": 1398 }, { "epoch": 4.39937106918239, "grad_norm": 0.1487364023923874, "learning_rate": 2.6339173141306095e-05, "loss": 0.2679, "mean_token_accuracy": 0.8777388334274292, "step": 1399 }, { "epoch": 4.40251572327044, "grad_norm": 0.15459056198596954, "learning_rate": 2.6321375501633603e-05, "loss": 0.2686, "mean_token_accuracy": 0.8784300088882446, "step": 1400 }, { "epoch": 4.40566037735849, "grad_norm": 0.139274001121521, "learning_rate": 2.6303573377678513e-05, "loss": 0.2573, "mean_token_accuracy": 0.8845201134681702, "step": 1401 }, { "epoch": 4.408805031446541, "grad_norm": 0.15442243218421936, "learning_rate": 2.6285766787914034e-05, "loss": 0.2497, "mean_token_accuracy": 0.8847933411598206, "step": 1402 }, { "epoch": 4.411949685534591, "grad_norm": 0.1377149522304535, "learning_rate": 2.626795575081804e-05, "loss": 0.2274, "mean_token_accuracy": 0.8895265460014343, "step": 1403 }, { "epoch": 4.415094339622642, "grad_norm": 0.14790353178977966, "learning_rate": 2.6250140284873017e-05, "loss": 0.2685, "mean_token_accuracy": 0.8800943493843079, "step": 1404 }, { "epoch": 4.418238993710692, "grad_norm": 0.13157331943511963, "learning_rate": 2.6232320408566025e-05, "loss": 0.2427, "mean_token_accuracy": 0.8863032460212708, "step": 1405 }, { "epoch": 4.421383647798742, "grad_norm": 0.14972056448459625, "learning_rate": 2.6214496140388718e-05, "loss": 0.2452, "mean_token_accuracy": 0.8844586610794067, "step": 1406 }, { "epoch": 4.4245283018867925, "grad_norm": 0.14638881385326385, "learning_rate": 2.6196667498837302e-05, "loss": 0.2597, "mean_token_accuracy": 0.880418598651886, "step": 1407 }, { "epoch": 4.427672955974843, "grad_norm": 0.1468118280172348, "learning_rate": 2.617883450241252e-05, "loss": 0.2518, "mean_token_accuracy": 0.8848738670349121, "step": 1408 }, { "epoch": 4.430817610062893, "grad_norm": 0.14775048196315765, "learning_rate": 2.616099716961964e-05, "loss": 0.2653, "mean_token_accuracy": 0.8803516030311584, "step": 1409 }, { "epoch": 4.433962264150943, "grad_norm": 0.14332270622253418, "learning_rate": 2.6143155518968428e-05, "loss": 0.2487, "mean_token_accuracy": 0.8877217173576355, "step": 1410 }, { "epoch": 4.437106918238994, "grad_norm": 0.14214986562728882, "learning_rate": 2.612530956897312e-05, "loss": 0.2705, "mean_token_accuracy": 0.8784423470497131, "step": 1411 }, { "epoch": 4.440251572327044, "grad_norm": 0.1403186023235321, "learning_rate": 2.6107459338152427e-05, "loss": 0.2665, "mean_token_accuracy": 0.8789308071136475, "step": 1412 }, { "epoch": 4.443396226415095, "grad_norm": 0.138310045003891, "learning_rate": 2.60896048450295e-05, "loss": 0.2499, "mean_token_accuracy": 0.8843627572059631, "step": 1413 }, { "epoch": 4.446540880503145, "grad_norm": 0.1412084996700287, "learning_rate": 2.607174610813191e-05, "loss": 0.2594, "mean_token_accuracy": 0.8839447498321533, "step": 1414 }, { "epoch": 4.449685534591195, "grad_norm": 0.14437635242938995, "learning_rate": 2.605388314599163e-05, "loss": 0.241, "mean_token_accuracy": 0.8860413432121277, "step": 1415 }, { "epoch": 4.452830188679245, "grad_norm": 0.14431603252887726, "learning_rate": 2.6036015977145018e-05, "loss": 0.256, "mean_token_accuracy": 0.8797090649604797, "step": 1416 }, { "epoch": 4.455974842767295, "grad_norm": 0.1461351066827774, "learning_rate": 2.6018144620132805e-05, "loss": 0.2438, "mean_token_accuracy": 0.8827074766159058, "step": 1417 }, { "epoch": 4.459119496855346, "grad_norm": 0.14454837143421173, "learning_rate": 2.6000269093500055e-05, "loss": 0.2541, "mean_token_accuracy": 0.8819749355316162, "step": 1418 }, { "epoch": 4.462264150943396, "grad_norm": 0.15199217200279236, "learning_rate": 2.598238941579617e-05, "loss": 0.2622, "mean_token_accuracy": 0.8807674050331116, "step": 1419 }, { "epoch": 4.465408805031447, "grad_norm": 0.1487475484609604, "learning_rate": 2.596450560557485e-05, "loss": 0.2574, "mean_token_accuracy": 0.8806383013725281, "step": 1420 }, { "epoch": 4.468553459119497, "grad_norm": 0.14922156929969788, "learning_rate": 2.59466176813941e-05, "loss": 0.2574, "mean_token_accuracy": 0.8796212077140808, "step": 1421 }, { "epoch": 4.471698113207547, "grad_norm": 0.13734014332294464, "learning_rate": 2.5928725661816162e-05, "loss": 0.2412, "mean_token_accuracy": 0.8878635168075562, "step": 1422 }, { "epoch": 4.4748427672955975, "grad_norm": 0.1519346535205841, "learning_rate": 2.5910829565407573e-05, "loss": 0.2607, "mean_token_accuracy": 0.8802406191825867, "step": 1423 }, { "epoch": 4.477987421383648, "grad_norm": 0.13957345485687256, "learning_rate": 2.5892929410739066e-05, "loss": 0.2437, "mean_token_accuracy": 0.8885535001754761, "step": 1424 }, { "epoch": 4.481132075471698, "grad_norm": 0.14830255508422852, "learning_rate": 2.587502521638559e-05, "loss": 0.2575, "mean_token_accuracy": 0.8828555345535278, "step": 1425 }, { "epoch": 4.484276729559748, "grad_norm": 0.13677102327346802, "learning_rate": 2.5857117000926298e-05, "loss": 0.2501, "mean_token_accuracy": 0.8842593431472778, "step": 1426 }, { "epoch": 4.487421383647799, "grad_norm": 0.1367623507976532, "learning_rate": 2.5839204782944506e-05, "loss": 0.2689, "mean_token_accuracy": 0.8797082304954529, "step": 1427 }, { "epoch": 4.490566037735849, "grad_norm": 0.1288725733757019, "learning_rate": 2.582128858102769e-05, "loss": 0.2567, "mean_token_accuracy": 0.8836309909820557, "step": 1428 }, { "epoch": 4.4937106918239, "grad_norm": 0.13104671239852905, "learning_rate": 2.5803368413767443e-05, "loss": 0.2503, "mean_token_accuracy": 0.8835127949714661, "step": 1429 }, { "epoch": 4.49685534591195, "grad_norm": 0.1426146924495697, "learning_rate": 2.5785444299759504e-05, "loss": 0.2556, "mean_token_accuracy": 0.8816784620285034, "step": 1430 }, { "epoch": 4.5, "grad_norm": 0.13342611491680145, "learning_rate": 2.576751625760368e-05, "loss": 0.2584, "mean_token_accuracy": 0.879668116569519, "step": 1431 }, { "epoch": 4.50314465408805, "grad_norm": 0.15487535297870636, "learning_rate": 2.5749584305903866e-05, "loss": 0.2569, "mean_token_accuracy": 0.8806970119476318, "step": 1432 }, { "epoch": 4.5062893081761, "grad_norm": 0.14090071618556976, "learning_rate": 2.5731648463268015e-05, "loss": 0.2415, "mean_token_accuracy": 0.8850799798965454, "step": 1433 }, { "epoch": 4.509433962264151, "grad_norm": 0.14992307126522064, "learning_rate": 2.571370874830811e-05, "loss": 0.2522, "mean_token_accuracy": 0.882178008556366, "step": 1434 }, { "epoch": 4.512578616352201, "grad_norm": 0.13169358670711517, "learning_rate": 2.569576517964016e-05, "loss": 0.2544, "mean_token_accuracy": 0.8801778554916382, "step": 1435 }, { "epoch": 4.515723270440252, "grad_norm": 0.16984504461288452, "learning_rate": 2.567781777588416e-05, "loss": 0.2474, "mean_token_accuracy": 0.8849585056304932, "step": 1436 }, { "epoch": 4.518867924528302, "grad_norm": 0.1379225105047226, "learning_rate": 2.565986655566411e-05, "loss": 0.2451, "mean_token_accuracy": 0.88438481092453, "step": 1437 }, { "epoch": 4.522012578616352, "grad_norm": 0.18996167182922363, "learning_rate": 2.5641911537607952e-05, "loss": 0.2607, "mean_token_accuracy": 0.8825079798698425, "step": 1438 }, { "epoch": 4.5251572327044025, "grad_norm": 0.13947978615760803, "learning_rate": 2.562395274034756e-05, "loss": 0.2568, "mean_token_accuracy": 0.8816097378730774, "step": 1439 }, { "epoch": 4.528301886792453, "grad_norm": 0.20490330457687378, "learning_rate": 2.5605990182518752e-05, "loss": 0.2699, "mean_token_accuracy": 0.8782700896263123, "step": 1440 }, { "epoch": 4.531446540880503, "grad_norm": 0.1351514607667923, "learning_rate": 2.5588023882761233e-05, "loss": 0.2508, "mean_token_accuracy": 0.8827007412910461, "step": 1441 }, { "epoch": 4.534591194968553, "grad_norm": 0.18768662214279175, "learning_rate": 2.5570053859718606e-05, "loss": 0.2583, "mean_token_accuracy": 0.8811473846435547, "step": 1442 }, { "epoch": 4.537735849056604, "grad_norm": 0.14161916077136993, "learning_rate": 2.5552080132038312e-05, "loss": 0.239, "mean_token_accuracy": 0.8842949867248535, "step": 1443 }, { "epoch": 4.540880503144654, "grad_norm": 0.1600307822227478, "learning_rate": 2.5534102718371664e-05, "loss": 0.2711, "mean_token_accuracy": 0.8794844746589661, "step": 1444 }, { "epoch": 4.544025157232705, "grad_norm": 0.1379203200340271, "learning_rate": 2.5516121637373782e-05, "loss": 0.2464, "mean_token_accuracy": 0.8868380188941956, "step": 1445 }, { "epoch": 4.547169811320755, "grad_norm": 0.15503491461277008, "learning_rate": 2.5498136907703594e-05, "loss": 0.2582, "mean_token_accuracy": 0.8819460868835449, "step": 1446 }, { "epoch": 4.550314465408805, "grad_norm": 0.13813835382461548, "learning_rate": 2.5480148548023823e-05, "loss": 0.2668, "mean_token_accuracy": 0.878455638885498, "step": 1447 }, { "epoch": 4.553459119496855, "grad_norm": 0.1430351883172989, "learning_rate": 2.5462156577000952e-05, "loss": 0.2432, "mean_token_accuracy": 0.8864967226982117, "step": 1448 }, { "epoch": 4.556603773584905, "grad_norm": 0.14724938571453094, "learning_rate": 2.5444161013305217e-05, "loss": 0.2618, "mean_token_accuracy": 0.8809149861335754, "step": 1449 }, { "epoch": 4.559748427672956, "grad_norm": 0.132796511054039, "learning_rate": 2.5426161875610572e-05, "loss": 0.2667, "mean_token_accuracy": 0.8805984854698181, "step": 1450 }, { "epoch": 4.562893081761006, "grad_norm": 0.1508307307958603, "learning_rate": 2.540815918259469e-05, "loss": 0.2494, "mean_token_accuracy": 0.8844383955001831, "step": 1451 }, { "epoch": 4.566037735849057, "grad_norm": 0.13710395991802216, "learning_rate": 2.539015295293893e-05, "loss": 0.2698, "mean_token_accuracy": 0.8794138431549072, "step": 1452 }, { "epoch": 4.569182389937107, "grad_norm": 0.13251298666000366, "learning_rate": 2.5372143205328306e-05, "loss": 0.2413, "mean_token_accuracy": 0.8852861523628235, "step": 1453 }, { "epoch": 4.572327044025157, "grad_norm": 0.1437736451625824, "learning_rate": 2.5354129958451513e-05, "loss": 0.2661, "mean_token_accuracy": 0.8799407482147217, "step": 1454 }, { "epoch": 4.5754716981132075, "grad_norm": 0.1383388191461563, "learning_rate": 2.5336113231000862e-05, "loss": 0.2517, "mean_token_accuracy": 0.8848426938056946, "step": 1455 }, { "epoch": 4.578616352201258, "grad_norm": 0.14111852645874023, "learning_rate": 2.5318093041672257e-05, "loss": 0.2618, "mean_token_accuracy": 0.8823397159576416, "step": 1456 }, { "epoch": 4.581761006289308, "grad_norm": 0.14448735117912292, "learning_rate": 2.5300069409165232e-05, "loss": 0.2371, "mean_token_accuracy": 0.8877670764923096, "step": 1457 }, { "epoch": 4.584905660377358, "grad_norm": 0.13009530305862427, "learning_rate": 2.5282042352182867e-05, "loss": 0.2415, "mean_token_accuracy": 0.8879152536392212, "step": 1458 }, { "epoch": 4.588050314465409, "grad_norm": 0.13649174571037292, "learning_rate": 2.5264011889431795e-05, "loss": 0.2632, "mean_token_accuracy": 0.8810548186302185, "step": 1459 }, { "epoch": 4.591194968553459, "grad_norm": 0.12930653989315033, "learning_rate": 2.5245978039622197e-05, "loss": 0.2498, "mean_token_accuracy": 0.8841792345046997, "step": 1460 }, { "epoch": 4.59433962264151, "grad_norm": 0.14395128190517426, "learning_rate": 2.522794082146776e-05, "loss": 0.2493, "mean_token_accuracy": 0.8833113312721252, "step": 1461 }, { "epoch": 4.59748427672956, "grad_norm": 0.13741520047187805, "learning_rate": 2.5209900253685674e-05, "loss": 0.2596, "mean_token_accuracy": 0.8786746859550476, "step": 1462 }, { "epoch": 4.60062893081761, "grad_norm": 0.14270655810832977, "learning_rate": 2.5191856354996595e-05, "loss": 0.2429, "mean_token_accuracy": 0.8827742338180542, "step": 1463 }, { "epoch": 4.60377358490566, "grad_norm": 0.13880446553230286, "learning_rate": 2.5173809144124635e-05, "loss": 0.2521, "mean_token_accuracy": 0.8830817341804504, "step": 1464 }, { "epoch": 4.6069182389937104, "grad_norm": 0.14457201957702637, "learning_rate": 2.5155758639797357e-05, "loss": 0.2489, "mean_token_accuracy": 0.8850651383399963, "step": 1465 }, { "epoch": 4.610062893081761, "grad_norm": 0.13805538415908813, "learning_rate": 2.513770486074574e-05, "loss": 0.2488, "mean_token_accuracy": 0.883724570274353, "step": 1466 }, { "epoch": 4.613207547169811, "grad_norm": 0.1541612446308136, "learning_rate": 2.5119647825704134e-05, "loss": 0.2526, "mean_token_accuracy": 0.8837642073631287, "step": 1467 }, { "epoch": 4.616352201257862, "grad_norm": 0.1382627934217453, "learning_rate": 2.51015875534103e-05, "loss": 0.2639, "mean_token_accuracy": 0.8827504515647888, "step": 1468 }, { "epoch": 4.619496855345912, "grad_norm": 0.15054289996623993, "learning_rate": 2.5083524062605344e-05, "loss": 0.2509, "mean_token_accuracy": 0.8836148977279663, "step": 1469 }, { "epoch": 4.622641509433962, "grad_norm": 0.14112670719623566, "learning_rate": 2.506545737203371e-05, "loss": 0.2523, "mean_token_accuracy": 0.8837305903434753, "step": 1470 }, { "epoch": 4.6257861635220126, "grad_norm": 0.14425958693027496, "learning_rate": 2.5047387500443177e-05, "loss": 0.247, "mean_token_accuracy": 0.8845484852790833, "step": 1471 }, { "epoch": 4.628930817610063, "grad_norm": 0.14548322558403015, "learning_rate": 2.5029314466584798e-05, "loss": 0.2467, "mean_token_accuracy": 0.883867084980011, "step": 1472 }, { "epoch": 4.632075471698113, "grad_norm": 0.14163750410079956, "learning_rate": 2.5011238289212948e-05, "loss": 0.2776, "mean_token_accuracy": 0.8781344294548035, "step": 1473 }, { "epoch": 4.635220125786163, "grad_norm": 0.14586465060710907, "learning_rate": 2.499315898708521e-05, "loss": 0.2493, "mean_token_accuracy": 0.8815778493881226, "step": 1474 }, { "epoch": 4.638364779874214, "grad_norm": 0.14209404587745667, "learning_rate": 2.4975076578962454e-05, "loss": 0.2525, "mean_token_accuracy": 0.8831738233566284, "step": 1475 }, { "epoch": 4.6415094339622645, "grad_norm": 0.14334115386009216, "learning_rate": 2.4956991083608766e-05, "loss": 0.2669, "mean_token_accuracy": 0.882244348526001, "step": 1476 }, { "epoch": 4.644654088050315, "grad_norm": 0.1406649500131607, "learning_rate": 2.493890251979141e-05, "loss": 0.2515, "mean_token_accuracy": 0.8821365833282471, "step": 1477 }, { "epoch": 4.647798742138365, "grad_norm": 0.1404079645872116, "learning_rate": 2.4920810906280873e-05, "loss": 0.2533, "mean_token_accuracy": 0.8844720125198364, "step": 1478 }, { "epoch": 4.650943396226415, "grad_norm": 0.13785113394260406, "learning_rate": 2.4902716261850764e-05, "loss": 0.2638, "mean_token_accuracy": 0.8819215893745422, "step": 1479 }, { "epoch": 4.654088050314465, "grad_norm": 0.15292143821716309, "learning_rate": 2.4884618605277874e-05, "loss": 0.2659, "mean_token_accuracy": 0.8811072111129761, "step": 1480 }, { "epoch": 4.6572327044025155, "grad_norm": 0.13847500085830688, "learning_rate": 2.4866517955342094e-05, "loss": 0.2564, "mean_token_accuracy": 0.8817123770713806, "step": 1481 }, { "epoch": 4.660377358490566, "grad_norm": 0.1566178798675537, "learning_rate": 2.4848414330826443e-05, "loss": 0.2505, "mean_token_accuracy": 0.8840153813362122, "step": 1482 }, { "epoch": 4.663522012578616, "grad_norm": 0.1307821422815323, "learning_rate": 2.4830307750517017e-05, "loss": 0.268, "mean_token_accuracy": 0.8796277046203613, "step": 1483 }, { "epoch": 4.666666666666667, "grad_norm": 0.152281254529953, "learning_rate": 2.481219823320296e-05, "loss": 0.254, "mean_token_accuracy": 0.8816496133804321, "step": 1484 }, { "epoch": 4.669811320754717, "grad_norm": 0.14067430794239044, "learning_rate": 2.479408579767649e-05, "loss": 0.2512, "mean_token_accuracy": 0.8842344284057617, "step": 1485 }, { "epoch": 4.672955974842767, "grad_norm": 0.13918046653270721, "learning_rate": 2.4775970462732858e-05, "loss": 0.2603, "mean_token_accuracy": 0.8810861110687256, "step": 1486 }, { "epoch": 4.676100628930818, "grad_norm": 0.1361314058303833, "learning_rate": 2.4757852247170293e-05, "loss": 0.256, "mean_token_accuracy": 0.8834766149520874, "step": 1487 }, { "epoch": 4.679245283018868, "grad_norm": 0.1420655995607376, "learning_rate": 2.4739731169790036e-05, "loss": 0.2482, "mean_token_accuracy": 0.8854100108146667, "step": 1488 }, { "epoch": 4.682389937106918, "grad_norm": 0.14817163348197937, "learning_rate": 2.4721607249396295e-05, "loss": 0.2581, "mean_token_accuracy": 0.8823645710945129, "step": 1489 }, { "epoch": 4.685534591194968, "grad_norm": 0.14173953235149384, "learning_rate": 2.4703480504796227e-05, "loss": 0.2423, "mean_token_accuracy": 0.8879494071006775, "step": 1490 }, { "epoch": 4.688679245283019, "grad_norm": 0.142574280500412, "learning_rate": 2.4685350954799908e-05, "loss": 0.2595, "mean_token_accuracy": 0.8818832635879517, "step": 1491 }, { "epoch": 4.6918238993710695, "grad_norm": 0.14576321840286255, "learning_rate": 2.466721861822034e-05, "loss": 0.256, "mean_token_accuracy": 0.8829867243766785, "step": 1492 }, { "epoch": 4.69496855345912, "grad_norm": 0.13140003383159637, "learning_rate": 2.4649083513873423e-05, "loss": 0.251, "mean_token_accuracy": 0.8816654086112976, "step": 1493 }, { "epoch": 4.69811320754717, "grad_norm": 0.1674601286649704, "learning_rate": 2.4630945660577907e-05, "loss": 0.2488, "mean_token_accuracy": 0.885489284992218, "step": 1494 }, { "epoch": 4.70125786163522, "grad_norm": 0.14407427608966827, "learning_rate": 2.4612805077155404e-05, "loss": 0.2642, "mean_token_accuracy": 0.8817337155342102, "step": 1495 }, { "epoch": 4.70440251572327, "grad_norm": 0.14941011369228363, "learning_rate": 2.4594661782430368e-05, "loss": 0.2667, "mean_token_accuracy": 0.8806317448616028, "step": 1496 }, { "epoch": 4.7075471698113205, "grad_norm": 0.1350492388010025, "learning_rate": 2.4576515795230057e-05, "loss": 0.265, "mean_token_accuracy": 0.876676619052887, "step": 1497 }, { "epoch": 4.710691823899371, "grad_norm": 0.14951270818710327, "learning_rate": 2.4558367134384516e-05, "loss": 0.2466, "mean_token_accuracy": 0.8872973918914795, "step": 1498 }, { "epoch": 4.713836477987421, "grad_norm": 0.14224649965763092, "learning_rate": 2.4540215818726587e-05, "loss": 0.2582, "mean_token_accuracy": 0.8833028078079224, "step": 1499 }, { "epoch": 4.716981132075472, "grad_norm": 0.15279057621955872, "learning_rate": 2.452206186709185e-05, "loss": 0.2644, "mean_token_accuracy": 0.8810228705406189, "step": 1500 }, { "epoch": 4.720125786163522, "grad_norm": 0.1417638510465622, "learning_rate": 2.4503905298318612e-05, "loss": 0.2682, "mean_token_accuracy": 0.8790037631988525, "step": 1501 }, { "epoch": 4.723270440251572, "grad_norm": 0.13526369631290436, "learning_rate": 2.448574613124793e-05, "loss": 0.2681, "mean_token_accuracy": 0.8802682161331177, "step": 1502 }, { "epoch": 4.726415094339623, "grad_norm": 0.1354237049818039, "learning_rate": 2.4467584384723512e-05, "loss": 0.2644, "mean_token_accuracy": 0.8789896965026855, "step": 1503 }, { "epoch": 4.729559748427673, "grad_norm": 0.13121561706066132, "learning_rate": 2.444942007759178e-05, "loss": 0.2537, "mean_token_accuracy": 0.8831503987312317, "step": 1504 }, { "epoch": 4.732704402515723, "grad_norm": 0.13547173142433167, "learning_rate": 2.4431253228701795e-05, "loss": 0.2565, "mean_token_accuracy": 0.87940514087677, "step": 1505 }, { "epoch": 4.735849056603773, "grad_norm": 0.13463617861270905, "learning_rate": 2.4413083856905257e-05, "loss": 0.2424, "mean_token_accuracy": 0.886320173740387, "step": 1506 }, { "epoch": 4.738993710691824, "grad_norm": 0.12674209475517273, "learning_rate": 2.4394911981056493e-05, "loss": 0.2573, "mean_token_accuracy": 0.880353569984436, "step": 1507 }, { "epoch": 4.7421383647798745, "grad_norm": 0.14087824523448944, "learning_rate": 2.437673762001241e-05, "loss": 0.2591, "mean_token_accuracy": 0.8803715705871582, "step": 1508 }, { "epoch": 4.745283018867925, "grad_norm": 0.13621707260608673, "learning_rate": 2.4358560792632515e-05, "loss": 0.2791, "mean_token_accuracy": 0.8760728240013123, "step": 1509 }, { "epoch": 4.748427672955975, "grad_norm": 0.12566280364990234, "learning_rate": 2.4340381517778867e-05, "loss": 0.2527, "mean_token_accuracy": 0.882553219795227, "step": 1510 }, { "epoch": 4.751572327044025, "grad_norm": 0.1349441558122635, "learning_rate": 2.432219981431605e-05, "loss": 0.2484, "mean_token_accuracy": 0.8867226839065552, "step": 1511 }, { "epoch": 4.754716981132075, "grad_norm": 0.12752796709537506, "learning_rate": 2.4304015701111197e-05, "loss": 0.2615, "mean_token_accuracy": 0.8807107210159302, "step": 1512 }, { "epoch": 4.7578616352201255, "grad_norm": 0.12869000434875488, "learning_rate": 2.428582919703391e-05, "loss": 0.2587, "mean_token_accuracy": 0.8817474842071533, "step": 1513 }, { "epoch": 4.761006289308176, "grad_norm": 0.12598003447055817, "learning_rate": 2.4267640320956302e-05, "loss": 0.2672, "mean_token_accuracy": 0.8799456357955933, "step": 1514 }, { "epoch": 4.764150943396227, "grad_norm": 0.125724658370018, "learning_rate": 2.4249449091752922e-05, "loss": 0.2374, "mean_token_accuracy": 0.8862662315368652, "step": 1515 }, { "epoch": 4.767295597484277, "grad_norm": 0.121979720890522, "learning_rate": 2.4231255528300778e-05, "loss": 0.2584, "mean_token_accuracy": 0.8822363615036011, "step": 1516 }, { "epoch": 4.770440251572327, "grad_norm": 0.12360676378011703, "learning_rate": 2.421305964947929e-05, "loss": 0.2559, "mean_token_accuracy": 0.8823013305664062, "step": 1517 }, { "epoch": 4.773584905660377, "grad_norm": 0.13420429825782776, "learning_rate": 2.419486147417028e-05, "loss": 0.2326, "mean_token_accuracy": 0.8917833566665649, "step": 1518 }, { "epoch": 4.776729559748428, "grad_norm": 0.12840516865253448, "learning_rate": 2.417666102125797e-05, "loss": 0.2455, "mean_token_accuracy": 0.8831151127815247, "step": 1519 }, { "epoch": 4.779874213836478, "grad_norm": 0.14521858096122742, "learning_rate": 2.415845830962892e-05, "loss": 0.2555, "mean_token_accuracy": 0.8834573030471802, "step": 1520 }, { "epoch": 4.783018867924528, "grad_norm": 0.13063116371631622, "learning_rate": 2.4140253358172064e-05, "loss": 0.2442, "mean_token_accuracy": 0.8842610120773315, "step": 1521 }, { "epoch": 4.786163522012579, "grad_norm": 0.13243721425533295, "learning_rate": 2.4122046185778628e-05, "loss": 0.2705, "mean_token_accuracy": 0.8768439888954163, "step": 1522 }, { "epoch": 4.789308176100629, "grad_norm": 0.14029352366924286, "learning_rate": 2.4103836811342167e-05, "loss": 0.2562, "mean_token_accuracy": 0.8825986385345459, "step": 1523 }, { "epoch": 4.7924528301886795, "grad_norm": 0.1397697478532791, "learning_rate": 2.4085625253758505e-05, "loss": 0.2495, "mean_token_accuracy": 0.8802828788757324, "step": 1524 }, { "epoch": 4.79559748427673, "grad_norm": 0.13132429122924805, "learning_rate": 2.4067411531925736e-05, "loss": 0.2662, "mean_token_accuracy": 0.8824609518051147, "step": 1525 }, { "epoch": 4.79874213836478, "grad_norm": 0.12849049270153046, "learning_rate": 2.404919566474422e-05, "loss": 0.24, "mean_token_accuracy": 0.887162446975708, "step": 1526 }, { "epoch": 4.80188679245283, "grad_norm": 0.1436852514743805, "learning_rate": 2.40309776711165e-05, "loss": 0.255, "mean_token_accuracy": 0.8830965757369995, "step": 1527 }, { "epoch": 4.80503144654088, "grad_norm": 0.12779057025909424, "learning_rate": 2.401275756994737e-05, "loss": 0.2673, "mean_token_accuracy": 0.8801604509353638, "step": 1528 }, { "epoch": 4.8081761006289305, "grad_norm": 0.13711079955101013, "learning_rate": 2.399453538014379e-05, "loss": 0.2391, "mean_token_accuracy": 0.8846153020858765, "step": 1529 }, { "epoch": 4.811320754716981, "grad_norm": 0.12312142550945282, "learning_rate": 2.397631112061488e-05, "loss": 0.2518, "mean_token_accuracy": 0.8821859955787659, "step": 1530 }, { "epoch": 4.814465408805032, "grad_norm": 0.13619542121887207, "learning_rate": 2.3958084810271927e-05, "loss": 0.2531, "mean_token_accuracy": 0.8836720585823059, "step": 1531 }, { "epoch": 4.817610062893082, "grad_norm": 0.12221843004226685, "learning_rate": 2.393985646802833e-05, "loss": 0.2481, "mean_token_accuracy": 0.8841218948364258, "step": 1532 }, { "epoch": 4.820754716981132, "grad_norm": 0.12619447708129883, "learning_rate": 2.392162611279961e-05, "loss": 0.2502, "mean_token_accuracy": 0.8820463418960571, "step": 1533 }, { "epoch": 4.823899371069182, "grad_norm": 0.12062551081180573, "learning_rate": 2.3903393763503355e-05, "loss": 0.2734, "mean_token_accuracy": 0.8766216039657593, "step": 1534 }, { "epoch": 4.827044025157233, "grad_norm": 0.12718960642814636, "learning_rate": 2.388515943905925e-05, "loss": 0.2495, "mean_token_accuracy": 0.8845596313476562, "step": 1535 }, { "epoch": 4.830188679245283, "grad_norm": 0.12057467550039291, "learning_rate": 2.3866923158389018e-05, "loss": 0.2325, "mean_token_accuracy": 0.8902129530906677, "step": 1536 }, { "epoch": 4.833333333333333, "grad_norm": 0.13578729331493378, "learning_rate": 2.3848684940416402e-05, "loss": 0.2785, "mean_token_accuracy": 0.8787068128585815, "step": 1537 }, { "epoch": 4.836477987421384, "grad_norm": 0.12010408192873001, "learning_rate": 2.383044480406717e-05, "loss": 0.2825, "mean_token_accuracy": 0.8767572641372681, "step": 1538 }, { "epoch": 4.839622641509434, "grad_norm": 0.12753932178020477, "learning_rate": 2.381220276826907e-05, "loss": 0.2458, "mean_token_accuracy": 0.8807628750801086, "step": 1539 }, { "epoch": 4.8427672955974845, "grad_norm": 0.12194069474935532, "learning_rate": 2.3793958851951828e-05, "loss": 0.2588, "mean_token_accuracy": 0.8817402124404907, "step": 1540 }, { "epoch": 4.845911949685535, "grad_norm": 0.1362626850605011, "learning_rate": 2.377571307404712e-05, "loss": 0.2514, "mean_token_accuracy": 0.8836599588394165, "step": 1541 }, { "epoch": 4.849056603773585, "grad_norm": 0.129854217171669, "learning_rate": 2.3757465453488557e-05, "loss": 0.262, "mean_token_accuracy": 0.8794586062431335, "step": 1542 }, { "epoch": 4.852201257861635, "grad_norm": 0.12912380695343018, "learning_rate": 2.3739216009211644e-05, "loss": 0.2602, "mean_token_accuracy": 0.8814219832420349, "step": 1543 }, { "epoch": 4.855345911949685, "grad_norm": 0.12774620950222015, "learning_rate": 2.37209647601538e-05, "loss": 0.2629, "mean_token_accuracy": 0.881087064743042, "step": 1544 }, { "epoch": 4.8584905660377355, "grad_norm": 0.1410323977470398, "learning_rate": 2.370271172525431e-05, "loss": 0.2469, "mean_token_accuracy": 0.8836389780044556, "step": 1545 }, { "epoch": 4.861635220125786, "grad_norm": 0.12685927748680115, "learning_rate": 2.3684456923454316e-05, "loss": 0.2493, "mean_token_accuracy": 0.8849629759788513, "step": 1546 }, { "epoch": 4.864779874213837, "grad_norm": 0.1375240981578827, "learning_rate": 2.3666200373696785e-05, "loss": 0.2671, "mean_token_accuracy": 0.881800651550293, "step": 1547 }, { "epoch": 4.867924528301887, "grad_norm": 0.128312885761261, "learning_rate": 2.36479420949265e-05, "loss": 0.2664, "mean_token_accuracy": 0.8808472752571106, "step": 1548 }, { "epoch": 4.871069182389937, "grad_norm": 0.13644251227378845, "learning_rate": 2.3629682106090036e-05, "loss": 0.2468, "mean_token_accuracy": 0.883743405342102, "step": 1549 }, { "epoch": 4.8742138364779874, "grad_norm": 0.143430694937706, "learning_rate": 2.361142042613576e-05, "loss": 0.2463, "mean_token_accuracy": 0.8866894245147705, "step": 1550 }, { "epoch": 4.877358490566038, "grad_norm": 0.12638427317142487, "learning_rate": 2.3593157074013768e-05, "loss": 0.2547, "mean_token_accuracy": 0.881319522857666, "step": 1551 }, { "epoch": 4.880503144654088, "grad_norm": 0.1527438908815384, "learning_rate": 2.35748920686759e-05, "loss": 0.2457, "mean_token_accuracy": 0.8852053880691528, "step": 1552 }, { "epoch": 4.883647798742138, "grad_norm": 0.12867127358913422, "learning_rate": 2.355662542907573e-05, "loss": 0.2653, "mean_token_accuracy": 0.8813451528549194, "step": 1553 }, { "epoch": 4.886792452830189, "grad_norm": 0.14862971007823944, "learning_rate": 2.3538357174168497e-05, "loss": 0.2605, "mean_token_accuracy": 0.8801792860031128, "step": 1554 }, { "epoch": 4.889937106918239, "grad_norm": 0.13223330676555634, "learning_rate": 2.352008732291115e-05, "loss": 0.2311, "mean_token_accuracy": 0.8890978097915649, "step": 1555 }, { "epoch": 4.8930817610062896, "grad_norm": 0.13772611320018768, "learning_rate": 2.3501815894262265e-05, "loss": 0.2641, "mean_token_accuracy": 0.8789441585540771, "step": 1556 }, { "epoch": 4.89622641509434, "grad_norm": 0.1223578229546547, "learning_rate": 2.3483542907182066e-05, "loss": 0.2585, "mean_token_accuracy": 0.8826157450675964, "step": 1557 }, { "epoch": 4.89937106918239, "grad_norm": 0.13819842040538788, "learning_rate": 2.3465268380632394e-05, "loss": 0.2657, "mean_token_accuracy": 0.880345344543457, "step": 1558 }, { "epoch": 4.90251572327044, "grad_norm": 0.12548887729644775, "learning_rate": 2.344699233357669e-05, "loss": 0.2589, "mean_token_accuracy": 0.8830808997154236, "step": 1559 }, { "epoch": 4.90566037735849, "grad_norm": 0.13984771072864532, "learning_rate": 2.342871478497998e-05, "loss": 0.2667, "mean_token_accuracy": 0.8818701505661011, "step": 1560 }, { "epoch": 4.908805031446541, "grad_norm": 0.12641951441764832, "learning_rate": 2.341043575380883e-05, "loss": 0.2563, "mean_token_accuracy": 0.883787214756012, "step": 1561 }, { "epoch": 4.911949685534591, "grad_norm": 0.14930589497089386, "learning_rate": 2.339215525903135e-05, "loss": 0.2669, "mean_token_accuracy": 0.8794329762458801, "step": 1562 }, { "epoch": 4.915094339622642, "grad_norm": 0.1343175768852234, "learning_rate": 2.337387331961718e-05, "loss": 0.2506, "mean_token_accuracy": 0.8854690790176392, "step": 1563 }, { "epoch": 4.918238993710692, "grad_norm": 0.13801869750022888, "learning_rate": 2.3355589954537448e-05, "loss": 0.2556, "mean_token_accuracy": 0.8809491395950317, "step": 1564 }, { "epoch": 4.921383647798742, "grad_norm": 0.13780134916305542, "learning_rate": 2.3337305182764768e-05, "loss": 0.2551, "mean_token_accuracy": 0.8815582394599915, "step": 1565 }, { "epoch": 4.9245283018867925, "grad_norm": 0.12479320913553238, "learning_rate": 2.3319019023273203e-05, "loss": 0.2442, "mean_token_accuracy": 0.8860586881637573, "step": 1566 }, { "epoch": 4.927672955974843, "grad_norm": 0.14212225377559662, "learning_rate": 2.3300731495038272e-05, "loss": 0.2428, "mean_token_accuracy": 0.8860717415809631, "step": 1567 }, { "epoch": 4.930817610062893, "grad_norm": 0.12582793831825256, "learning_rate": 2.3282442617036906e-05, "loss": 0.2483, "mean_token_accuracy": 0.8846839070320129, "step": 1568 }, { "epoch": 4.933962264150943, "grad_norm": 0.1322171986103058, "learning_rate": 2.3264152408247433e-05, "loss": 0.2527, "mean_token_accuracy": 0.8855623602867126, "step": 1569 }, { "epoch": 4.937106918238994, "grad_norm": 0.1252465844154358, "learning_rate": 2.3245860887649564e-05, "loss": 0.2588, "mean_token_accuracy": 0.8804665207862854, "step": 1570 }, { "epoch": 4.940251572327044, "grad_norm": 0.12859833240509033, "learning_rate": 2.3227568074224376e-05, "loss": 0.2635, "mean_token_accuracy": 0.8824833035469055, "step": 1571 }, { "epoch": 4.943396226415095, "grad_norm": 0.12550607323646545, "learning_rate": 2.3209273986954287e-05, "loss": 0.2491, "mean_token_accuracy": 0.886185884475708, "step": 1572 }, { "epoch": 4.946540880503145, "grad_norm": 0.1253989338874817, "learning_rate": 2.319097864482302e-05, "loss": 0.2512, "mean_token_accuracy": 0.8835926651954651, "step": 1573 }, { "epoch": 4.949685534591195, "grad_norm": 0.12692295014858246, "learning_rate": 2.3172682066815636e-05, "loss": 0.2585, "mean_token_accuracy": 0.880564272403717, "step": 1574 }, { "epoch": 4.952830188679245, "grad_norm": 0.1225670799612999, "learning_rate": 2.3154384271918443e-05, "loss": 0.2451, "mean_token_accuracy": 0.8834196329116821, "step": 1575 }, { "epoch": 4.955974842767295, "grad_norm": 0.11881422251462936, "learning_rate": 2.3136085279119033e-05, "loss": 0.2651, "mean_token_accuracy": 0.879045307636261, "step": 1576 }, { "epoch": 4.959119496855346, "grad_norm": 0.12771472334861755, "learning_rate": 2.3117785107406224e-05, "loss": 0.2527, "mean_token_accuracy": 0.883535623550415, "step": 1577 }, { "epoch": 4.962264150943396, "grad_norm": 0.12338591367006302, "learning_rate": 2.3099483775770078e-05, "loss": 0.2542, "mean_token_accuracy": 0.8821091055870056, "step": 1578 }, { "epoch": 4.965408805031447, "grad_norm": 0.12770599126815796, "learning_rate": 2.308118130320184e-05, "loss": 0.2622, "mean_token_accuracy": 0.8797197937965393, "step": 1579 }, { "epoch": 4.968553459119497, "grad_norm": 0.12757545709609985, "learning_rate": 2.3062877708693957e-05, "loss": 0.2512, "mean_token_accuracy": 0.8837404847145081, "step": 1580 }, { "epoch": 4.971698113207547, "grad_norm": 0.13121959567070007, "learning_rate": 2.3044573011240025e-05, "loss": 0.2521, "mean_token_accuracy": 0.8823564052581787, "step": 1581 }, { "epoch": 4.9748427672955975, "grad_norm": 0.12287531793117523, "learning_rate": 2.30262672298348e-05, "loss": 0.2531, "mean_token_accuracy": 0.8842577338218689, "step": 1582 }, { "epoch": 4.977987421383648, "grad_norm": 0.13713645935058594, "learning_rate": 2.3007960383474148e-05, "loss": 0.2495, "mean_token_accuracy": 0.8849415183067322, "step": 1583 }, { "epoch": 4.981132075471698, "grad_norm": 0.1428612619638443, "learning_rate": 2.2989652491155043e-05, "loss": 0.2563, "mean_token_accuracy": 0.8826706409454346, "step": 1584 }, { "epoch": 4.984276729559748, "grad_norm": 0.12580153346061707, "learning_rate": 2.297134357187556e-05, "loss": 0.2639, "mean_token_accuracy": 0.878861665725708, "step": 1585 }, { "epoch": 4.987421383647799, "grad_norm": 0.13792036473751068, "learning_rate": 2.2953033644634813e-05, "loss": 0.2754, "mean_token_accuracy": 0.8782982230186462, "step": 1586 }, { "epoch": 4.990566037735849, "grad_norm": 0.13837353885173798, "learning_rate": 2.293472272843299e-05, "loss": 0.2526, "mean_token_accuracy": 0.8833487033843994, "step": 1587 }, { "epoch": 4.9937106918239, "grad_norm": 0.14238163828849792, "learning_rate": 2.2916410842271274e-05, "loss": 0.2435, "mean_token_accuracy": 0.8903558254241943, "step": 1588 }, { "epoch": 4.99685534591195, "grad_norm": 0.14869339764118195, "learning_rate": 2.2898098005151893e-05, "loss": 0.2322, "mean_token_accuracy": 0.8898493647575378, "step": 1589 }, { "epoch": 5.0, "grad_norm": 0.14632923901081085, "learning_rate": 2.2879784236078023e-05, "loss": 0.2493, "mean_token_accuracy": 0.8759793639183044, "step": 1590 }, { "epoch": 5.00314465408805, "grad_norm": 0.22233015298843384, "learning_rate": 2.286146955405384e-05, "loss": 0.1916, "mean_token_accuracy": 0.9095132946968079, "step": 1591 }, { "epoch": 5.0062893081761, "grad_norm": 0.25025674700737, "learning_rate": 2.2843153978084445e-05, "loss": 0.1819, "mean_token_accuracy": 0.9113137125968933, "step": 1592 }, { "epoch": 5.009433962264151, "grad_norm": 0.2684067189693451, "learning_rate": 2.282483752717587e-05, "loss": 0.1972, "mean_token_accuracy": 0.9079714417457581, "step": 1593 }, { "epoch": 5.012578616352202, "grad_norm": 0.24029596149921417, "learning_rate": 2.2806520220335066e-05, "loss": 0.1678, "mean_token_accuracy": 0.9172501564025879, "step": 1594 }, { "epoch": 5.015723270440252, "grad_norm": 0.2321806699037552, "learning_rate": 2.2788202076569866e-05, "loss": 0.1724, "mean_token_accuracy": 0.9155396223068237, "step": 1595 }, { "epoch": 5.018867924528302, "grad_norm": 0.18021827936172485, "learning_rate": 2.2769883114888964e-05, "loss": 0.1703, "mean_token_accuracy": 0.9156048893928528, "step": 1596 }, { "epoch": 5.022012578616352, "grad_norm": 0.23715201020240784, "learning_rate": 2.2751563354301915e-05, "loss": 0.1776, "mean_token_accuracy": 0.9144327044487, "step": 1597 }, { "epoch": 5.0251572327044025, "grad_norm": 0.16609273850917816, "learning_rate": 2.273324281381909e-05, "loss": 0.1682, "mean_token_accuracy": 0.9172971844673157, "step": 1598 }, { "epoch": 5.028301886792453, "grad_norm": 0.1903139054775238, "learning_rate": 2.271492151245169e-05, "loss": 0.1598, "mean_token_accuracy": 0.917824387550354, "step": 1599 }, { "epoch": 5.031446540880503, "grad_norm": 0.17430105805397034, "learning_rate": 2.269659946921168e-05, "loss": 0.1585, "mean_token_accuracy": 0.9176533818244934, "step": 1600 }, { "epoch": 5.034591194968553, "grad_norm": 0.18532267212867737, "learning_rate": 2.2678276703111814e-05, "loss": 0.1731, "mean_token_accuracy": 0.9151170253753662, "step": 1601 }, { "epoch": 5.037735849056604, "grad_norm": 0.15971753001213074, "learning_rate": 2.2659953233165583e-05, "loss": 0.1707, "mean_token_accuracy": 0.915444552898407, "step": 1602 }, { "epoch": 5.040880503144654, "grad_norm": 0.1557256132364273, "learning_rate": 2.2641629078387224e-05, "loss": 0.1737, "mean_token_accuracy": 0.9138659238815308, "step": 1603 }, { "epoch": 5.044025157232705, "grad_norm": 0.16258005797863007, "learning_rate": 2.2623304257791667e-05, "loss": 0.1739, "mean_token_accuracy": 0.91518634557724, "step": 1604 }, { "epoch": 5.047169811320755, "grad_norm": 0.149249866604805, "learning_rate": 2.260497879039455e-05, "loss": 0.188, "mean_token_accuracy": 0.9087100625038147, "step": 1605 }, { "epoch": 5.050314465408805, "grad_norm": 0.17389704287052155, "learning_rate": 2.2586652695212158e-05, "loss": 0.1846, "mean_token_accuracy": 0.9124392867088318, "step": 1606 }, { "epoch": 5.053459119496855, "grad_norm": 0.1585531234741211, "learning_rate": 2.2568325991261456e-05, "loss": 0.1816, "mean_token_accuracy": 0.9147888422012329, "step": 1607 }, { "epoch": 5.056603773584905, "grad_norm": 0.14732636511325836, "learning_rate": 2.254999869756002e-05, "loss": 0.1642, "mean_token_accuracy": 0.9162658452987671, "step": 1608 }, { "epoch": 5.059748427672956, "grad_norm": 0.14769305288791656, "learning_rate": 2.2531670833126056e-05, "loss": 0.1838, "mean_token_accuracy": 0.913840651512146, "step": 1609 }, { "epoch": 5.062893081761007, "grad_norm": 0.1692722588777542, "learning_rate": 2.2513342416978347e-05, "loss": 0.1761, "mean_token_accuracy": 0.9122536778450012, "step": 1610 }, { "epoch": 5.066037735849057, "grad_norm": 0.14742763340473175, "learning_rate": 2.2495013468136248e-05, "loss": 0.1708, "mean_token_accuracy": 0.9144207239151001, "step": 1611 }, { "epoch": 5.069182389937107, "grad_norm": 0.14860284328460693, "learning_rate": 2.2476684005619675e-05, "loss": 0.1656, "mean_token_accuracy": 0.9158710241317749, "step": 1612 }, { "epoch": 5.072327044025157, "grad_norm": 0.14176800847053528, "learning_rate": 2.2458354048449074e-05, "loss": 0.1819, "mean_token_accuracy": 0.9101191759109497, "step": 1613 }, { "epoch": 5.0754716981132075, "grad_norm": 0.14405949413776398, "learning_rate": 2.2440023615645404e-05, "loss": 0.181, "mean_token_accuracy": 0.9110443592071533, "step": 1614 }, { "epoch": 5.078616352201258, "grad_norm": 0.13683363795280457, "learning_rate": 2.242169272623012e-05, "loss": 0.1714, "mean_token_accuracy": 0.9149702191352844, "step": 1615 }, { "epoch": 5.081761006289308, "grad_norm": 0.13986143469810486, "learning_rate": 2.240336139922515e-05, "loss": 0.1745, "mean_token_accuracy": 0.9130762219429016, "step": 1616 }, { "epoch": 5.084905660377358, "grad_norm": 0.1361878365278244, "learning_rate": 2.2385029653652868e-05, "loss": 0.1701, "mean_token_accuracy": 0.9145857691764832, "step": 1617 }, { "epoch": 5.088050314465409, "grad_norm": 0.1427004188299179, "learning_rate": 2.23666975085361e-05, "loss": 0.1744, "mean_token_accuracy": 0.9139806032180786, "step": 1618 }, { "epoch": 5.091194968553459, "grad_norm": 0.1479874551296234, "learning_rate": 2.2348364982898075e-05, "loss": 0.1791, "mean_token_accuracy": 0.9146347045898438, "step": 1619 }, { "epoch": 5.09433962264151, "grad_norm": 0.13859923183918, "learning_rate": 2.2330032095762406e-05, "loss": 0.1758, "mean_token_accuracy": 0.9162372350692749, "step": 1620 }, { "epoch": 5.09748427672956, "grad_norm": 0.14057232439517975, "learning_rate": 2.2311698866153103e-05, "loss": 0.1773, "mean_token_accuracy": 0.9151803255081177, "step": 1621 }, { "epoch": 5.10062893081761, "grad_norm": 0.1407633125782013, "learning_rate": 2.2293365313094514e-05, "loss": 0.1623, "mean_token_accuracy": 0.9161638021469116, "step": 1622 }, { "epoch": 5.10377358490566, "grad_norm": 0.14112257957458496, "learning_rate": 2.2275031455611332e-05, "loss": 0.1625, "mean_token_accuracy": 0.9182310104370117, "step": 1623 }, { "epoch": 5.1069182389937104, "grad_norm": 0.13362768292427063, "learning_rate": 2.225669731272857e-05, "loss": 0.1804, "mean_token_accuracy": 0.9142118096351624, "step": 1624 }, { "epoch": 5.110062893081761, "grad_norm": 0.1374814510345459, "learning_rate": 2.2238362903471525e-05, "loss": 0.1758, "mean_token_accuracy": 0.9115723371505737, "step": 1625 }, { "epoch": 5.113207547169812, "grad_norm": 0.13537247478961945, "learning_rate": 2.222002824686578e-05, "loss": 0.1618, "mean_token_accuracy": 0.9196138978004456, "step": 1626 }, { "epoch": 5.116352201257862, "grad_norm": 0.13499002158641815, "learning_rate": 2.2201693361937164e-05, "loss": 0.1802, "mean_token_accuracy": 0.9118757843971252, "step": 1627 }, { "epoch": 5.119496855345912, "grad_norm": 0.1436554193496704, "learning_rate": 2.218335826771176e-05, "loss": 0.1892, "mean_token_accuracy": 0.913590133190155, "step": 1628 }, { "epoch": 5.122641509433962, "grad_norm": 0.12786825001239777, "learning_rate": 2.216502298321585e-05, "loss": 0.1811, "mean_token_accuracy": 0.9122364521026611, "step": 1629 }, { "epoch": 5.1257861635220126, "grad_norm": 0.1332731992006302, "learning_rate": 2.2146687527475924e-05, "loss": 0.1716, "mean_token_accuracy": 0.9132739901542664, "step": 1630 }, { "epoch": 5.128930817610063, "grad_norm": 0.12857694923877716, "learning_rate": 2.2128351919518655e-05, "loss": 0.1814, "mean_token_accuracy": 0.9113665819168091, "step": 1631 }, { "epoch": 5.132075471698113, "grad_norm": 0.13038919866085052, "learning_rate": 2.211001617837085e-05, "loss": 0.1664, "mean_token_accuracy": 0.916091799736023, "step": 1632 }, { "epoch": 5.135220125786163, "grad_norm": 0.1276601105928421, "learning_rate": 2.2091680323059487e-05, "loss": 0.1762, "mean_token_accuracy": 0.9136810898780823, "step": 1633 }, { "epoch": 5.138364779874214, "grad_norm": 0.1319180130958557, "learning_rate": 2.2073344372611628e-05, "loss": 0.1737, "mean_token_accuracy": 0.9124643802642822, "step": 1634 }, { "epoch": 5.1415094339622645, "grad_norm": 0.13005971908569336, "learning_rate": 2.205500834605447e-05, "loss": 0.163, "mean_token_accuracy": 0.9175032377243042, "step": 1635 }, { "epoch": 5.144654088050315, "grad_norm": 0.12882187962532043, "learning_rate": 2.2036672262415265e-05, "loss": 0.1598, "mean_token_accuracy": 0.9175060391426086, "step": 1636 }, { "epoch": 5.147798742138365, "grad_norm": 0.1321483552455902, "learning_rate": 2.201833614072132e-05, "loss": 0.1992, "mean_token_accuracy": 0.9085779190063477, "step": 1637 }, { "epoch": 5.150943396226415, "grad_norm": 0.12345718592405319, "learning_rate": 2.2000000000000003e-05, "loss": 0.1769, "mean_token_accuracy": 0.9143081307411194, "step": 1638 }, { "epoch": 5.154088050314465, "grad_norm": 0.13488464057445526, "learning_rate": 2.1981663859278684e-05, "loss": 0.1526, "mean_token_accuracy": 0.9188219308853149, "step": 1639 }, { "epoch": 5.1572327044025155, "grad_norm": 0.129329651594162, "learning_rate": 2.1963327737584745e-05, "loss": 0.1589, "mean_token_accuracy": 0.9180588126182556, "step": 1640 }, { "epoch": 5.160377358490566, "grad_norm": 0.13226263225078583, "learning_rate": 2.194499165394554e-05, "loss": 0.1732, "mean_token_accuracy": 0.9152401685714722, "step": 1641 }, { "epoch": 5.163522012578617, "grad_norm": 0.1298309713602066, "learning_rate": 2.1926655627388378e-05, "loss": 0.1664, "mean_token_accuracy": 0.9183374047279358, "step": 1642 }, { "epoch": 5.166666666666667, "grad_norm": 0.13375401496887207, "learning_rate": 2.1908319676940522e-05, "loss": 0.164, "mean_token_accuracy": 0.9196304082870483, "step": 1643 }, { "epoch": 5.169811320754717, "grad_norm": 0.13043470680713654, "learning_rate": 2.1889983821629154e-05, "loss": 0.1738, "mean_token_accuracy": 0.9157758951187134, "step": 1644 }, { "epoch": 5.172955974842767, "grad_norm": 0.13828632235527039, "learning_rate": 2.1871648080481354e-05, "loss": 0.1681, "mean_token_accuracy": 0.9176316261291504, "step": 1645 }, { "epoch": 5.176100628930818, "grad_norm": 0.12848587334156036, "learning_rate": 2.185331247252408e-05, "loss": 0.1735, "mean_token_accuracy": 0.9150087237358093, "step": 1646 }, { "epoch": 5.179245283018868, "grad_norm": 0.13007590174674988, "learning_rate": 2.1834977016784155e-05, "loss": 0.2002, "mean_token_accuracy": 0.9054135084152222, "step": 1647 }, { "epoch": 5.182389937106918, "grad_norm": 0.1344444900751114, "learning_rate": 2.181664173228825e-05, "loss": 0.1761, "mean_token_accuracy": 0.9127033948898315, "step": 1648 }, { "epoch": 5.185534591194968, "grad_norm": 0.1295919120311737, "learning_rate": 2.1798306638062842e-05, "loss": 0.1714, "mean_token_accuracy": 0.9170975685119629, "step": 1649 }, { "epoch": 5.188679245283019, "grad_norm": 0.1406605988740921, "learning_rate": 2.177997175313423e-05, "loss": 0.1525, "mean_token_accuracy": 0.9223353266716003, "step": 1650 }, { "epoch": 5.1918238993710695, "grad_norm": 0.127178356051445, "learning_rate": 2.1761637096528477e-05, "loss": 0.1552, "mean_token_accuracy": 0.9182380437850952, "step": 1651 }, { "epoch": 5.19496855345912, "grad_norm": 0.13574561476707458, "learning_rate": 2.174330268727143e-05, "loss": 0.1718, "mean_token_accuracy": 0.9154348969459534, "step": 1652 }, { "epoch": 5.19811320754717, "grad_norm": 0.1314130425453186, "learning_rate": 2.172496854438867e-05, "loss": 0.1687, "mean_token_accuracy": 0.9156013131141663, "step": 1653 }, { "epoch": 5.20125786163522, "grad_norm": 0.13452252745628357, "learning_rate": 2.1706634686905495e-05, "loss": 0.161, "mean_token_accuracy": 0.9191825985908508, "step": 1654 }, { "epoch": 5.20440251572327, "grad_norm": 0.13440141081809998, "learning_rate": 2.1688301133846906e-05, "loss": 0.1809, "mean_token_accuracy": 0.9128972887992859, "step": 1655 }, { "epoch": 5.2075471698113205, "grad_norm": 0.12809255719184875, "learning_rate": 2.1669967904237603e-05, "loss": 0.1729, "mean_token_accuracy": 0.9155882000923157, "step": 1656 }, { "epoch": 5.210691823899371, "grad_norm": 0.1266845464706421, "learning_rate": 2.1651635017101934e-05, "loss": 0.1741, "mean_token_accuracy": 0.9155396223068237, "step": 1657 }, { "epoch": 5.213836477987422, "grad_norm": 0.1275676041841507, "learning_rate": 2.1633302491463905e-05, "loss": 0.1685, "mean_token_accuracy": 0.916077196598053, "step": 1658 }, { "epoch": 5.216981132075472, "grad_norm": 0.13584747910499573, "learning_rate": 2.1614970346347137e-05, "loss": 0.1769, "mean_token_accuracy": 0.9124895334243774, "step": 1659 }, { "epoch": 5.220125786163522, "grad_norm": 0.12740573287010193, "learning_rate": 2.1596638600774855e-05, "loss": 0.1623, "mean_token_accuracy": 0.9163357615470886, "step": 1660 }, { "epoch": 5.223270440251572, "grad_norm": 0.12712137401103973, "learning_rate": 2.1578307273769887e-05, "loss": 0.1706, "mean_token_accuracy": 0.9162405133247375, "step": 1661 }, { "epoch": 5.226415094339623, "grad_norm": 0.12990765273571014, "learning_rate": 2.1559976384354605e-05, "loss": 0.1547, "mean_token_accuracy": 0.9218593835830688, "step": 1662 }, { "epoch": 5.229559748427673, "grad_norm": 0.13632164895534515, "learning_rate": 2.1541645951550935e-05, "loss": 0.1727, "mean_token_accuracy": 0.9177563190460205, "step": 1663 }, { "epoch": 5.232704402515723, "grad_norm": 0.1234545111656189, "learning_rate": 2.1523315994380338e-05, "loss": 0.1638, "mean_token_accuracy": 0.9165735840797424, "step": 1664 }, { "epoch": 5.235849056603773, "grad_norm": 0.14210264384746552, "learning_rate": 2.1504986531863765e-05, "loss": 0.1689, "mean_token_accuracy": 0.9141770005226135, "step": 1665 }, { "epoch": 5.238993710691824, "grad_norm": 0.13037016987800598, "learning_rate": 2.148665758302167e-05, "loss": 0.1708, "mean_token_accuracy": 0.915972113609314, "step": 1666 }, { "epoch": 5.2421383647798745, "grad_norm": 0.1427024006843567, "learning_rate": 2.1468329166873953e-05, "loss": 0.1919, "mean_token_accuracy": 0.9098140597343445, "step": 1667 }, { "epoch": 5.245283018867925, "grad_norm": 0.12425141036510468, "learning_rate": 2.1450001302439984e-05, "loss": 0.1646, "mean_token_accuracy": 0.9146043658256531, "step": 1668 }, { "epoch": 5.248427672955975, "grad_norm": 0.13127084076404572, "learning_rate": 2.1431674008738553e-05, "loss": 0.1644, "mean_token_accuracy": 0.9161396026611328, "step": 1669 }, { "epoch": 5.251572327044025, "grad_norm": 0.12927378714084625, "learning_rate": 2.1413347304787854e-05, "loss": 0.1653, "mean_token_accuracy": 0.9174731373786926, "step": 1670 }, { "epoch": 5.254716981132075, "grad_norm": 0.13044291734695435, "learning_rate": 2.1395021209605464e-05, "loss": 0.1757, "mean_token_accuracy": 0.9150776863098145, "step": 1671 }, { "epoch": 5.2578616352201255, "grad_norm": 0.1301099807024002, "learning_rate": 2.1376695742208345e-05, "loss": 0.1895, "mean_token_accuracy": 0.9120878577232361, "step": 1672 }, { "epoch": 5.261006289308176, "grad_norm": 0.135678231716156, "learning_rate": 2.135837092161279e-05, "loss": 0.1811, "mean_token_accuracy": 0.9135366082191467, "step": 1673 }, { "epoch": 5.264150943396227, "grad_norm": 0.12597790360450745, "learning_rate": 2.1340046766834423e-05, "loss": 0.183, "mean_token_accuracy": 0.9111697673797607, "step": 1674 }, { "epoch": 5.267295597484277, "grad_norm": 0.1251760572195053, "learning_rate": 2.1321723296888198e-05, "loss": 0.1835, "mean_token_accuracy": 0.9082761406898499, "step": 1675 }, { "epoch": 5.270440251572327, "grad_norm": 0.12519238889217377, "learning_rate": 2.130340053078833e-05, "loss": 0.166, "mean_token_accuracy": 0.9158846735954285, "step": 1676 }, { "epoch": 5.273584905660377, "grad_norm": 0.1259630024433136, "learning_rate": 2.128507848754832e-05, "loss": 0.1748, "mean_token_accuracy": 0.9130755662918091, "step": 1677 }, { "epoch": 5.276729559748428, "grad_norm": 0.13136066496372223, "learning_rate": 2.1266757186180915e-05, "loss": 0.1796, "mean_token_accuracy": 0.9120888113975525, "step": 1678 }, { "epoch": 5.279874213836478, "grad_norm": 0.1244376003742218, "learning_rate": 2.1248436645698094e-05, "loss": 0.1751, "mean_token_accuracy": 0.9147348999977112, "step": 1679 }, { "epoch": 5.283018867924528, "grad_norm": 0.1334747076034546, "learning_rate": 2.1230116885111048e-05, "loss": 0.1773, "mean_token_accuracy": 0.9149150252342224, "step": 1680 }, { "epoch": 5.286163522012578, "grad_norm": 0.12724976241588593, "learning_rate": 2.1211797923430146e-05, "loss": 0.1748, "mean_token_accuracy": 0.9149224162101746, "step": 1681 }, { "epoch": 5.289308176100629, "grad_norm": 0.13602736592292786, "learning_rate": 2.119347977966494e-05, "loss": 0.1758, "mean_token_accuracy": 0.9136221408843994, "step": 1682 }, { "epoch": 5.2924528301886795, "grad_norm": 0.12472439557313919, "learning_rate": 2.117516247282414e-05, "loss": 0.1674, "mean_token_accuracy": 0.9162774682044983, "step": 1683 }, { "epoch": 5.29559748427673, "grad_norm": 0.1277409791946411, "learning_rate": 2.1156846021915568e-05, "loss": 0.1737, "mean_token_accuracy": 0.9161177277565002, "step": 1684 }, { "epoch": 5.29874213836478, "grad_norm": 0.13467463850975037, "learning_rate": 2.1138530445946167e-05, "loss": 0.1769, "mean_token_accuracy": 0.9126297235488892, "step": 1685 }, { "epoch": 5.30188679245283, "grad_norm": 0.13067930936813354, "learning_rate": 2.1120215763921982e-05, "loss": 0.1792, "mean_token_accuracy": 0.9156489968299866, "step": 1686 }, { "epoch": 5.30503144654088, "grad_norm": 0.13587665557861328, "learning_rate": 2.1101901994848113e-05, "loss": 0.1763, "mean_token_accuracy": 0.9159653186798096, "step": 1687 }, { "epoch": 5.3081761006289305, "grad_norm": 0.13601058721542358, "learning_rate": 2.1083589157728728e-05, "loss": 0.1758, "mean_token_accuracy": 0.9139232635498047, "step": 1688 }, { "epoch": 5.311320754716981, "grad_norm": 0.13745374977588654, "learning_rate": 2.1065277271567017e-05, "loss": 0.1679, "mean_token_accuracy": 0.9164733290672302, "step": 1689 }, { "epoch": 5.314465408805032, "grad_norm": 0.1407531201839447, "learning_rate": 2.104696635536519e-05, "loss": 0.1876, "mean_token_accuracy": 0.9122368693351746, "step": 1690 }, { "epoch": 5.317610062893082, "grad_norm": 0.1359671652317047, "learning_rate": 2.1028656428124442e-05, "loss": 0.1859, "mean_token_accuracy": 0.9110690355300903, "step": 1691 }, { "epoch": 5.320754716981132, "grad_norm": 0.13148832321166992, "learning_rate": 2.101034750884496e-05, "loss": 0.1611, "mean_token_accuracy": 0.9165538549423218, "step": 1692 }, { "epoch": 5.323899371069182, "grad_norm": 0.13991323113441467, "learning_rate": 2.0992039616525858e-05, "loss": 0.1793, "mean_token_accuracy": 0.9136778712272644, "step": 1693 }, { "epoch": 5.327044025157233, "grad_norm": 0.14232312142848969, "learning_rate": 2.09737327701652e-05, "loss": 0.1744, "mean_token_accuracy": 0.9157080054283142, "step": 1694 }, { "epoch": 5.330188679245283, "grad_norm": 0.12538862228393555, "learning_rate": 2.0955426988759978e-05, "loss": 0.1733, "mean_token_accuracy": 0.9145515561103821, "step": 1695 }, { "epoch": 5.333333333333333, "grad_norm": 0.14959652721881866, "learning_rate": 2.093712229130605e-05, "loss": 0.1776, "mean_token_accuracy": 0.9126619100570679, "step": 1696 }, { "epoch": 5.336477987421383, "grad_norm": 0.13184374570846558, "learning_rate": 2.0918818696798162e-05, "loss": 0.1756, "mean_token_accuracy": 0.9145973920822144, "step": 1697 }, { "epoch": 5.339622641509434, "grad_norm": 0.15026512742042542, "learning_rate": 2.0900516224229924e-05, "loss": 0.1735, "mean_token_accuracy": 0.9127100706100464, "step": 1698 }, { "epoch": 5.3427672955974845, "grad_norm": 0.12787233293056488, "learning_rate": 2.088221489259378e-05, "loss": 0.165, "mean_token_accuracy": 0.9178786873817444, "step": 1699 }, { "epoch": 5.345911949685535, "grad_norm": 0.14001305401325226, "learning_rate": 2.086391472088097e-05, "loss": 0.1707, "mean_token_accuracy": 0.916415810585022, "step": 1700 }, { "epoch": 5.349056603773585, "grad_norm": 0.13622647523880005, "learning_rate": 2.084561572808156e-05, "loss": 0.1764, "mean_token_accuracy": 0.9140340685844421, "step": 1701 }, { "epoch": 5.352201257861635, "grad_norm": 0.13659904897212982, "learning_rate": 2.0827317933184366e-05, "loss": 0.1679, "mean_token_accuracy": 0.9176750779151917, "step": 1702 }, { "epoch": 5.355345911949685, "grad_norm": 0.1333339959383011, "learning_rate": 2.0809021355176982e-05, "loss": 0.1776, "mean_token_accuracy": 0.9108352065086365, "step": 1703 }, { "epoch": 5.3584905660377355, "grad_norm": 0.1358334869146347, "learning_rate": 2.0790726013045722e-05, "loss": 0.1773, "mean_token_accuracy": 0.9096439480781555, "step": 1704 }, { "epoch": 5.361635220125786, "grad_norm": 0.13096484541893005, "learning_rate": 2.077243192577563e-05, "loss": 0.175, "mean_token_accuracy": 0.915550172328949, "step": 1705 }, { "epoch": 5.364779874213837, "grad_norm": 0.1310902237892151, "learning_rate": 2.0754139112350442e-05, "loss": 0.1847, "mean_token_accuracy": 0.9114246368408203, "step": 1706 }, { "epoch": 5.367924528301887, "grad_norm": 0.12905707955360413, "learning_rate": 2.0735847591752573e-05, "loss": 0.1707, "mean_token_accuracy": 0.9200414419174194, "step": 1707 }, { "epoch": 5.371069182389937, "grad_norm": 0.14065292477607727, "learning_rate": 2.07175573829631e-05, "loss": 0.1855, "mean_token_accuracy": 0.9103058576583862, "step": 1708 }, { "epoch": 5.3742138364779874, "grad_norm": 0.1275883913040161, "learning_rate": 2.069926850496173e-05, "loss": 0.1907, "mean_token_accuracy": 0.9100257158279419, "step": 1709 }, { "epoch": 5.377358490566038, "grad_norm": 0.12739497423171997, "learning_rate": 2.0680980976726803e-05, "loss": 0.1741, "mean_token_accuracy": 0.9135763049125671, "step": 1710 }, { "epoch": 5.380503144654088, "grad_norm": 0.13199791312217712, "learning_rate": 2.066269481723524e-05, "loss": 0.1763, "mean_token_accuracy": 0.9162415862083435, "step": 1711 }, { "epoch": 5.383647798742138, "grad_norm": 0.13175837695598602, "learning_rate": 2.0644410045462558e-05, "loss": 0.171, "mean_token_accuracy": 0.9147182703018188, "step": 1712 }, { "epoch": 5.386792452830189, "grad_norm": 0.12402436137199402, "learning_rate": 2.0626126680382827e-05, "loss": 0.1772, "mean_token_accuracy": 0.9118449091911316, "step": 1713 }, { "epoch": 5.389937106918239, "grad_norm": 0.13606113195419312, "learning_rate": 2.060784474096866e-05, "loss": 0.1791, "mean_token_accuracy": 0.9129552841186523, "step": 1714 }, { "epoch": 5.3930817610062896, "grad_norm": 0.12390584498643875, "learning_rate": 2.0589564246191175e-05, "loss": 0.1739, "mean_token_accuracy": 0.911918044090271, "step": 1715 }, { "epoch": 5.39622641509434, "grad_norm": 0.13724228739738464, "learning_rate": 2.057128521502002e-05, "loss": 0.1911, "mean_token_accuracy": 0.9111776947975159, "step": 1716 }, { "epoch": 5.39937106918239, "grad_norm": 0.1254095882177353, "learning_rate": 2.055300766642331e-05, "loss": 0.1768, "mean_token_accuracy": 0.9121692180633545, "step": 1717 }, { "epoch": 5.40251572327044, "grad_norm": 0.12512388825416565, "learning_rate": 2.0534731619367608e-05, "loss": 0.1692, "mean_token_accuracy": 0.9146745800971985, "step": 1718 }, { "epoch": 5.40566037735849, "grad_norm": 0.131209135055542, "learning_rate": 2.0516457092817946e-05, "loss": 0.1871, "mean_token_accuracy": 0.9084495306015015, "step": 1719 }, { "epoch": 5.408805031446541, "grad_norm": 0.12044340372085571, "learning_rate": 2.0498184105737744e-05, "loss": 0.185, "mean_token_accuracy": 0.9076829552650452, "step": 1720 }, { "epoch": 5.411949685534591, "grad_norm": 0.13348786532878876, "learning_rate": 2.047991267708886e-05, "loss": 0.1724, "mean_token_accuracy": 0.9145122170448303, "step": 1721 }, { "epoch": 5.415094339622642, "grad_norm": 0.132951557636261, "learning_rate": 2.0461642825831505e-05, "loss": 0.1833, "mean_token_accuracy": 0.9118179678916931, "step": 1722 }, { "epoch": 5.418238993710692, "grad_norm": 0.1334230601787567, "learning_rate": 2.0443374570924276e-05, "loss": 0.1851, "mean_token_accuracy": 0.9120337963104248, "step": 1723 }, { "epoch": 5.421383647798742, "grad_norm": 0.12848825752735138, "learning_rate": 2.04251079313241e-05, "loss": 0.1833, "mean_token_accuracy": 0.9081471562385559, "step": 1724 }, { "epoch": 5.4245283018867925, "grad_norm": 0.13386452198028564, "learning_rate": 2.0406842925986244e-05, "loss": 0.1887, "mean_token_accuracy": 0.9072440266609192, "step": 1725 }, { "epoch": 5.427672955974843, "grad_norm": 0.12583881616592407, "learning_rate": 2.0388579573864248e-05, "loss": 0.1695, "mean_token_accuracy": 0.9142518639564514, "step": 1726 }, { "epoch": 5.430817610062893, "grad_norm": 0.12993434071540833, "learning_rate": 2.0370317893909967e-05, "loss": 0.181, "mean_token_accuracy": 0.9098360538482666, "step": 1727 }, { "epoch": 5.433962264150943, "grad_norm": 0.13069041073322296, "learning_rate": 2.035205790507351e-05, "loss": 0.1693, "mean_token_accuracy": 0.9145693778991699, "step": 1728 }, { "epoch": 5.437106918238994, "grad_norm": 0.12848743796348572, "learning_rate": 2.0333799626303224e-05, "loss": 0.1598, "mean_token_accuracy": 0.9185672998428345, "step": 1729 }, { "epoch": 5.440251572327044, "grad_norm": 0.12107231467962265, "learning_rate": 2.031554307654569e-05, "loss": 0.1802, "mean_token_accuracy": 0.9108401536941528, "step": 1730 }, { "epoch": 5.443396226415095, "grad_norm": 0.1265760362148285, "learning_rate": 2.0297288274745694e-05, "loss": 0.1724, "mean_token_accuracy": 0.9128392338752747, "step": 1731 }, { "epoch": 5.446540880503145, "grad_norm": 0.1304648518562317, "learning_rate": 2.0279035239846204e-05, "loss": 0.1863, "mean_token_accuracy": 0.9115615487098694, "step": 1732 }, { "epoch": 5.449685534591195, "grad_norm": 0.12654311954975128, "learning_rate": 2.0260783990788365e-05, "loss": 0.1839, "mean_token_accuracy": 0.9107187986373901, "step": 1733 }, { "epoch": 5.452830188679245, "grad_norm": 0.12844638526439667, "learning_rate": 2.0242534546511456e-05, "loss": 0.1736, "mean_token_accuracy": 0.9131293296813965, "step": 1734 }, { "epoch": 5.455974842767295, "grad_norm": 0.12657323479652405, "learning_rate": 2.0224286925952885e-05, "loss": 0.1663, "mean_token_accuracy": 0.9194426536560059, "step": 1735 }, { "epoch": 5.459119496855346, "grad_norm": 0.12937375903129578, "learning_rate": 2.0206041148048177e-05, "loss": 0.1948, "mean_token_accuracy": 0.9075583219528198, "step": 1736 }, { "epoch": 5.462264150943396, "grad_norm": 0.12683670222759247, "learning_rate": 2.0187797231730933e-05, "loss": 0.1623, "mean_token_accuracy": 0.9198793172836304, "step": 1737 }, { "epoch": 5.465408805031447, "grad_norm": 0.12588712573051453, "learning_rate": 2.016955519593284e-05, "loss": 0.1909, "mean_token_accuracy": 0.9105217456817627, "step": 1738 }, { "epoch": 5.468553459119497, "grad_norm": 0.12617500126361847, "learning_rate": 2.0151315059583603e-05, "loss": 0.1721, "mean_token_accuracy": 0.915179431438446, "step": 1739 }, { "epoch": 5.471698113207547, "grad_norm": 0.12191683799028397, "learning_rate": 2.0133076841610987e-05, "loss": 0.1552, "mean_token_accuracy": 0.9211398959159851, "step": 1740 }, { "epoch": 5.4748427672955975, "grad_norm": 0.12133314460515976, "learning_rate": 2.011484056094075e-05, "loss": 0.1857, "mean_token_accuracy": 0.9101731777191162, "step": 1741 }, { "epoch": 5.477987421383648, "grad_norm": 0.12632842361927032, "learning_rate": 2.009660623649665e-05, "loss": 0.1752, "mean_token_accuracy": 0.9160427451133728, "step": 1742 }, { "epoch": 5.481132075471698, "grad_norm": 0.12612132728099823, "learning_rate": 2.0078373887200402e-05, "loss": 0.1484, "mean_token_accuracy": 0.9197033047676086, "step": 1743 }, { "epoch": 5.484276729559748, "grad_norm": 0.1312422752380371, "learning_rate": 2.0060143531971676e-05, "loss": 0.1624, "mean_token_accuracy": 0.9184711575508118, "step": 1744 }, { "epoch": 5.487421383647799, "grad_norm": 0.12046711146831512, "learning_rate": 2.0041915189728082e-05, "loss": 0.1493, "mean_token_accuracy": 0.9204055070877075, "step": 1745 }, { "epoch": 5.490566037735849, "grad_norm": 0.13251107931137085, "learning_rate": 2.0023688879385123e-05, "loss": 0.1806, "mean_token_accuracy": 0.9120727181434631, "step": 1746 }, { "epoch": 5.4937106918239, "grad_norm": 0.11958485841751099, "learning_rate": 2.000546461985622e-05, "loss": 0.1747, "mean_token_accuracy": 0.9143047332763672, "step": 1747 }, { "epoch": 5.49685534591195, "grad_norm": 0.12881681323051453, "learning_rate": 1.998724243005264e-05, "loss": 0.1825, "mean_token_accuracy": 0.9094586968421936, "step": 1748 }, { "epoch": 5.5, "grad_norm": 0.1296965628862381, "learning_rate": 1.99690223288835e-05, "loss": 0.1849, "mean_token_accuracy": 0.9092406630516052, "step": 1749 }, { "epoch": 5.50314465408805, "grad_norm": 0.1288004368543625, "learning_rate": 1.995080433525579e-05, "loss": 0.1914, "mean_token_accuracy": 0.9089462757110596, "step": 1750 }, { "epoch": 5.5062893081761, "grad_norm": 0.11885450780391693, "learning_rate": 1.9932588468074266e-05, "loss": 0.1653, "mean_token_accuracy": 0.9152488708496094, "step": 1751 }, { "epoch": 5.509433962264151, "grad_norm": 0.12399091571569443, "learning_rate": 1.9914374746241504e-05, "loss": 0.1836, "mean_token_accuracy": 0.9103871583938599, "step": 1752 }, { "epoch": 5.512578616352201, "grad_norm": 0.12655200064182281, "learning_rate": 1.9896163188657846e-05, "loss": 0.1718, "mean_token_accuracy": 0.9169356822967529, "step": 1753 }, { "epoch": 5.515723270440252, "grad_norm": 0.13177834451198578, "learning_rate": 1.9877953814221378e-05, "loss": 0.182, "mean_token_accuracy": 0.9119299650192261, "step": 1754 }, { "epoch": 5.518867924528302, "grad_norm": 0.12567467987537384, "learning_rate": 1.9859746641827945e-05, "loss": 0.1893, "mean_token_accuracy": 0.9095203876495361, "step": 1755 }, { "epoch": 5.522012578616352, "grad_norm": 0.12787534296512604, "learning_rate": 1.984154169037108e-05, "loss": 0.1861, "mean_token_accuracy": 0.9093389511108398, "step": 1756 }, { "epoch": 5.5251572327044025, "grad_norm": 0.12269425392150879, "learning_rate": 1.9823338978742038e-05, "loss": 0.1841, "mean_token_accuracy": 0.9123227000236511, "step": 1757 }, { "epoch": 5.528301886792453, "grad_norm": 0.12528327107429504, "learning_rate": 1.9805138525829724e-05, "loss": 0.176, "mean_token_accuracy": 0.9133109450340271, "step": 1758 }, { "epoch": 5.531446540880503, "grad_norm": 0.1250915825366974, "learning_rate": 1.978694035052072e-05, "loss": 0.1739, "mean_token_accuracy": 0.9134659171104431, "step": 1759 }, { "epoch": 5.534591194968553, "grad_norm": 0.1279488205909729, "learning_rate": 1.9768744471699234e-05, "loss": 0.1851, "mean_token_accuracy": 0.9121567606925964, "step": 1760 }, { "epoch": 5.537735849056604, "grad_norm": 0.12545357644557953, "learning_rate": 1.9750550908247087e-05, "loss": 0.1719, "mean_token_accuracy": 0.9140828251838684, "step": 1761 }, { "epoch": 5.540880503144654, "grad_norm": 0.12257158756256104, "learning_rate": 1.973235967904371e-05, "loss": 0.166, "mean_token_accuracy": 0.9187101125717163, "step": 1762 }, { "epoch": 5.544025157232705, "grad_norm": 0.12817668914794922, "learning_rate": 1.97141708029661e-05, "loss": 0.165, "mean_token_accuracy": 0.9164320230484009, "step": 1763 }, { "epoch": 5.547169811320755, "grad_norm": 0.11913950741291046, "learning_rate": 1.9695984298888815e-05, "loss": 0.1715, "mean_token_accuracy": 0.9155896902084351, "step": 1764 }, { "epoch": 5.550314465408805, "grad_norm": 0.12410813570022583, "learning_rate": 1.9677800185683957e-05, "loss": 0.1698, "mean_token_accuracy": 0.9146424531936646, "step": 1765 }, { "epoch": 5.553459119496855, "grad_norm": 0.12645314633846283, "learning_rate": 1.9659618482221142e-05, "loss": 0.1757, "mean_token_accuracy": 0.9114490747451782, "step": 1766 }, { "epoch": 5.556603773584905, "grad_norm": 0.1207004114985466, "learning_rate": 1.964143920736749e-05, "loss": 0.1799, "mean_token_accuracy": 0.9112210273742676, "step": 1767 }, { "epoch": 5.559748427672956, "grad_norm": 0.12781473994255066, "learning_rate": 1.96232623799876e-05, "loss": 0.1891, "mean_token_accuracy": 0.9095780253410339, "step": 1768 }, { "epoch": 5.562893081761006, "grad_norm": 0.12544502317905426, "learning_rate": 1.960508801894352e-05, "loss": 0.168, "mean_token_accuracy": 0.9177098274230957, "step": 1769 }, { "epoch": 5.566037735849057, "grad_norm": 0.13463300466537476, "learning_rate": 1.958691614309475e-05, "loss": 0.1853, "mean_token_accuracy": 0.9120203256607056, "step": 1770 }, { "epoch": 5.569182389937107, "grad_norm": 0.12383488565683365, "learning_rate": 1.9568746771298214e-05, "loss": 0.1787, "mean_token_accuracy": 0.9111815094947815, "step": 1771 }, { "epoch": 5.572327044025157, "grad_norm": 0.13381510972976685, "learning_rate": 1.9550579922408232e-05, "loss": 0.1758, "mean_token_accuracy": 0.9136260747909546, "step": 1772 }, { "epoch": 5.5754716981132075, "grad_norm": 0.11905031651258469, "learning_rate": 1.9532415615276497e-05, "loss": 0.1768, "mean_token_accuracy": 0.9129524827003479, "step": 1773 }, { "epoch": 5.578616352201258, "grad_norm": 0.12877804040908813, "learning_rate": 1.951425386875208e-05, "loss": 0.1911, "mean_token_accuracy": 0.9062658548355103, "step": 1774 }, { "epoch": 5.581761006289308, "grad_norm": 0.13063427805900574, "learning_rate": 1.949609470168139e-05, "loss": 0.1784, "mean_token_accuracy": 0.9149458408355713, "step": 1775 }, { "epoch": 5.584905660377358, "grad_norm": 0.1270115077495575, "learning_rate": 1.947793813290816e-05, "loss": 0.1811, "mean_token_accuracy": 0.911460816860199, "step": 1776 }, { "epoch": 5.588050314465409, "grad_norm": 0.12923474609851837, "learning_rate": 1.945978418127342e-05, "loss": 0.1786, "mean_token_accuracy": 0.9101095199584961, "step": 1777 }, { "epoch": 5.591194968553459, "grad_norm": 0.12055451422929764, "learning_rate": 1.944163286561549e-05, "loss": 0.1749, "mean_token_accuracy": 0.9124361872673035, "step": 1778 }, { "epoch": 5.59433962264151, "grad_norm": 0.13345621526241302, "learning_rate": 1.9423484204769955e-05, "loss": 0.1647, "mean_token_accuracy": 0.9183674454689026, "step": 1779 }, { "epoch": 5.59748427672956, "grad_norm": 0.1298670917749405, "learning_rate": 1.940533821756964e-05, "loss": 0.172, "mean_token_accuracy": 0.9127525091171265, "step": 1780 }, { "epoch": 5.60062893081761, "grad_norm": 0.12359672784805298, "learning_rate": 1.93871949228446e-05, "loss": 0.182, "mean_token_accuracy": 0.9089277386665344, "step": 1781 }, { "epoch": 5.60377358490566, "grad_norm": 0.12858720123767853, "learning_rate": 1.9369054339422102e-05, "loss": 0.1941, "mean_token_accuracy": 0.9098080992698669, "step": 1782 }, { "epoch": 5.6069182389937104, "grad_norm": 0.12454716861248016, "learning_rate": 1.935091648612658e-05, "loss": 0.1633, "mean_token_accuracy": 0.9196724891662598, "step": 1783 }, { "epoch": 5.610062893081761, "grad_norm": 0.1321592628955841, "learning_rate": 1.9332781381779657e-05, "loss": 0.1621, "mean_token_accuracy": 0.9194785952568054, "step": 1784 }, { "epoch": 5.613207547169811, "grad_norm": 0.12804855406284332, "learning_rate": 1.9314649045200098e-05, "loss": 0.187, "mean_token_accuracy": 0.912610650062561, "step": 1785 }, { "epoch": 5.616352201257862, "grad_norm": 0.12372619658708572, "learning_rate": 1.9296519495203778e-05, "loss": 0.176, "mean_token_accuracy": 0.9116061925888062, "step": 1786 }, { "epoch": 5.619496855345912, "grad_norm": 0.12314292788505554, "learning_rate": 1.9278392750603704e-05, "loss": 0.184, "mean_token_accuracy": 0.9096059203147888, "step": 1787 }, { "epoch": 5.622641509433962, "grad_norm": 0.12165670096874237, "learning_rate": 1.9260268830209963e-05, "loss": 0.168, "mean_token_accuracy": 0.9138624668121338, "step": 1788 }, { "epoch": 5.6257861635220126, "grad_norm": 0.1263875663280487, "learning_rate": 1.924214775282971e-05, "loss": 0.1991, "mean_token_accuracy": 0.906342089176178, "step": 1789 }, { "epoch": 5.628930817610063, "grad_norm": 0.12722733616828918, "learning_rate": 1.9224029537267147e-05, "loss": 0.178, "mean_token_accuracy": 0.9143258929252625, "step": 1790 }, { "epoch": 5.632075471698113, "grad_norm": 0.12726683914661407, "learning_rate": 1.9205914202323506e-05, "loss": 0.1733, "mean_token_accuracy": 0.9135202765464783, "step": 1791 }, { "epoch": 5.635220125786163, "grad_norm": 0.13158226013183594, "learning_rate": 1.9187801766797042e-05, "loss": 0.1799, "mean_token_accuracy": 0.9105492830276489, "step": 1792 }, { "epoch": 5.638364779874214, "grad_norm": 0.1274365484714508, "learning_rate": 1.9169692249482992e-05, "loss": 0.1552, "mean_token_accuracy": 0.9198361039161682, "step": 1793 }, { "epoch": 5.6415094339622645, "grad_norm": 0.12876805663108826, "learning_rate": 1.915158566917356e-05, "loss": 0.1762, "mean_token_accuracy": 0.9127059578895569, "step": 1794 }, { "epoch": 5.644654088050315, "grad_norm": 0.12105433642864227, "learning_rate": 1.9133482044657904e-05, "loss": 0.17, "mean_token_accuracy": 0.9126316905021667, "step": 1795 }, { "epoch": 5.647798742138365, "grad_norm": 0.12377595901489258, "learning_rate": 1.911538139472213e-05, "loss": 0.1673, "mean_token_accuracy": 0.9142662882804871, "step": 1796 }, { "epoch": 5.650943396226415, "grad_norm": 0.12388768792152405, "learning_rate": 1.9097283738149238e-05, "loss": 0.1675, "mean_token_accuracy": 0.9142235517501831, "step": 1797 }, { "epoch": 5.654088050314465, "grad_norm": 0.12548349797725677, "learning_rate": 1.9079189093719136e-05, "loss": 0.1847, "mean_token_accuracy": 0.9101769328117371, "step": 1798 }, { "epoch": 5.6572327044025155, "grad_norm": 0.12843972444534302, "learning_rate": 1.906109748020859e-05, "loss": 0.1691, "mean_token_accuracy": 0.9158428311347961, "step": 1799 }, { "epoch": 5.660377358490566, "grad_norm": 0.12968730926513672, "learning_rate": 1.904300891639124e-05, "loss": 0.1724, "mean_token_accuracy": 0.9115962386131287, "step": 1800 }, { "epoch": 5.663522012578616, "grad_norm": 0.13056190311908722, "learning_rate": 1.9024923421037548e-05, "loss": 0.1788, "mean_token_accuracy": 0.9096761345863342, "step": 1801 }, { "epoch": 5.666666666666667, "grad_norm": 0.13141171634197235, "learning_rate": 1.9006841012914797e-05, "loss": 0.1787, "mean_token_accuracy": 0.9131348729133606, "step": 1802 }, { "epoch": 5.669811320754717, "grad_norm": 0.14636099338531494, "learning_rate": 1.8988761710787064e-05, "loss": 0.1755, "mean_token_accuracy": 0.910908043384552, "step": 1803 }, { "epoch": 5.672955974842767, "grad_norm": 0.13519425690174103, "learning_rate": 1.8970685533415197e-05, "loss": 0.1797, "mean_token_accuracy": 0.9129673838615417, "step": 1804 }, { "epoch": 5.676100628930818, "grad_norm": 0.1360684335231781, "learning_rate": 1.8952612499556825e-05, "loss": 0.1927, "mean_token_accuracy": 0.9077761769294739, "step": 1805 }, { "epoch": 5.679245283018868, "grad_norm": 0.13671882450580597, "learning_rate": 1.893454262796629e-05, "loss": 0.1778, "mean_token_accuracy": 0.9140453338623047, "step": 1806 }, { "epoch": 5.682389937106918, "grad_norm": 0.13327039778232574, "learning_rate": 1.8916475937394662e-05, "loss": 0.1719, "mean_token_accuracy": 0.9152608513832092, "step": 1807 }, { "epoch": 5.685534591194968, "grad_norm": 0.14007729291915894, "learning_rate": 1.8898412446589707e-05, "loss": 0.1847, "mean_token_accuracy": 0.9102155566215515, "step": 1808 }, { "epoch": 5.688679245283019, "grad_norm": 0.12568777799606323, "learning_rate": 1.8880352174295872e-05, "loss": 0.1674, "mean_token_accuracy": 0.9137527346611023, "step": 1809 }, { "epoch": 5.6918238993710695, "grad_norm": 0.12971539795398712, "learning_rate": 1.886229513925427e-05, "loss": 0.1766, "mean_token_accuracy": 0.9112458229064941, "step": 1810 }, { "epoch": 5.69496855345912, "grad_norm": 0.12138007581233978, "learning_rate": 1.8844241360202645e-05, "loss": 0.1816, "mean_token_accuracy": 0.9118508696556091, "step": 1811 }, { "epoch": 5.69811320754717, "grad_norm": 0.13702593743801117, "learning_rate": 1.8826190855875367e-05, "loss": 0.1659, "mean_token_accuracy": 0.9177764058113098, "step": 1812 }, { "epoch": 5.70125786163522, "grad_norm": 0.12681394815444946, "learning_rate": 1.8808143645003417e-05, "loss": 0.1936, "mean_token_accuracy": 0.907345712184906, "step": 1813 }, { "epoch": 5.70440251572327, "grad_norm": 0.12529529631137848, "learning_rate": 1.8790099746314335e-05, "loss": 0.1643, "mean_token_accuracy": 0.9167233109474182, "step": 1814 }, { "epoch": 5.7075471698113205, "grad_norm": 0.12051253020763397, "learning_rate": 1.877205917853225e-05, "loss": 0.1693, "mean_token_accuracy": 0.9154717326164246, "step": 1815 }, { "epoch": 5.710691823899371, "grad_norm": 0.1212969496846199, "learning_rate": 1.8754021960377812e-05, "loss": 0.1735, "mean_token_accuracy": 0.9160333275794983, "step": 1816 }, { "epoch": 5.713836477987421, "grad_norm": 0.12194069474935532, "learning_rate": 1.8735988110568214e-05, "loss": 0.1646, "mean_token_accuracy": 0.9180842041969299, "step": 1817 }, { "epoch": 5.716981132075472, "grad_norm": 0.12107089906930923, "learning_rate": 1.8717957647817142e-05, "loss": 0.1635, "mean_token_accuracy": 0.9168097972869873, "step": 1818 }, { "epoch": 5.720125786163522, "grad_norm": 0.1317150741815567, "learning_rate": 1.8699930590834774e-05, "loss": 0.1687, "mean_token_accuracy": 0.9154680967330933, "step": 1819 }, { "epoch": 5.723270440251572, "grad_norm": 0.12018724530935287, "learning_rate": 1.8681906958327748e-05, "loss": 0.1571, "mean_token_accuracy": 0.9167401194572449, "step": 1820 }, { "epoch": 5.726415094339623, "grad_norm": 0.12499828636646271, "learning_rate": 1.8663886768999144e-05, "loss": 0.1754, "mean_token_accuracy": 0.9121738076210022, "step": 1821 }, { "epoch": 5.729559748427673, "grad_norm": 0.1199810653924942, "learning_rate": 1.864587004154849e-05, "loss": 0.158, "mean_token_accuracy": 0.9177903532981873, "step": 1822 }, { "epoch": 5.732704402515723, "grad_norm": 0.12288343161344528, "learning_rate": 1.86278567946717e-05, "loss": 0.18, "mean_token_accuracy": 0.911756694316864, "step": 1823 }, { "epoch": 5.735849056603773, "grad_norm": 0.11919686943292618, "learning_rate": 1.860984704706108e-05, "loss": 0.1709, "mean_token_accuracy": 0.9164305925369263, "step": 1824 }, { "epoch": 5.738993710691824, "grad_norm": 0.12749914824962616, "learning_rate": 1.8591840817405317e-05, "loss": 0.1828, "mean_token_accuracy": 0.9121153950691223, "step": 1825 }, { "epoch": 5.7421383647798745, "grad_norm": 0.11674331873655319, "learning_rate": 1.8573838124389433e-05, "loss": 0.1784, "mean_token_accuracy": 0.9080796837806702, "step": 1826 }, { "epoch": 5.745283018867925, "grad_norm": 0.12508632242679596, "learning_rate": 1.855583898669479e-05, "loss": 0.1792, "mean_token_accuracy": 0.9131340980529785, "step": 1827 }, { "epoch": 5.748427672955975, "grad_norm": 0.12435121834278107, "learning_rate": 1.8537843422999057e-05, "loss": 0.171, "mean_token_accuracy": 0.9142288565635681, "step": 1828 }, { "epoch": 5.751572327044025, "grad_norm": 0.12231216579675674, "learning_rate": 1.8519851451976182e-05, "loss": 0.1597, "mean_token_accuracy": 0.9187856316566467, "step": 1829 }, { "epoch": 5.754716981132075, "grad_norm": 0.1260603815317154, "learning_rate": 1.8501863092296415e-05, "loss": 0.18, "mean_token_accuracy": 0.9122654795646667, "step": 1830 }, { "epoch": 5.7578616352201255, "grad_norm": 0.12207242846488953, "learning_rate": 1.8483878362626227e-05, "loss": 0.1543, "mean_token_accuracy": 0.9220340251922607, "step": 1831 }, { "epoch": 5.761006289308176, "grad_norm": 0.12331138551235199, "learning_rate": 1.8465897281628345e-05, "loss": 0.1837, "mean_token_accuracy": 0.9105554223060608, "step": 1832 }, { "epoch": 5.764150943396227, "grad_norm": 0.1287182718515396, "learning_rate": 1.8447919867961697e-05, "loss": 0.157, "mean_token_accuracy": 0.9220572710037231, "step": 1833 }, { "epoch": 5.767295597484277, "grad_norm": 0.1238393783569336, "learning_rate": 1.84299461402814e-05, "loss": 0.1662, "mean_token_accuracy": 0.915483295917511, "step": 1834 }, { "epoch": 5.770440251572327, "grad_norm": 0.12729449570178986, "learning_rate": 1.8411976117238772e-05, "loss": 0.1559, "mean_token_accuracy": 0.9172618985176086, "step": 1835 }, { "epoch": 5.773584905660377, "grad_norm": 0.1212867870926857, "learning_rate": 1.8394009817481257e-05, "loss": 0.183, "mean_token_accuracy": 0.9098618030548096, "step": 1836 }, { "epoch": 5.776729559748428, "grad_norm": 0.1259167343378067, "learning_rate": 1.8376047259652448e-05, "loss": 0.1708, "mean_token_accuracy": 0.9136357307434082, "step": 1837 }, { "epoch": 5.779874213836478, "grad_norm": 0.12603263556957245, "learning_rate": 1.8358088462392057e-05, "loss": 0.1713, "mean_token_accuracy": 0.9129566550254822, "step": 1838 }, { "epoch": 5.783018867924528, "grad_norm": 0.12520869076251984, "learning_rate": 1.8340133444335894e-05, "loss": 0.1777, "mean_token_accuracy": 0.9123414158821106, "step": 1839 }, { "epoch": 5.786163522012579, "grad_norm": 0.1280491203069687, "learning_rate": 1.832218222411584e-05, "loss": 0.161, "mean_token_accuracy": 0.9205901026725769, "step": 1840 }, { "epoch": 5.789308176100629, "grad_norm": 0.12988995015621185, "learning_rate": 1.8304234820359852e-05, "loss": 0.1622, "mean_token_accuracy": 0.9184454679489136, "step": 1841 }, { "epoch": 5.7924528301886795, "grad_norm": 0.12439626455307007, "learning_rate": 1.8286291251691897e-05, "loss": 0.1705, "mean_token_accuracy": 0.9161950945854187, "step": 1842 }, { "epoch": 5.79559748427673, "grad_norm": 0.13772748410701752, "learning_rate": 1.826835153673199e-05, "loss": 0.1752, "mean_token_accuracy": 0.9136615991592407, "step": 1843 }, { "epoch": 5.79874213836478, "grad_norm": 0.1208910346031189, "learning_rate": 1.825041569409614e-05, "loss": 0.1845, "mean_token_accuracy": 0.9119904637336731, "step": 1844 }, { "epoch": 5.80188679245283, "grad_norm": 0.1381557136774063, "learning_rate": 1.8232483742396327e-05, "loss": 0.1734, "mean_token_accuracy": 0.9145846962928772, "step": 1845 }, { "epoch": 5.80503144654088, "grad_norm": 0.13252446055412292, "learning_rate": 1.8214555700240498e-05, "loss": 0.1712, "mean_token_accuracy": 0.9147006869316101, "step": 1846 }, { "epoch": 5.8081761006289305, "grad_norm": 0.1245056763291359, "learning_rate": 1.819663158623256e-05, "loss": 0.1695, "mean_token_accuracy": 0.9146434664726257, "step": 1847 }, { "epoch": 5.811320754716981, "grad_norm": 0.1266990602016449, "learning_rate": 1.817871141897232e-05, "loss": 0.1776, "mean_token_accuracy": 0.9120575785636902, "step": 1848 }, { "epoch": 5.814465408805032, "grad_norm": 0.12160525470972061, "learning_rate": 1.8160795217055496e-05, "loss": 0.1824, "mean_token_accuracy": 0.9122020602226257, "step": 1849 }, { "epoch": 5.817610062893082, "grad_norm": 0.127620130777359, "learning_rate": 1.8142882999073704e-05, "loss": 0.1719, "mean_token_accuracy": 0.915385901927948, "step": 1850 }, { "epoch": 5.820754716981132, "grad_norm": 0.11782664060592651, "learning_rate": 1.8124974783614414e-05, "loss": 0.1701, "mean_token_accuracy": 0.9131607413291931, "step": 1851 }, { "epoch": 5.823899371069182, "grad_norm": 0.13343071937561035, "learning_rate": 1.8107070589260943e-05, "loss": 0.1533, "mean_token_accuracy": 0.9223355650901794, "step": 1852 }, { "epoch": 5.827044025157233, "grad_norm": 0.11941219866275787, "learning_rate": 1.808917043459243e-05, "loss": 0.1745, "mean_token_accuracy": 0.9113269448280334, "step": 1853 }, { "epoch": 5.830188679245283, "grad_norm": 0.11961886286735535, "learning_rate": 1.807127433818384e-05, "loss": 0.1745, "mean_token_accuracy": 0.9124999642372131, "step": 1854 }, { "epoch": 5.833333333333333, "grad_norm": 0.12838539481163025, "learning_rate": 1.8053382318605907e-05, "loss": 0.1952, "mean_token_accuracy": 0.9072304964065552, "step": 1855 }, { "epoch": 5.836477987421384, "grad_norm": 0.11784368008375168, "learning_rate": 1.8035494394425158e-05, "loss": 0.1655, "mean_token_accuracy": 0.9166972041130066, "step": 1856 }, { "epoch": 5.839622641509434, "grad_norm": 0.12901045382022858, "learning_rate": 1.801761058420384e-05, "loss": 0.1855, "mean_token_accuracy": 0.9092808365821838, "step": 1857 }, { "epoch": 5.8427672955974845, "grad_norm": 0.11686600744724274, "learning_rate": 1.799973090649995e-05, "loss": 0.1754, "mean_token_accuracy": 0.9145554900169373, "step": 1858 }, { "epoch": 5.845911949685535, "grad_norm": 0.12752406299114227, "learning_rate": 1.7981855379867204e-05, "loss": 0.1793, "mean_token_accuracy": 0.913017213344574, "step": 1859 }, { "epoch": 5.849056603773585, "grad_norm": 0.12086496502161026, "learning_rate": 1.796398402285499e-05, "loss": 0.1873, "mean_token_accuracy": 0.9081063866615295, "step": 1860 }, { "epoch": 5.852201257861635, "grad_norm": 0.1289493292570114, "learning_rate": 1.7946116854008383e-05, "loss": 0.1715, "mean_token_accuracy": 0.9141929149627686, "step": 1861 }, { "epoch": 5.855345911949685, "grad_norm": 0.12896886467933655, "learning_rate": 1.7928253891868103e-05, "loss": 0.174, "mean_token_accuracy": 0.9146431684494019, "step": 1862 }, { "epoch": 5.8584905660377355, "grad_norm": 0.1219131350517273, "learning_rate": 1.7910395154970505e-05, "loss": 0.1889, "mean_token_accuracy": 0.9074649214744568, "step": 1863 }, { "epoch": 5.861635220125786, "grad_norm": 0.13309486210346222, "learning_rate": 1.789254066184758e-05, "loss": 0.1715, "mean_token_accuracy": 0.9138280749320984, "step": 1864 }, { "epoch": 5.864779874213837, "grad_norm": 0.11922544986009598, "learning_rate": 1.7874690431026887e-05, "loss": 0.1702, "mean_token_accuracy": 0.9120256304740906, "step": 1865 }, { "epoch": 5.867924528301887, "grad_norm": 0.12653562426567078, "learning_rate": 1.7856844481031585e-05, "loss": 0.1836, "mean_token_accuracy": 0.9097647666931152, "step": 1866 }, { "epoch": 5.871069182389937, "grad_norm": 0.12282025814056396, "learning_rate": 1.7839002830380366e-05, "loss": 0.1778, "mean_token_accuracy": 0.9133232831954956, "step": 1867 }, { "epoch": 5.8742138364779874, "grad_norm": 0.12195201963186264, "learning_rate": 1.7821165497587488e-05, "loss": 0.1622, "mean_token_accuracy": 0.9166160821914673, "step": 1868 }, { "epoch": 5.877358490566038, "grad_norm": 0.12466923892498016, "learning_rate": 1.780333250116271e-05, "loss": 0.1641, "mean_token_accuracy": 0.9177669286727905, "step": 1869 }, { "epoch": 5.880503144654088, "grad_norm": 0.13061249256134033, "learning_rate": 1.778550385961129e-05, "loss": 0.1789, "mean_token_accuracy": 0.913748562335968, "step": 1870 }, { "epoch": 5.883647798742138, "grad_norm": 0.12507785856723785, "learning_rate": 1.7767679591433984e-05, "loss": 0.1836, "mean_token_accuracy": 0.9113100171089172, "step": 1871 }, { "epoch": 5.886792452830189, "grad_norm": 0.13062918186187744, "learning_rate": 1.774985971512699e-05, "loss": 0.1872, "mean_token_accuracy": 0.9101560115814209, "step": 1872 }, { "epoch": 5.889937106918239, "grad_norm": 0.12669727206230164, "learning_rate": 1.773204424918196e-05, "loss": 0.1863, "mean_token_accuracy": 0.9074638485908508, "step": 1873 }, { "epoch": 5.8930817610062896, "grad_norm": 0.13299503922462463, "learning_rate": 1.771423321208597e-05, "loss": 0.1741, "mean_token_accuracy": 0.9125397205352783, "step": 1874 }, { "epoch": 5.89622641509434, "grad_norm": 0.13146135210990906, "learning_rate": 1.7696426622321503e-05, "loss": 0.1784, "mean_token_accuracy": 0.9100609421730042, "step": 1875 }, { "epoch": 5.89937106918239, "grad_norm": 0.13344912230968475, "learning_rate": 1.7678624498366406e-05, "loss": 0.1886, "mean_token_accuracy": 0.9081234931945801, "step": 1876 }, { "epoch": 5.90251572327044, "grad_norm": 0.13202603161334991, "learning_rate": 1.7660826858693914e-05, "loss": 0.1778, "mean_token_accuracy": 0.9113733768463135, "step": 1877 }, { "epoch": 5.90566037735849, "grad_norm": 0.12540169060230255, "learning_rate": 1.7643033721772608e-05, "loss": 0.1765, "mean_token_accuracy": 0.912645697593689, "step": 1878 }, { "epoch": 5.908805031446541, "grad_norm": 0.13628825545310974, "learning_rate": 1.7625245106066372e-05, "loss": 0.1614, "mean_token_accuracy": 0.917786717414856, "step": 1879 }, { "epoch": 5.911949685534591, "grad_norm": 0.12427868694067001, "learning_rate": 1.7607461030034437e-05, "loss": 0.1746, "mean_token_accuracy": 0.9142725467681885, "step": 1880 }, { "epoch": 5.915094339622642, "grad_norm": 0.13920536637306213, "learning_rate": 1.7589681512131295e-05, "loss": 0.1803, "mean_token_accuracy": 0.9117159247398376, "step": 1881 }, { "epoch": 5.918238993710692, "grad_norm": 0.13377030193805695, "learning_rate": 1.7571906570806718e-05, "loss": 0.184, "mean_token_accuracy": 0.9084151983261108, "step": 1882 }, { "epoch": 5.921383647798742, "grad_norm": 0.12509870529174805, "learning_rate": 1.755413622450572e-05, "loss": 0.1859, "mean_token_accuracy": 0.907892107963562, "step": 1883 }, { "epoch": 5.9245283018867925, "grad_norm": 0.13371077179908752, "learning_rate": 1.7536370491668556e-05, "loss": 0.175, "mean_token_accuracy": 0.9129257798194885, "step": 1884 }, { "epoch": 5.927672955974843, "grad_norm": 0.12641875445842743, "learning_rate": 1.7518609390730694e-05, "loss": 0.1706, "mean_token_accuracy": 0.9163611531257629, "step": 1885 }, { "epoch": 5.930817610062893, "grad_norm": 0.1306959092617035, "learning_rate": 1.7500852940122796e-05, "loss": 0.1617, "mean_token_accuracy": 0.9185019731521606, "step": 1886 }, { "epoch": 5.933962264150943, "grad_norm": 0.1335182934999466, "learning_rate": 1.7483101158270683e-05, "loss": 0.185, "mean_token_accuracy": 0.9097692966461182, "step": 1887 }, { "epoch": 5.937106918238994, "grad_norm": 0.12307420372962952, "learning_rate": 1.7465354063595354e-05, "loss": 0.1744, "mean_token_accuracy": 0.9118009209632874, "step": 1888 }, { "epoch": 5.940251572327044, "grad_norm": 0.13397091627120972, "learning_rate": 1.744761167451292e-05, "loss": 0.1726, "mean_token_accuracy": 0.9162708520889282, "step": 1889 }, { "epoch": 5.943396226415095, "grad_norm": 0.1265205293893814, "learning_rate": 1.7429874009434642e-05, "loss": 0.1685, "mean_token_accuracy": 0.9149485230445862, "step": 1890 }, { "epoch": 5.946540880503145, "grad_norm": 0.12802155315876007, "learning_rate": 1.741214108676684e-05, "loss": 0.1845, "mean_token_accuracy": 0.9122422337532043, "step": 1891 }, { "epoch": 5.949685534591195, "grad_norm": 0.12957172095775604, "learning_rate": 1.7394412924910946e-05, "loss": 0.1953, "mean_token_accuracy": 0.9067781567573547, "step": 1892 }, { "epoch": 5.952830188679245, "grad_norm": 0.1264718621969223, "learning_rate": 1.7376689542263424e-05, "loss": 0.1666, "mean_token_accuracy": 0.9175181984901428, "step": 1893 }, { "epoch": 5.955974842767295, "grad_norm": 0.12541256844997406, "learning_rate": 1.7358970957215805e-05, "loss": 0.1789, "mean_token_accuracy": 0.9094677567481995, "step": 1894 }, { "epoch": 5.959119496855346, "grad_norm": 0.12028246372938156, "learning_rate": 1.7341257188154625e-05, "loss": 0.1975, "mean_token_accuracy": 0.9086244106292725, "step": 1895 }, { "epoch": 5.962264150943396, "grad_norm": 0.13436433672904968, "learning_rate": 1.7323548253461425e-05, "loss": 0.1808, "mean_token_accuracy": 0.9111320972442627, "step": 1896 }, { "epoch": 5.965408805031447, "grad_norm": 0.11996529251337051, "learning_rate": 1.730584417151273e-05, "loss": 0.1758, "mean_token_accuracy": 0.9111454486846924, "step": 1897 }, { "epoch": 5.968553459119497, "grad_norm": 0.12315444648265839, "learning_rate": 1.728814496068003e-05, "loss": 0.1925, "mean_token_accuracy": 0.9099141955375671, "step": 1898 }, { "epoch": 5.971698113207547, "grad_norm": 0.12306390702724457, "learning_rate": 1.7270450639329762e-05, "loss": 0.1768, "mean_token_accuracy": 0.9125403761863708, "step": 1899 }, { "epoch": 5.9748427672955975, "grad_norm": 0.1273583173751831, "learning_rate": 1.725276122582329e-05, "loss": 0.1891, "mean_token_accuracy": 0.9100738167762756, "step": 1900 }, { "epoch": 5.977987421383648, "grad_norm": 0.12477988004684448, "learning_rate": 1.723507673851687e-05, "loss": 0.1697, "mean_token_accuracy": 0.9162830710411072, "step": 1901 }, { "epoch": 5.981132075471698, "grad_norm": 0.1281619518995285, "learning_rate": 1.721739719576167e-05, "loss": 0.1838, "mean_token_accuracy": 0.9113216996192932, "step": 1902 }, { "epoch": 5.984276729559748, "grad_norm": 0.12437441945075989, "learning_rate": 1.7199722615903722e-05, "loss": 0.1768, "mean_token_accuracy": 0.9131054878234863, "step": 1903 }, { "epoch": 5.987421383647799, "grad_norm": 0.12587052583694458, "learning_rate": 1.7182053017283884e-05, "loss": 0.1914, "mean_token_accuracy": 0.9063124656677246, "step": 1904 }, { "epoch": 5.990566037735849, "grad_norm": 0.1270790845155716, "learning_rate": 1.7164388418237878e-05, "loss": 0.1814, "mean_token_accuracy": 0.9116953015327454, "step": 1905 }, { "epoch": 5.9937106918239, "grad_norm": 0.11693906038999557, "learning_rate": 1.714672883709621e-05, "loss": 0.1712, "mean_token_accuracy": 0.9138330817222595, "step": 1906 }, { "epoch": 5.99685534591195, "grad_norm": 0.12291273474693298, "learning_rate": 1.71290742921842e-05, "loss": 0.1838, "mean_token_accuracy": 0.9125950932502747, "step": 1907 }, { "epoch": 6.0, "grad_norm": 0.12398134917020798, "learning_rate": 1.7111424801821933e-05, "loss": 0.1596, "mean_token_accuracy": 0.9150497913360596, "step": 1908 } ], "logging_steps": 1, "max_steps": 3180, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.92346119564768e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }