diff --git "a/checkpoint-85566/trainer_state.json" "b/checkpoint-85566/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-85566/trainer_state.json" @@ -0,0 +1,24050 @@ +{ + "best_metric": 1.3794080018997192, + "best_model_checkpoint": "address-large-text-classifier/checkpoint-85566", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 85566, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008765163733258538, + "grad_norm": 12.878114700317383, + "learning_rate": 1.4607923337618326e-07, + "loss": 2.1608, + "step": 25 + }, + { + "epoch": 0.0017530327466517075, + "grad_norm": 12.612255096435547, + "learning_rate": 2.921584667523665e-07, + "loss": 2.0494, + "step": 50 + }, + { + "epoch": 0.002629549119977561, + "grad_norm": 12.993833541870117, + "learning_rate": 4.3823770012854975e-07, + "loss": 2.1041, + "step": 75 + }, + { + "epoch": 0.003506065493303415, + "grad_norm": 11.023054122924805, + "learning_rate": 5.84316933504733e-07, + "loss": 2.0387, + "step": 100 + }, + { + "epoch": 0.004382581866629268, + "grad_norm": 21.660503387451172, + "learning_rate": 7.303961668809163e-07, + "loss": 2.1135, + "step": 125 + }, + { + "epoch": 0.005259098239955122, + "grad_norm": 13.2007417678833, + "learning_rate": 8.764754002570995e-07, + "loss": 2.0239, + "step": 150 + }, + { + "epoch": 0.0061356146132809765, + "grad_norm": 16.264389038085938, + "learning_rate": 1.0225546336332827e-06, + "loss": 2.0002, + "step": 175 + }, + { + "epoch": 0.00701213098660683, + "grad_norm": 12.02843952178955, + "learning_rate": 1.168633867009466e-06, + "loss": 1.9506, + "step": 200 + }, + { + "epoch": 0.007888647359932683, + "grad_norm": 14.192572593688965, + "learning_rate": 1.3147131003856492e-06, + "loss": 1.9952, + "step": 225 + }, + { + "epoch": 0.008765163733258537, + "grad_norm": 15.224964141845703, + "learning_rate": 1.4607923337618325e-06, + "loss": 1.9428, + "step": 250 + }, + { + "epoch": 0.00964168010658439, + "grad_norm": 12.202608108520508, + "learning_rate": 1.6068715671380159e-06, + "loss": 1.8183, + "step": 275 + }, + { + "epoch": 0.010518196479910244, + "grad_norm": 16.03495216369629, + "learning_rate": 1.752950800514199e-06, + "loss": 1.8691, + "step": 300 + }, + { + "epoch": 0.011394712853236098, + "grad_norm": 56.82136535644531, + "learning_rate": 1.8990300338903821e-06, + "loss": 1.7968, + "step": 325 + }, + { + "epoch": 0.012271229226561953, + "grad_norm": 13.250779151916504, + "learning_rate": 2.0451092672665655e-06, + "loss": 1.7046, + "step": 350 + }, + { + "epoch": 0.013147745599887807, + "grad_norm": 11.990614891052246, + "learning_rate": 2.1911885006427486e-06, + "loss": 1.5278, + "step": 375 + }, + { + "epoch": 0.01402426197321366, + "grad_norm": 14.53021240234375, + "learning_rate": 2.337267734018932e-06, + "loss": 1.3793, + "step": 400 + }, + { + "epoch": 0.014900778346539514, + "grad_norm": 12.418506622314453, + "learning_rate": 2.4833469673951153e-06, + "loss": 1.4004, + "step": 425 + }, + { + "epoch": 0.015777294719865366, + "grad_norm": 11.679211616516113, + "learning_rate": 2.6294262007712984e-06, + "loss": 1.3682, + "step": 450 + }, + { + "epoch": 0.01665381109319122, + "grad_norm": 44.40489959716797, + "learning_rate": 2.7755054341474815e-06, + "loss": 1.3745, + "step": 475 + }, + { + "epoch": 0.017530327466517073, + "grad_norm": 34.335205078125, + "learning_rate": 2.921584667523665e-06, + "loss": 1.2496, + "step": 500 + }, + { + "epoch": 0.018406843839842927, + "grad_norm": 
13.09535026550293, + "learning_rate": 3.067663900899848e-06, + "loss": 1.1458, + "step": 525 + }, + { + "epoch": 0.01928336021316878, + "grad_norm": 17.473642349243164, + "learning_rate": 3.2137431342760317e-06, + "loss": 1.2262, + "step": 550 + }, + { + "epoch": 0.020159876586494634, + "grad_norm": 27.161338806152344, + "learning_rate": 3.359822367652215e-06, + "loss": 1.0308, + "step": 575 + }, + { + "epoch": 0.021036392959820488, + "grad_norm": 28.020336151123047, + "learning_rate": 3.505901601028398e-06, + "loss": 0.9803, + "step": 600 + }, + { + "epoch": 0.02191290933314634, + "grad_norm": 10.548603057861328, + "learning_rate": 3.6519808344045816e-06, + "loss": 0.9064, + "step": 625 + }, + { + "epoch": 0.022789425706472195, + "grad_norm": 21.953628540039062, + "learning_rate": 3.7980600677807643e-06, + "loss": 1.0943, + "step": 650 + }, + { + "epoch": 0.02366594207979805, + "grad_norm": 31.458683013916016, + "learning_rate": 3.944139301156948e-06, + "loss": 0.9983, + "step": 675 + }, + { + "epoch": 0.024542458453123906, + "grad_norm": 108.02452087402344, + "learning_rate": 4.090218534533131e-06, + "loss": 0.6583, + "step": 700 + }, + { + "epoch": 0.02541897482644976, + "grad_norm": 21.401248931884766, + "learning_rate": 4.236297767909314e-06, + "loss": 0.8683, + "step": 725 + }, + { + "epoch": 0.026295491199775613, + "grad_norm": 17.436010360717773, + "learning_rate": 4.382377001285497e-06, + "loss": 1.1704, + "step": 750 + }, + { + "epoch": 0.027172007573101467, + "grad_norm": 14.214044570922852, + "learning_rate": 4.52845623466168e-06, + "loss": 0.6443, + "step": 775 + }, + { + "epoch": 0.02804852394642732, + "grad_norm": 14.878693580627441, + "learning_rate": 4.674535468037864e-06, + "loss": 0.7471, + "step": 800 + }, + { + "epoch": 0.028925040319753174, + "grad_norm": 6.159904956817627, + "learning_rate": 4.820614701414047e-06, + "loss": 0.559, + "step": 825 + }, + { + "epoch": 0.029801556693079028, + "grad_norm": 10.83056640625, + "learning_rate": 4.9666939347902305e-06, + "loss": 0.6893, + "step": 850 + }, + { + "epoch": 0.030678073066404882, + "grad_norm": 23.816082000732422, + "learning_rate": 5.112773168166414e-06, + "loss": 0.7054, + "step": 875 + }, + { + "epoch": 0.03155458943973073, + "grad_norm": 29.04193687438965, + "learning_rate": 5.258852401542597e-06, + "loss": 0.492, + "step": 900 + }, + { + "epoch": 0.03243110581305659, + "grad_norm": 14.674774169921875, + "learning_rate": 5.40493163491878e-06, + "loss": 0.5405, + "step": 925 + }, + { + "epoch": 0.03330762218638244, + "grad_norm": 113.9504165649414, + "learning_rate": 5.551010868294963e-06, + "loss": 0.6175, + "step": 950 + }, + { + "epoch": 0.034184138559708296, + "grad_norm": 116.74869537353516, + "learning_rate": 5.697090101671146e-06, + "loss": 0.5574, + "step": 975 + }, + { + "epoch": 0.03506065493303415, + "grad_norm": 1.3847389221191406, + "learning_rate": 5.84316933504733e-06, + "loss": 0.6411, + "step": 1000 + }, + { + "epoch": 0.035937171306360004, + "grad_norm": 31.721471786499023, + "learning_rate": 5.989248568423513e-06, + "loss": 0.4523, + "step": 1025 + }, + { + "epoch": 0.036813687679685854, + "grad_norm": 14.36369800567627, + "learning_rate": 6.135327801799696e-06, + "loss": 0.3155, + "step": 1050 + }, + { + "epoch": 0.03769020405301171, + "grad_norm": 12.580827713012695, + "learning_rate": 6.2814070351758795e-06, + "loss": 0.2696, + "step": 1075 + }, + { + "epoch": 0.03856672042633756, + "grad_norm": 7.630373001098633, + "learning_rate": 6.4274862685520635e-06, + "loss": 0.3367, + "step": 1100 + 
}, + { + "epoch": 0.03944323679966342, + "grad_norm": 30.650691986083984, + "learning_rate": 6.573565501928246e-06, + "loss": 0.3982, + "step": 1125 + }, + { + "epoch": 0.04031975317298927, + "grad_norm": 0.1827818900346756, + "learning_rate": 6.71964473530443e-06, + "loss": 0.2154, + "step": 1150 + }, + { + "epoch": 0.041196269546315126, + "grad_norm": 2.041903257369995, + "learning_rate": 6.865723968680612e-06, + "loss": 0.3515, + "step": 1175 + }, + { + "epoch": 0.042072785919640976, + "grad_norm": 0.4072812795639038, + "learning_rate": 7.011803202056796e-06, + "loss": 0.2194, + "step": 1200 + }, + { + "epoch": 0.04294930229296683, + "grad_norm": 5.408400535583496, + "learning_rate": 7.157882435432979e-06, + "loss": 0.3381, + "step": 1225 + }, + { + "epoch": 0.04382581866629268, + "grad_norm": 9.34211254119873, + "learning_rate": 7.303961668809163e-06, + "loss": 0.4332, + "step": 1250 + }, + { + "epoch": 0.04470233503961854, + "grad_norm": 16.89436149597168, + "learning_rate": 7.450040902185345e-06, + "loss": 0.3588, + "step": 1275 + }, + { + "epoch": 0.04557885141294439, + "grad_norm": 8.161200523376465, + "learning_rate": 7.5961201355615285e-06, + "loss": 0.4367, + "step": 1300 + }, + { + "epoch": 0.04645536778627025, + "grad_norm": 26.529033660888672, + "learning_rate": 7.742199368937712e-06, + "loss": 0.6968, + "step": 1325 + }, + { + "epoch": 0.0473318841595961, + "grad_norm": 0.21322710812091827, + "learning_rate": 7.888278602313896e-06, + "loss": 0.427, + "step": 1350 + }, + { + "epoch": 0.048208400532921955, + "grad_norm": 2.1418209075927734, + "learning_rate": 8.03435783569008e-06, + "loss": 0.3933, + "step": 1375 + }, + { + "epoch": 0.04908491690624781, + "grad_norm": 0.9025070071220398, + "learning_rate": 8.180437069066262e-06, + "loss": 0.1629, + "step": 1400 + }, + { + "epoch": 0.04996143327957366, + "grad_norm": 0.1392815113067627, + "learning_rate": 8.326516302442446e-06, + "loss": 0.5049, + "step": 1425 + }, + { + "epoch": 0.05083794965289952, + "grad_norm": 57.377811431884766, + "learning_rate": 8.472595535818628e-06, + "loss": 0.221, + "step": 1450 + }, + { + "epoch": 0.05171446602622537, + "grad_norm": 363.1225280761719, + "learning_rate": 8.618674769194812e-06, + "loss": 0.468, + "step": 1475 + }, + { + "epoch": 0.05259098239955123, + "grad_norm": 0.1880239099264145, + "learning_rate": 8.764754002570994e-06, + "loss": 0.3179, + "step": 1500 + }, + { + "epoch": 0.05346749877287708, + "grad_norm": 45.64191818237305, + "learning_rate": 8.910833235947178e-06, + "loss": 0.3446, + "step": 1525 + }, + { + "epoch": 0.054344015146202934, + "grad_norm": 2.242156982421875, + "learning_rate": 9.05691246932336e-06, + "loss": 0.4052, + "step": 1550 + }, + { + "epoch": 0.055220531519528784, + "grad_norm": 100.14468383789062, + "learning_rate": 9.202991702699545e-06, + "loss": 0.9977, + "step": 1575 + }, + { + "epoch": 0.05609704789285464, + "grad_norm": 0.5043649077415466, + "learning_rate": 9.349070936075729e-06, + "loss": 0.3164, + "step": 1600 + }, + { + "epoch": 0.05697356426618049, + "grad_norm": 155.52865600585938, + "learning_rate": 9.495150169451911e-06, + "loss": 0.5149, + "step": 1625 + }, + { + "epoch": 0.05785008063950635, + "grad_norm": 0.19047071039676666, + "learning_rate": 9.641229402828095e-06, + "loss": 0.2328, + "step": 1650 + }, + { + "epoch": 0.0587265970128322, + "grad_norm": 0.6376672983169556, + "learning_rate": 9.787308636204277e-06, + "loss": 0.189, + "step": 1675 + }, + { + "epoch": 0.059603113386158056, + "grad_norm": 0.06638780236244202, + 
"learning_rate": 9.933387869580461e-06, + "loss": 0.3371, + "step": 1700 + }, + { + "epoch": 0.060479629759483906, + "grad_norm": 0.11004418134689331, + "learning_rate": 1.0079467102956643e-05, + "loss": 0.3958, + "step": 1725 + }, + { + "epoch": 0.061356146132809763, + "grad_norm": 0.32208120822906494, + "learning_rate": 1.0225546336332827e-05, + "loss": 0.5098, + "step": 1750 + }, + { + "epoch": 0.062232662506135614, + "grad_norm": 0.03938484564423561, + "learning_rate": 1.0371625569709011e-05, + "loss": 0.3713, + "step": 1775 + }, + { + "epoch": 0.06310917887946146, + "grad_norm": 44.65238571166992, + "learning_rate": 1.0517704803085194e-05, + "loss": 1.1143, + "step": 1800 + }, + { + "epoch": 0.06398569525278733, + "grad_norm": 0.09295177459716797, + "learning_rate": 1.0663784036461378e-05, + "loss": 0.3617, + "step": 1825 + }, + { + "epoch": 0.06486221162611318, + "grad_norm": 72.09941864013672, + "learning_rate": 1.080986326983756e-05, + "loss": 0.2592, + "step": 1850 + }, + { + "epoch": 0.06573872799943903, + "grad_norm": 0.027828197926282883, + "learning_rate": 1.0955942503213744e-05, + "loss": 0.2495, + "step": 1875 + }, + { + "epoch": 0.06661524437276488, + "grad_norm": 0.037041958421468735, + "learning_rate": 1.1102021736589926e-05, + "loss": 0.494, + "step": 1900 + }, + { + "epoch": 0.06749176074609074, + "grad_norm": 0.4446689188480377, + "learning_rate": 1.124810096996611e-05, + "loss": 0.3709, + "step": 1925 + }, + { + "epoch": 0.06836827711941659, + "grad_norm": 0.37090006470680237, + "learning_rate": 1.1394180203342292e-05, + "loss": 0.7646, + "step": 1950 + }, + { + "epoch": 0.06924479349274244, + "grad_norm": 0.1937519907951355, + "learning_rate": 1.1540259436718476e-05, + "loss": 0.4352, + "step": 1975 + }, + { + "epoch": 0.0701213098660683, + "grad_norm": 0.07553679496049881, + "learning_rate": 1.168633867009466e-05, + "loss": 0.5113, + "step": 2000 + }, + { + "epoch": 0.07099782623939416, + "grad_norm": 0.15395177900791168, + "learning_rate": 1.1832417903470844e-05, + "loss": 0.4418, + "step": 2025 + }, + { + "epoch": 0.07187434261272001, + "grad_norm": 0.09189638495445251, + "learning_rate": 1.1978497136847027e-05, + "loss": 0.0641, + "step": 2050 + }, + { + "epoch": 0.07275085898604586, + "grad_norm": 0.29444265365600586, + "learning_rate": 1.2124576370223209e-05, + "loss": 0.3455, + "step": 2075 + }, + { + "epoch": 0.07362737535937171, + "grad_norm": 0.14757448434829712, + "learning_rate": 1.2270655603599393e-05, + "loss": 0.6109, + "step": 2100 + }, + { + "epoch": 0.07450389173269757, + "grad_norm": 0.23184889554977417, + "learning_rate": 1.2416734836975575e-05, + "loss": 0.2757, + "step": 2125 + }, + { + "epoch": 0.07538040810602342, + "grad_norm": 0.5588589906692505, + "learning_rate": 1.2562814070351759e-05, + "loss": 0.6408, + "step": 2150 + }, + { + "epoch": 0.07625692447934927, + "grad_norm": 0.4347357153892517, + "learning_rate": 1.2708893303727943e-05, + "loss": 0.6642, + "step": 2175 + }, + { + "epoch": 0.07713344085267512, + "grad_norm": 0.19923338294029236, + "learning_rate": 1.2854972537104127e-05, + "loss": 0.157, + "step": 2200 + }, + { + "epoch": 0.07800995722600099, + "grad_norm": 0.09818919748067856, + "learning_rate": 1.3001051770480311e-05, + "loss": 0.3833, + "step": 2225 + }, + { + "epoch": 0.07888647359932684, + "grad_norm": 5.523497581481934, + "learning_rate": 1.3147131003856492e-05, + "loss": 0.1142, + "step": 2250 + }, + { + "epoch": 0.07976298997265269, + "grad_norm": 0.02857324481010437, + "learning_rate": 1.3293210237232676e-05, + 
"loss": 0.0035, + "step": 2275 + }, + { + "epoch": 0.08063950634597854, + "grad_norm": 0.12375987321138382, + "learning_rate": 1.343928947060886e-05, + "loss": 0.394, + "step": 2300 + }, + { + "epoch": 0.0815160227193044, + "grad_norm": 0.08629830926656723, + "learning_rate": 1.358536870398504e-05, + "loss": 0.2562, + "step": 2325 + }, + { + "epoch": 0.08239253909263025, + "grad_norm": 0.44096246361732483, + "learning_rate": 1.3731447937361224e-05, + "loss": 0.3089, + "step": 2350 + }, + { + "epoch": 0.0832690554659561, + "grad_norm": 0.07389693707227707, + "learning_rate": 1.3877527170737408e-05, + "loss": 0.3708, + "step": 2375 + }, + { + "epoch": 0.08414557183928195, + "grad_norm": 0.19356150925159454, + "learning_rate": 1.4023606404113592e-05, + "loss": 0.3153, + "step": 2400 + }, + { + "epoch": 0.08502208821260782, + "grad_norm": 0.10705860704183578, + "learning_rate": 1.4169685637489774e-05, + "loss": 0.6273, + "step": 2425 + }, + { + "epoch": 0.08589860458593367, + "grad_norm": 417.7571105957031, + "learning_rate": 1.4315764870865958e-05, + "loss": 0.3638, + "step": 2450 + }, + { + "epoch": 0.08677512095925952, + "grad_norm": 0.16871854662895203, + "learning_rate": 1.4461844104242142e-05, + "loss": 0.6484, + "step": 2475 + }, + { + "epoch": 0.08765163733258537, + "grad_norm": 0.08975645154714584, + "learning_rate": 1.4607923337618326e-05, + "loss": 0.3232, + "step": 2500 + }, + { + "epoch": 0.08852815370591123, + "grad_norm": 19.39029884338379, + "learning_rate": 1.4754002570994507e-05, + "loss": 0.5381, + "step": 2525 + }, + { + "epoch": 0.08940467007923708, + "grad_norm": 0.10550019145011902, + "learning_rate": 1.490008180437069e-05, + "loss": 0.4452, + "step": 2550 + }, + { + "epoch": 0.09028118645256293, + "grad_norm": 0.16805946826934814, + "learning_rate": 1.5046161037746875e-05, + "loss": 0.4337, + "step": 2575 + }, + { + "epoch": 0.09115770282588878, + "grad_norm": 0.09682565182447433, + "learning_rate": 1.5192240271123057e-05, + "loss": 0.2296, + "step": 2600 + }, + { + "epoch": 0.09203421919921465, + "grad_norm": 0.03832285851240158, + "learning_rate": 1.533831950449924e-05, + "loss": 0.1878, + "step": 2625 + }, + { + "epoch": 0.0929107355725405, + "grad_norm": 2.188673973083496, + "learning_rate": 1.5484398737875423e-05, + "loss": 0.249, + "step": 2650 + }, + { + "epoch": 0.09378725194586635, + "grad_norm": 0.09451624006032944, + "learning_rate": 1.563047797125161e-05, + "loss": 0.1095, + "step": 2675 + }, + { + "epoch": 0.0946637683191922, + "grad_norm": 0.012161496095359325, + "learning_rate": 1.577655720462779e-05, + "loss": 0.2699, + "step": 2700 + }, + { + "epoch": 0.09554028469251806, + "grad_norm": 72.07886505126953, + "learning_rate": 1.5922636438003973e-05, + "loss": 0.6546, + "step": 2725 + }, + { + "epoch": 0.09641680106584391, + "grad_norm": 0.060827694833278656, + "learning_rate": 1.606871567138016e-05, + "loss": 0.2538, + "step": 2750 + }, + { + "epoch": 0.09729331743916976, + "grad_norm": 0.4787759780883789, + "learning_rate": 1.621479490475634e-05, + "loss": 0.4756, + "step": 2775 + }, + { + "epoch": 0.09816983381249562, + "grad_norm": 26.64658546447754, + "learning_rate": 1.6360874138132524e-05, + "loss": 0.3529, + "step": 2800 + }, + { + "epoch": 0.09904635018582147, + "grad_norm": 0.022708337754011154, + "learning_rate": 1.6506953371508706e-05, + "loss": 0.0554, + "step": 2825 + }, + { + "epoch": 0.09992286655914732, + "grad_norm": 0.12144271284341812, + "learning_rate": 1.665303260488489e-05, + "loss": 0.3251, + "step": 2850 + }, + { + "epoch": 
0.10079938293247317, + "grad_norm": 0.5231146216392517, + "learning_rate": 1.6799111838261074e-05, + "loss": 0.7144, + "step": 2875 + }, + { + "epoch": 0.10167589930579904, + "grad_norm": 0.10314569622278214, + "learning_rate": 1.6945191071637256e-05, + "loss": 0.2226, + "step": 2900 + }, + { + "epoch": 0.10255241567912489, + "grad_norm": 0.15697705745697021, + "learning_rate": 1.7091270305013442e-05, + "loss": 0.6895, + "step": 2925 + }, + { + "epoch": 0.10342893205245074, + "grad_norm": 16.26335906982422, + "learning_rate": 1.7237349538389624e-05, + "loss": 0.5585, + "step": 2950 + }, + { + "epoch": 0.10430544842577659, + "grad_norm": 0.11737757176160812, + "learning_rate": 1.7383428771765806e-05, + "loss": 0.2246, + "step": 2975 + }, + { + "epoch": 0.10518196479910245, + "grad_norm": 0.02039375528693199, + "learning_rate": 1.752950800514199e-05, + "loss": 0.2766, + "step": 3000 + }, + { + "epoch": 0.1060584811724283, + "grad_norm": 29.779972076416016, + "learning_rate": 1.7675587238518174e-05, + "loss": 0.7676, + "step": 3025 + }, + { + "epoch": 0.10693499754575415, + "grad_norm": 0.12667439877986908, + "learning_rate": 1.7821666471894357e-05, + "loss": 0.3338, + "step": 3050 + }, + { + "epoch": 0.10781151391908, + "grad_norm": 0.019363489001989365, + "learning_rate": 1.796774570527054e-05, + "loss": 0.3273, + "step": 3075 + }, + { + "epoch": 0.10868803029240587, + "grad_norm": 0.44442468881607056, + "learning_rate": 1.811382493864672e-05, + "loss": 0.3544, + "step": 3100 + }, + { + "epoch": 0.10956454666573172, + "grad_norm": 0.05315634608268738, + "learning_rate": 1.8259904172022907e-05, + "loss": 0.6018, + "step": 3125 + }, + { + "epoch": 0.11044106303905757, + "grad_norm": 0.24343058466911316, + "learning_rate": 1.840598340539909e-05, + "loss": 0.5002, + "step": 3150 + }, + { + "epoch": 0.11131757941238342, + "grad_norm": 0.7673718333244324, + "learning_rate": 1.855206263877527e-05, + "loss": 0.2582, + "step": 3175 + }, + { + "epoch": 0.11219409578570928, + "grad_norm": 56.92452621459961, + "learning_rate": 1.8698141872151457e-05, + "loss": 0.5158, + "step": 3200 + }, + { + "epoch": 0.11307061215903513, + "grad_norm": 0.008615897037088871, + "learning_rate": 1.884422110552764e-05, + "loss": 0.1509, + "step": 3225 + }, + { + "epoch": 0.11394712853236098, + "grad_norm": 0.16383527219295502, + "learning_rate": 1.8990300338903822e-05, + "loss": 0.9219, + "step": 3250 + }, + { + "epoch": 0.11482364490568683, + "grad_norm": 0.0674627274274826, + "learning_rate": 1.9136379572280004e-05, + "loss": 0.1951, + "step": 3275 + }, + { + "epoch": 0.1157001612790127, + "grad_norm": 27.751262664794922, + "learning_rate": 1.928245880565619e-05, + "loss": 0.2699, + "step": 3300 + }, + { + "epoch": 0.11657667765233855, + "grad_norm": 0.10257447510957718, + "learning_rate": 1.9428538039032372e-05, + "loss": 0.515, + "step": 3325 + }, + { + "epoch": 0.1174531940256644, + "grad_norm": 0.03210365027189255, + "learning_rate": 1.9574617272408554e-05, + "loss": 0.2885, + "step": 3350 + }, + { + "epoch": 0.11832971039899025, + "grad_norm": 0.050658296793699265, + "learning_rate": 1.972069650578474e-05, + "loss": 0.6961, + "step": 3375 + }, + { + "epoch": 0.11920622677231611, + "grad_norm": 0.17556484043598175, + "learning_rate": 1.9866775739160922e-05, + "loss": 0.3722, + "step": 3400 + }, + { + "epoch": 0.12008274314564196, + "grad_norm": 0.09483896195888519, + "learning_rate": 2.0012854972537104e-05, + "loss": 0.1024, + "step": 3425 + }, + { + "epoch": 0.12095925951896781, + "grad_norm": 17.167850494384766, 
+ "learning_rate": 2.0158934205913287e-05, + "loss": 0.491, + "step": 3450 + }, + { + "epoch": 0.12183577589229366, + "grad_norm": 0.15880537033081055, + "learning_rate": 2.0305013439289472e-05, + "loss": 0.6782, + "step": 3475 + }, + { + "epoch": 0.12271229226561953, + "grad_norm": 17.070470809936523, + "learning_rate": 2.0451092672665655e-05, + "loss": 0.2956, + "step": 3500 + }, + { + "epoch": 0.12358880863894538, + "grad_norm": 18.908693313598633, + "learning_rate": 2.0597171906041837e-05, + "loss": 0.3357, + "step": 3525 + }, + { + "epoch": 0.12446532501227123, + "grad_norm": 0.05319523438811302, + "learning_rate": 2.0743251139418023e-05, + "loss": 0.4775, + "step": 3550 + }, + { + "epoch": 0.1253418413855971, + "grad_norm": 0.04387371242046356, + "learning_rate": 2.0889330372794205e-05, + "loss": 0.1066, + "step": 3575 + }, + { + "epoch": 0.12621835775892293, + "grad_norm": 0.05877144634723663, + "learning_rate": 2.1035409606170387e-05, + "loss": 0.0019, + "step": 3600 + }, + { + "epoch": 0.1270948741322488, + "grad_norm": 0.09024593979120255, + "learning_rate": 2.118148883954657e-05, + "loss": 0.6718, + "step": 3625 + }, + { + "epoch": 0.12797139050557466, + "grad_norm": 0.5445433259010315, + "learning_rate": 2.1327568072922755e-05, + "loss": 0.3152, + "step": 3650 + }, + { + "epoch": 0.1288479068789005, + "grad_norm": 0.15765056014060974, + "learning_rate": 2.1473647306298937e-05, + "loss": 0.5393, + "step": 3675 + }, + { + "epoch": 0.12972442325222636, + "grad_norm": 46.40446090698242, + "learning_rate": 2.161972653967512e-05, + "loss": 0.5121, + "step": 3700 + }, + { + "epoch": 0.1306009396255522, + "grad_norm": 120.83280944824219, + "learning_rate": 2.1765805773051305e-05, + "loss": 0.4854, + "step": 3725 + }, + { + "epoch": 0.13147745599887806, + "grad_norm": 17.71187400817871, + "learning_rate": 2.1911885006427488e-05, + "loss": 0.4984, + "step": 3750 + }, + { + "epoch": 0.13235397237220392, + "grad_norm": 0.13994598388671875, + "learning_rate": 2.2057964239803673e-05, + "loss": 0.3366, + "step": 3775 + }, + { + "epoch": 0.13323048874552976, + "grad_norm": 0.0466713048517704, + "learning_rate": 2.2204043473179852e-05, + "loss": 0.9932, + "step": 3800 + }, + { + "epoch": 0.13410700511885562, + "grad_norm": 0.11757256835699081, + "learning_rate": 2.2350122706556038e-05, + "loss": 0.417, + "step": 3825 + }, + { + "epoch": 0.13498352149218149, + "grad_norm": 15.387404441833496, + "learning_rate": 2.249620193993222e-05, + "loss": 0.6467, + "step": 3850 + }, + { + "epoch": 0.13586003786550732, + "grad_norm": 24.66891098022461, + "learning_rate": 2.2642281173308402e-05, + "loss": 0.6267, + "step": 3875 + }, + { + "epoch": 0.13673655423883319, + "grad_norm": 0.0671074390411377, + "learning_rate": 2.2788360406684585e-05, + "loss": 0.3911, + "step": 3900 + }, + { + "epoch": 0.13761307061215902, + "grad_norm": 301.1547546386719, + "learning_rate": 2.293443964006077e-05, + "loss": 0.3945, + "step": 3925 + }, + { + "epoch": 0.13848958698548489, + "grad_norm": 0.017921900376677513, + "learning_rate": 2.3080518873436953e-05, + "loss": 0.6659, + "step": 3950 + }, + { + "epoch": 0.13936610335881075, + "grad_norm": 6.55811071395874, + "learning_rate": 2.3226598106813135e-05, + "loss": 0.2245, + "step": 3975 + }, + { + "epoch": 0.1402426197321366, + "grad_norm": 0.15009891986846924, + "learning_rate": 2.337267734018932e-05, + "loss": 0.4444, + "step": 4000 + }, + { + "epoch": 0.14111913610546245, + "grad_norm": 0.13055655360221863, + "learning_rate": 2.3518756573565503e-05, + "loss": 0.522, + 
"step": 4025 + }, + { + "epoch": 0.14199565247878831, + "grad_norm": 0.06490367650985718, + "learning_rate": 2.366483580694169e-05, + "loss": 0.353, + "step": 4050 + }, + { + "epoch": 0.14287216885211415, + "grad_norm": 0.06954118609428406, + "learning_rate": 2.3810915040317867e-05, + "loss": 0.2833, + "step": 4075 + }, + { + "epoch": 0.14374868522544001, + "grad_norm": 0.014703314751386642, + "learning_rate": 2.3956994273694053e-05, + "loss": 0.088, + "step": 4100 + }, + { + "epoch": 0.14462520159876585, + "grad_norm": 0.004070211201906204, + "learning_rate": 2.4103073507070235e-05, + "loss": 0.54, + "step": 4125 + }, + { + "epoch": 0.14550171797209172, + "grad_norm": 0.009033525362610817, + "learning_rate": 2.4249152740446418e-05, + "loss": 0.3313, + "step": 4150 + }, + { + "epoch": 0.14637823434541758, + "grad_norm": 0.0849425345659256, + "learning_rate": 2.4395231973822603e-05, + "loss": 0.2125, + "step": 4175 + }, + { + "epoch": 0.14725475071874342, + "grad_norm": 0.012493799440562725, + "learning_rate": 2.4541311207198786e-05, + "loss": 0.423, + "step": 4200 + }, + { + "epoch": 0.14813126709206928, + "grad_norm": 15.468573570251465, + "learning_rate": 2.468739044057497e-05, + "loss": 0.341, + "step": 4225 + }, + { + "epoch": 0.14900778346539514, + "grad_norm": 71.20480346679688, + "learning_rate": 2.483346967395115e-05, + "loss": 0.6763, + "step": 4250 + }, + { + "epoch": 0.14988429983872098, + "grad_norm": 0.20974421501159668, + "learning_rate": 2.4979548907327336e-05, + "loss": 0.4438, + "step": 4275 + }, + { + "epoch": 0.15076081621204684, + "grad_norm": 0.0029344165232032537, + "learning_rate": 2.5125628140703518e-05, + "loss": 0.2785, + "step": 4300 + }, + { + "epoch": 0.1516373325853727, + "grad_norm": 0.1142181009054184, + "learning_rate": 2.52717073740797e-05, + "loss": 0.5792, + "step": 4325 + }, + { + "epoch": 0.15251384895869854, + "grad_norm": 0.07386382669210434, + "learning_rate": 2.5417786607455886e-05, + "loss": 0.0998, + "step": 4350 + }, + { + "epoch": 0.1533903653320244, + "grad_norm": 0.8367793560028076, + "learning_rate": 2.556386584083207e-05, + "loss": 0.5127, + "step": 4375 + }, + { + "epoch": 0.15426688170535025, + "grad_norm": 0.17137207090854645, + "learning_rate": 2.5709945074208254e-05, + "loss": 0.6059, + "step": 4400 + }, + { + "epoch": 0.1551433980786761, + "grad_norm": 0.04565940052270889, + "learning_rate": 2.5856024307584436e-05, + "loss": 0.1877, + "step": 4425 + }, + { + "epoch": 0.15601991445200197, + "grad_norm": 0.012102818116545677, + "learning_rate": 2.6002103540960622e-05, + "loss": 0.1699, + "step": 4450 + }, + { + "epoch": 0.1568964308253278, + "grad_norm": 0.14147436618804932, + "learning_rate": 2.6148182774336797e-05, + "loss": 0.5187, + "step": 4475 + }, + { + "epoch": 0.15777294719865367, + "grad_norm": 0.05028606951236725, + "learning_rate": 2.6294262007712983e-05, + "loss": 0.7006, + "step": 4500 + }, + { + "epoch": 0.15864946357197954, + "grad_norm": 0.02696312591433525, + "learning_rate": 2.644034124108917e-05, + "loss": 0.3425, + "step": 4525 + }, + { + "epoch": 0.15952597994530537, + "grad_norm": 0.1011129766702652, + "learning_rate": 2.658642047446535e-05, + "loss": 0.4085, + "step": 4550 + }, + { + "epoch": 0.16040249631863124, + "grad_norm": 0.02831619419157505, + "learning_rate": 2.6732499707841537e-05, + "loss": 0.2622, + "step": 4575 + }, + { + "epoch": 0.16127901269195707, + "grad_norm": 0.0810491219162941, + "learning_rate": 2.687857894121772e-05, + "loss": 0.4499, + "step": 4600 + }, + { + "epoch": 0.16215552906528294, + 
"grad_norm": 0.3163711130619049, + "learning_rate": 2.7024658174593905e-05, + "loss": 0.614, + "step": 4625 + }, + { + "epoch": 0.1630320454386088, + "grad_norm": 29.76104164123535, + "learning_rate": 2.717073740797008e-05, + "loss": 0.8741, + "step": 4650 + }, + { + "epoch": 0.16390856181193464, + "grad_norm": 14.154927253723145, + "learning_rate": 2.7316816641346266e-05, + "loss": 0.5525, + "step": 4675 + }, + { + "epoch": 0.1647850781852605, + "grad_norm": 0.031115127727389336, + "learning_rate": 2.7462895874722448e-05, + "loss": 0.4604, + "step": 4700 + }, + { + "epoch": 0.16566159455858637, + "grad_norm": 0.028101066127419472, + "learning_rate": 2.7608975108098634e-05, + "loss": 0.5299, + "step": 4725 + }, + { + "epoch": 0.1665381109319122, + "grad_norm": 0.027432158589363098, + "learning_rate": 2.7755054341474816e-05, + "loss": 0.7012, + "step": 4750 + }, + { + "epoch": 0.16741462730523807, + "grad_norm": 0.11607489734888077, + "learning_rate": 2.7901133574851002e-05, + "loss": 0.3266, + "step": 4775 + }, + { + "epoch": 0.1682911436785639, + "grad_norm": 0.08162295818328857, + "learning_rate": 2.8047212808227184e-05, + "loss": 0.5074, + "step": 4800 + }, + { + "epoch": 0.16916766005188977, + "grad_norm": 0.03349420428276062, + "learning_rate": 2.819329204160337e-05, + "loss": 0.2219, + "step": 4825 + }, + { + "epoch": 0.17004417642521563, + "grad_norm": 0.0038992324844002724, + "learning_rate": 2.833937127497955e-05, + "loss": 0.2832, + "step": 4850 + }, + { + "epoch": 0.17092069279854147, + "grad_norm": 287.67901611328125, + "learning_rate": 2.848545050835573e-05, + "loss": 0.593, + "step": 4875 + }, + { + "epoch": 0.17179720917186733, + "grad_norm": 0.020233599469065666, + "learning_rate": 2.8631529741731917e-05, + "loss": 0.1723, + "step": 4900 + }, + { + "epoch": 0.1726737255451932, + "grad_norm": 1.8597439527511597, + "learning_rate": 2.87776089751081e-05, + "loss": 0.6537, + "step": 4925 + }, + { + "epoch": 0.17355024191851903, + "grad_norm": 0.17728838324546814, + "learning_rate": 2.8923688208484284e-05, + "loss": 0.1364, + "step": 4950 + }, + { + "epoch": 0.1744267582918449, + "grad_norm": 0.002202677307650447, + "learning_rate": 2.9069767441860467e-05, + "loss": 0.3779, + "step": 4975 + }, + { + "epoch": 0.17530327466517073, + "grad_norm": 173.83096313476562, + "learning_rate": 2.9215846675236652e-05, + "loss": 0.2673, + "step": 5000 + }, + { + "epoch": 0.1761797910384966, + "grad_norm": 0.14056913554668427, + "learning_rate": 2.936192590861283e-05, + "loss": 0.5104, + "step": 5025 + }, + { + "epoch": 0.17705630741182246, + "grad_norm": 0.013601308688521385, + "learning_rate": 2.9508005141989014e-05, + "loss": 0.5533, + "step": 5050 + }, + { + "epoch": 0.1779328237851483, + "grad_norm": 0.07416268438100815, + "learning_rate": 2.96540843753652e-05, + "loss": 0.1069, + "step": 5075 + }, + { + "epoch": 0.17880934015847416, + "grad_norm": 14.669050216674805, + "learning_rate": 2.980016360874138e-05, + "loss": 0.4707, + "step": 5100 + }, + { + "epoch": 0.17968585653180003, + "grad_norm": 0.0029322488699108362, + "learning_rate": 2.9946242842117567e-05, + "loss": 0.1281, + "step": 5125 + }, + { + "epoch": 0.18056237290512586, + "grad_norm": 0.006498234812170267, + "learning_rate": 3.009232207549375e-05, + "loss": 0.3118, + "step": 5150 + }, + { + "epoch": 0.18143888927845173, + "grad_norm": 0.006581272929906845, + "learning_rate": 3.0238401308869935e-05, + "loss": 0.1932, + "step": 5175 + }, + { + "epoch": 0.18231540565177756, + "grad_norm": 0.0066956402733922005, + 
"learning_rate": 3.0384480542246114e-05, + "loss": 0.4454, + "step": 5200 + }, + { + "epoch": 0.18319192202510343, + "grad_norm": 39.703582763671875, + "learning_rate": 3.05305597756223e-05, + "loss": 0.3908, + "step": 5225 + }, + { + "epoch": 0.1840684383984293, + "grad_norm": 20.502132415771484, + "learning_rate": 3.067663900899848e-05, + "loss": 0.6984, + "step": 5250 + }, + { + "epoch": 0.18494495477175513, + "grad_norm": 0.09604960680007935, + "learning_rate": 3.0822718242374664e-05, + "loss": 0.591, + "step": 5275 + }, + { + "epoch": 0.185821471145081, + "grad_norm": 0.4947805106639862, + "learning_rate": 3.0968797475750847e-05, + "loss": 0.3877, + "step": 5300 + }, + { + "epoch": 0.18669798751840685, + "grad_norm": 0.15477994084358215, + "learning_rate": 3.1114876709127036e-05, + "loss": 0.3866, + "step": 5325 + }, + { + "epoch": 0.1875745038917327, + "grad_norm": 0.00502210995182395, + "learning_rate": 3.126095594250322e-05, + "loss": 0.6835, + "step": 5350 + }, + { + "epoch": 0.18845102026505856, + "grad_norm": 0.1294722855091095, + "learning_rate": 3.14070351758794e-05, + "loss": 0.4384, + "step": 5375 + }, + { + "epoch": 0.1893275366383844, + "grad_norm": 0.053035613149404526, + "learning_rate": 3.155311440925558e-05, + "loss": 0.2003, + "step": 5400 + }, + { + "epoch": 0.19020405301171026, + "grad_norm": 0.003728720359504223, + "learning_rate": 3.1699193642631765e-05, + "loss": 0.2029, + "step": 5425 + }, + { + "epoch": 0.19108056938503612, + "grad_norm": 0.003917012829333544, + "learning_rate": 3.184527287600795e-05, + "loss": 0.4782, + "step": 5450 + }, + { + "epoch": 0.19195708575836196, + "grad_norm": 0.026103071868419647, + "learning_rate": 3.199135210938413e-05, + "loss": 0.2519, + "step": 5475 + }, + { + "epoch": 0.19283360213168782, + "grad_norm": 0.1464361548423767, + "learning_rate": 3.213743134276032e-05, + "loss": 0.5831, + "step": 5500 + }, + { + "epoch": 0.19371011850501368, + "grad_norm": 0.040101371705532074, + "learning_rate": 3.22835105761365e-05, + "loss": 0.3565, + "step": 5525 + }, + { + "epoch": 0.19458663487833952, + "grad_norm": 0.02921321429312229, + "learning_rate": 3.242958980951268e-05, + "loss": 0.2323, + "step": 5550 + }, + { + "epoch": 0.19546315125166538, + "grad_norm": 0.010292228311300278, + "learning_rate": 3.2575669042888865e-05, + "loss": 0.3112, + "step": 5575 + }, + { + "epoch": 0.19633966762499125, + "grad_norm": 0.09007540345191956, + "learning_rate": 3.272174827626505e-05, + "loss": 0.2464, + "step": 5600 + }, + { + "epoch": 0.19721618399831709, + "grad_norm": 0.09578502178192139, + "learning_rate": 3.286782750964123e-05, + "loss": 0.1395, + "step": 5625 + }, + { + "epoch": 0.19809270037164295, + "grad_norm": 0.3331282436847687, + "learning_rate": 3.301390674301741e-05, + "loss": 0.4548, + "step": 5650 + }, + { + "epoch": 0.19896921674496879, + "grad_norm": 0.06136519834399223, + "learning_rate": 3.31599859763936e-05, + "loss": 0.1022, + "step": 5675 + }, + { + "epoch": 0.19984573311829465, + "grad_norm": 0.07931138575077057, + "learning_rate": 3.330606520976978e-05, + "loss": 0.5391, + "step": 5700 + }, + { + "epoch": 0.2007222494916205, + "grad_norm": 0.17277351021766663, + "learning_rate": 3.3452144443145966e-05, + "loss": 0.6375, + "step": 5725 + }, + { + "epoch": 0.20159876586494635, + "grad_norm": 0.0823805183172226, + "learning_rate": 3.359822367652215e-05, + "loss": 0.0873, + "step": 5750 + }, + { + "epoch": 0.20247528223827221, + "grad_norm": 0.00291452812962234, + "learning_rate": 3.374430290989833e-05, + "loss": 0.2952, + 
"step": 5775 + }, + { + "epoch": 0.20335179861159808, + "grad_norm": 0.05693582817912102, + "learning_rate": 3.389038214327451e-05, + "loss": 0.1894, + "step": 5800 + }, + { + "epoch": 0.20422831498492391, + "grad_norm": 0.07621035724878311, + "learning_rate": 3.4036461376650695e-05, + "loss": 0.2173, + "step": 5825 + }, + { + "epoch": 0.20510483135824978, + "grad_norm": 0.6673757433891296, + "learning_rate": 3.4182540610026884e-05, + "loss": 0.6855, + "step": 5850 + }, + { + "epoch": 0.20598134773157561, + "grad_norm": 0.005976413376629353, + "learning_rate": 3.4328619843403066e-05, + "loss": 0.0784, + "step": 5875 + }, + { + "epoch": 0.20685786410490148, + "grad_norm": 0.0217595137655735, + "learning_rate": 3.447469907677925e-05, + "loss": 0.0166, + "step": 5900 + }, + { + "epoch": 0.20773438047822734, + "grad_norm": 0.09609726816415787, + "learning_rate": 3.462077831015543e-05, + "loss": 0.2975, + "step": 5925 + }, + { + "epoch": 0.20861089685155318, + "grad_norm": 0.01075949240475893, + "learning_rate": 3.476685754353161e-05, + "loss": 0.1547, + "step": 5950 + }, + { + "epoch": 0.20948741322487904, + "grad_norm": 0.06927796453237534, + "learning_rate": 3.4912936776907795e-05, + "loss": 0.76, + "step": 5975 + }, + { + "epoch": 0.2103639295982049, + "grad_norm": 10.869850158691406, + "learning_rate": 3.505901601028398e-05, + "loss": 0.3923, + "step": 6000 + }, + { + "epoch": 0.21124044597153074, + "grad_norm": 14.71922779083252, + "learning_rate": 3.5205095243660167e-05, + "loss": 0.6041, + "step": 6025 + }, + { + "epoch": 0.2121169623448566, + "grad_norm": 0.004070666618645191, + "learning_rate": 3.535117447703635e-05, + "loss": 0.1901, + "step": 6050 + }, + { + "epoch": 0.21299347871818244, + "grad_norm": 0.004725561942905188, + "learning_rate": 3.549725371041253e-05, + "loss": 0.2384, + "step": 6075 + }, + { + "epoch": 0.2138699950915083, + "grad_norm": 0.08699026703834534, + "learning_rate": 3.564333294378871e-05, + "loss": 0.3512, + "step": 6100 + }, + { + "epoch": 0.21474651146483417, + "grad_norm": 0.2809734046459198, + "learning_rate": 3.5789412177164896e-05, + "loss": 0.8897, + "step": 6125 + }, + { + "epoch": 0.21562302783816, + "grad_norm": 0.05807247385382652, + "learning_rate": 3.593549141054108e-05, + "loss": 0.5225, + "step": 6150 + }, + { + "epoch": 0.21649954421148587, + "grad_norm": 15.52560043334961, + "learning_rate": 3.608157064391726e-05, + "loss": 0.843, + "step": 6175 + }, + { + "epoch": 0.21737606058481174, + "grad_norm": 0.004217942710965872, + "learning_rate": 3.622764987729344e-05, + "loss": 0.242, + "step": 6200 + }, + { + "epoch": 0.21825257695813757, + "grad_norm": 0.054683223366737366, + "learning_rate": 3.637372911066963e-05, + "loss": 0.6405, + "step": 6225 + }, + { + "epoch": 0.21912909333146344, + "grad_norm": 0.009918780997395515, + "learning_rate": 3.6519808344045814e-05, + "loss": 0.1018, + "step": 6250 + }, + { + "epoch": 0.22000560970478927, + "grad_norm": 14.392415046691895, + "learning_rate": 3.6665887577421996e-05, + "loss": 0.8108, + "step": 6275 + }, + { + "epoch": 0.22088212607811514, + "grad_norm": 0.25934675335884094, + "learning_rate": 3.681196681079818e-05, + "loss": 0.21, + "step": 6300 + }, + { + "epoch": 0.221758642451441, + "grad_norm": 0.0038281118031591177, + "learning_rate": 3.695804604417436e-05, + "loss": 0.4515, + "step": 6325 + }, + { + "epoch": 0.22263515882476684, + "grad_norm": 29.352365493774414, + "learning_rate": 3.710412527755054e-05, + "loss": 0.3985, + "step": 6350 + }, + { + "epoch": 0.2235116751980927, + 
"grad_norm": 0.06942961364984512, + "learning_rate": 3.7250204510926725e-05, + "loss": 0.2052, + "step": 6375 + }, + { + "epoch": 0.22438819157141857, + "grad_norm": 14.561807632446289, + "learning_rate": 3.7396283744302914e-05, + "loss": 1.2276, + "step": 6400 + }, + { + "epoch": 0.2252647079447444, + "grad_norm": 0.0028103559743613005, + "learning_rate": 3.7542362977679097e-05, + "loss": 0.1659, + "step": 6425 + }, + { + "epoch": 0.22614122431807027, + "grad_norm": 0.1468304842710495, + "learning_rate": 3.768844221105528e-05, + "loss": 0.4441, + "step": 6450 + }, + { + "epoch": 0.2270177406913961, + "grad_norm": 0.22474709153175354, + "learning_rate": 3.783452144443146e-05, + "loss": 0.7848, + "step": 6475 + }, + { + "epoch": 0.22789425706472197, + "grad_norm": 0.16377055644989014, + "learning_rate": 3.7980600677807643e-05, + "loss": 0.6396, + "step": 6500 + }, + { + "epoch": 0.22877077343804783, + "grad_norm": 0.003603485180065036, + "learning_rate": 3.8126679911183826e-05, + "loss": 0.3002, + "step": 6525 + }, + { + "epoch": 0.22964728981137367, + "grad_norm": 0.2712433934211731, + "learning_rate": 3.827275914456001e-05, + "loss": 0.205, + "step": 6550 + }, + { + "epoch": 0.23052380618469953, + "grad_norm": 0.04516521841287613, + "learning_rate": 3.84188383779362e-05, + "loss": 0.2712, + "step": 6575 + }, + { + "epoch": 0.2314003225580254, + "grad_norm": 0.018658054992556572, + "learning_rate": 3.856491761131238e-05, + "loss": 0.5432, + "step": 6600 + }, + { + "epoch": 0.23227683893135123, + "grad_norm": 0.059260524809360504, + "learning_rate": 3.871099684468856e-05, + "loss": 0.4181, + "step": 6625 + }, + { + "epoch": 0.2331533553046771, + "grad_norm": 15.432456016540527, + "learning_rate": 3.8857076078064744e-05, + "loss": 1.0501, + "step": 6650 + }, + { + "epoch": 0.23402987167800293, + "grad_norm": 23.586023330688477, + "learning_rate": 3.9003155311440926e-05, + "loss": 1.0634, + "step": 6675 + }, + { + "epoch": 0.2349063880513288, + "grad_norm": 0.6980836391448975, + "learning_rate": 3.914923454481711e-05, + "loss": 1.1056, + "step": 6700 + }, + { + "epoch": 0.23578290442465466, + "grad_norm": 470.1695556640625, + "learning_rate": 3.929531377819329e-05, + "loss": 0.7509, + "step": 6725 + }, + { + "epoch": 0.2366594207979805, + "grad_norm": 0.10339067876338959, + "learning_rate": 3.944139301156948e-05, + "loss": 0.4353, + "step": 6750 + }, + { + "epoch": 0.23753593717130636, + "grad_norm": 0.22904811799526215, + "learning_rate": 3.958747224494566e-05, + "loss": 1.2039, + "step": 6775 + }, + { + "epoch": 0.23841245354463222, + "grad_norm": 0.27951598167419434, + "learning_rate": 3.9733551478321844e-05, + "loss": 0.4529, + "step": 6800 + }, + { + "epoch": 0.23928896991795806, + "grad_norm": 0.32310083508491516, + "learning_rate": 3.9879630711698027e-05, + "loss": 0.8744, + "step": 6825 + }, + { + "epoch": 0.24016548629128393, + "grad_norm": 0.06718556582927704, + "learning_rate": 4.002570994507421e-05, + "loss": 0.7638, + "step": 6850 + }, + { + "epoch": 0.24104200266460976, + "grad_norm": 0.044630344957113266, + "learning_rate": 4.017178917845039e-05, + "loss": 0.2469, + "step": 6875 + }, + { + "epoch": 0.24191851903793563, + "grad_norm": 0.22913067042827606, + "learning_rate": 4.0317868411826573e-05, + "loss": 0.5894, + "step": 6900 + }, + { + "epoch": 0.2427950354112615, + "grad_norm": 0.008846006356179714, + "learning_rate": 4.046394764520276e-05, + "loss": 0.5331, + "step": 6925 + }, + { + "epoch": 0.24367155178458733, + "grad_norm": 0.4238869249820709, + "learning_rate": 
4.0610026878578945e-05, + "loss": 0.5537, + "step": 6950 + }, + { + "epoch": 0.2445480681579132, + "grad_norm": 16.396738052368164, + "learning_rate": 4.075610611195513e-05, + "loss": 0.4453, + "step": 6975 + }, + { + "epoch": 0.24542458453123905, + "grad_norm": 0.0037842351011931896, + "learning_rate": 4.090218534533131e-05, + "loss": 1.1031, + "step": 7000 + }, + { + "epoch": 0.2463011009045649, + "grad_norm": 0.13203194737434387, + "learning_rate": 4.104826457870749e-05, + "loss": 0.5787, + "step": 7025 + }, + { + "epoch": 0.24717761727789075, + "grad_norm": 0.2224767655134201, + "learning_rate": 4.1194343812083674e-05, + "loss": 0.3844, + "step": 7050 + }, + { + "epoch": 0.24805413365121662, + "grad_norm": 13.80251407623291, + "learning_rate": 4.1340423045459856e-05, + "loss": 0.4636, + "step": 7075 + }, + { + "epoch": 0.24893065002454245, + "grad_norm": 0.004171015229076147, + "learning_rate": 4.1486502278836045e-05, + "loss": 0.3697, + "step": 7100 + }, + { + "epoch": 0.24980716639786832, + "grad_norm": 0.0030233901925385, + "learning_rate": 4.163258151221223e-05, + "loss": 0.365, + "step": 7125 + }, + { + "epoch": 0.2506836827711942, + "grad_norm": 0.15397627651691437, + "learning_rate": 4.177866074558841e-05, + "loss": 0.3929, + "step": 7150 + }, + { + "epoch": 0.25156019914452005, + "grad_norm": 0.13004937767982483, + "learning_rate": 4.192473997896459e-05, + "loss": 0.3873, + "step": 7175 + }, + { + "epoch": 0.25243671551784586, + "grad_norm": 0.004481049254536629, + "learning_rate": 4.2070819212340774e-05, + "loss": 0.5561, + "step": 7200 + }, + { + "epoch": 0.2533132318911717, + "grad_norm": 0.00627360912039876, + "learning_rate": 4.2216898445716957e-05, + "loss": 0.4077, + "step": 7225 + }, + { + "epoch": 0.2541897482644976, + "grad_norm": 0.06008846312761307, + "learning_rate": 4.236297767909314e-05, + "loss": 0.8886, + "step": 7250 + }, + { + "epoch": 0.25506626463782345, + "grad_norm": 0.016359636560082436, + "learning_rate": 4.250905691246933e-05, + "loss": 0.5864, + "step": 7275 + }, + { + "epoch": 0.2559427810111493, + "grad_norm": 14.299752235412598, + "learning_rate": 4.265513614584551e-05, + "loss": 0.3408, + "step": 7300 + }, + { + "epoch": 0.2568192973844751, + "grad_norm": 0.032054781913757324, + "learning_rate": 4.280121537922169e-05, + "loss": 0.4442, + "step": 7325 + }, + { + "epoch": 0.257695813757801, + "grad_norm": 0.12501658499240875, + "learning_rate": 4.2947294612597875e-05, + "loss": 0.0883, + "step": 7350 + }, + { + "epoch": 0.25857233013112685, + "grad_norm": 0.41116446256637573, + "learning_rate": 4.3093373845974064e-05, + "loss": 1.0948, + "step": 7375 + }, + { + "epoch": 0.2594488465044527, + "grad_norm": 0.4472516179084778, + "learning_rate": 4.323945307935024e-05, + "loss": 0.8658, + "step": 7400 + }, + { + "epoch": 0.2603253628777786, + "grad_norm": 15.141562461853027, + "learning_rate": 4.338553231272642e-05, + "loss": 1.0331, + "step": 7425 + }, + { + "epoch": 0.2612018792511044, + "grad_norm": 0.016509605571627617, + "learning_rate": 4.353161154610261e-05, + "loss": 0.6169, + "step": 7450 + }, + { + "epoch": 0.26207839562443025, + "grad_norm": 3171.77490234375, + "learning_rate": 4.367769077947879e-05, + "loss": 0.8888, + "step": 7475 + }, + { + "epoch": 0.2629549119977561, + "grad_norm": 57.50590896606445, + "learning_rate": 4.3823770012854975e-05, + "loss": 0.6352, + "step": 7500 + }, + { + "epoch": 0.263831428371082, + "grad_norm": 0.06253138184547424, + "learning_rate": 4.396984924623116e-05, + "loss": 0.4102, + "step": 7525 + }, + { + 
"epoch": 0.26470794474440784, + "grad_norm": 0.2149645835161209, + "learning_rate": 4.4115928479607347e-05, + "loss": 0.2617, + "step": 7550 + }, + { + "epoch": 0.2655844611177337, + "grad_norm": 0.20151516795158386, + "learning_rate": 4.426200771298352e-05, + "loss": 0.4983, + "step": 7575 + }, + { + "epoch": 0.2664609774910595, + "grad_norm": 0.004399659112095833, + "learning_rate": 4.4408086946359704e-05, + "loss": 0.8526, + "step": 7600 + }, + { + "epoch": 0.2673374938643854, + "grad_norm": 0.0034480225294828415, + "learning_rate": 4.4554166179735893e-05, + "loss": 0.4235, + "step": 7625 + }, + { + "epoch": 0.26821401023771124, + "grad_norm": 0.0543229877948761, + "learning_rate": 4.4700245413112076e-05, + "loss": 0.5732, + "step": 7650 + }, + { + "epoch": 0.2690905266110371, + "grad_norm": 0.12486666440963745, + "learning_rate": 4.484632464648826e-05, + "loss": 0.6661, + "step": 7675 + }, + { + "epoch": 0.26996704298436297, + "grad_norm": 0.2770999073982239, + "learning_rate": 4.499240387986444e-05, + "loss": 0.4755, + "step": 7700 + }, + { + "epoch": 0.2708435593576888, + "grad_norm": 0.09385648369789124, + "learning_rate": 4.513848311324063e-05, + "loss": 0.3906, + "step": 7725 + }, + { + "epoch": 0.27172007573101464, + "grad_norm": 1.1177512407302856, + "learning_rate": 4.5284562346616805e-05, + "loss": 0.6681, + "step": 7750 + }, + { + "epoch": 0.2725965921043405, + "grad_norm": 0.050830770283937454, + "learning_rate": 4.543064157999299e-05, + "loss": 0.2554, + "step": 7775 + }, + { + "epoch": 0.27347310847766637, + "grad_norm": 14.355769157409668, + "learning_rate": 4.557672081336917e-05, + "loss": 0.3053, + "step": 7800 + }, + { + "epoch": 0.27434962485099224, + "grad_norm": 0.01108743716031313, + "learning_rate": 4.572280004674536e-05, + "loss": 0.5017, + "step": 7825 + }, + { + "epoch": 0.27522614122431804, + "grad_norm": 0.010831678286194801, + "learning_rate": 4.586887928012154e-05, + "loss": 0.6782, + "step": 7850 + }, + { + "epoch": 0.2761026575976439, + "grad_norm": 0.22110404074192047, + "learning_rate": 4.601495851349772e-05, + "loss": 0.4039, + "step": 7875 + }, + { + "epoch": 0.27697917397096977, + "grad_norm": 0.0967605784535408, + "learning_rate": 4.6161037746873905e-05, + "loss": 0.3896, + "step": 7900 + }, + { + "epoch": 0.27785569034429564, + "grad_norm": 0.10672342032194138, + "learning_rate": 4.6307116980250094e-05, + "loss": 0.4189, + "step": 7925 + }, + { + "epoch": 0.2787322067176215, + "grad_norm": 0.02347092144191265, + "learning_rate": 4.645319621362627e-05, + "loss": 0.3819, + "step": 7950 + }, + { + "epoch": 0.27960872309094736, + "grad_norm": 10.138099670410156, + "learning_rate": 4.659927544700245e-05, + "loss": 0.5992, + "step": 7975 + }, + { + "epoch": 0.2804852394642732, + "grad_norm": 42.00944519042969, + "learning_rate": 4.674535468037864e-05, + "loss": 0.1839, + "step": 8000 + }, + { + "epoch": 0.28136175583759904, + "grad_norm": 0.17899727821350098, + "learning_rate": 4.6891433913754823e-05, + "loss": 0.4001, + "step": 8025 + }, + { + "epoch": 0.2822382722109249, + "grad_norm": 0.21548710763454437, + "learning_rate": 4.7037513147131006e-05, + "loss": 0.1303, + "step": 8050 + }, + { + "epoch": 0.28311478858425076, + "grad_norm": 0.003747751237824559, + "learning_rate": 4.718359238050719e-05, + "loss": 0.6089, + "step": 8075 + }, + { + "epoch": 0.28399130495757663, + "grad_norm": 0.09477001428604126, + "learning_rate": 4.732967161388338e-05, + "loss": 0.5175, + "step": 8100 + }, + { + "epoch": 0.28486782133090244, + "grad_norm": 
0.2736736238002777, + "learning_rate": 4.747575084725955e-05, + "loss": 0.1864, + "step": 8125 + }, + { + "epoch": 0.2857443377042283, + "grad_norm": 0.11316664516925812, + "learning_rate": 4.7621830080635735e-05, + "loss": 0.5197, + "step": 8150 + }, + { + "epoch": 0.28662085407755417, + "grad_norm": 0.09680013358592987, + "learning_rate": 4.7767909314011924e-05, + "loss": 0.2123, + "step": 8175 + }, + { + "epoch": 0.28749737045088003, + "grad_norm": 0.04082019254565239, + "learning_rate": 4.7913988547388106e-05, + "loss": 0.198, + "step": 8200 + }, + { + "epoch": 0.2883738868242059, + "grad_norm": 0.1372864544391632, + "learning_rate": 4.806006778076429e-05, + "loss": 0.5117, + "step": 8225 + }, + { + "epoch": 0.2892504031975317, + "grad_norm": 0.24542997777462006, + "learning_rate": 4.820614701414047e-05, + "loss": 0.5493, + "step": 8250 + }, + { + "epoch": 0.29012691957085757, + "grad_norm": 0.03176625445485115, + "learning_rate": 4.835222624751666e-05, + "loss": 0.1878, + "step": 8275 + }, + { + "epoch": 0.29100343594418343, + "grad_norm": 0.06692216545343399, + "learning_rate": 4.8498305480892835e-05, + "loss": 0.6638, + "step": 8300 + }, + { + "epoch": 0.2918799523175093, + "grad_norm": 0.05381672456860542, + "learning_rate": 4.864438471426902e-05, + "loss": 0.08, + "step": 8325 + }, + { + "epoch": 0.29275646869083516, + "grad_norm": 15.64413070678711, + "learning_rate": 4.879046394764521e-05, + "loss": 0.6041, + "step": 8350 + }, + { + "epoch": 0.293632985064161, + "grad_norm": 0.004016489256173372, + "learning_rate": 4.893654318102139e-05, + "loss": 0.439, + "step": 8375 + }, + { + "epoch": 0.29450950143748683, + "grad_norm": 0.16023534536361694, + "learning_rate": 4.908262241439757e-05, + "loss": 0.433, + "step": 8400 + }, + { + "epoch": 0.2953860178108127, + "grad_norm": 0.5470284819602966, + "learning_rate": 4.9228701647773753e-05, + "loss": 0.5011, + "step": 8425 + }, + { + "epoch": 0.29626253418413856, + "grad_norm": 0.17784595489501953, + "learning_rate": 4.937478088114994e-05, + "loss": 0.5434, + "step": 8450 + }, + { + "epoch": 0.2971390505574644, + "grad_norm": 0.07412736117839813, + "learning_rate": 4.952086011452612e-05, + "loss": 0.4718, + "step": 8475 + }, + { + "epoch": 0.2980155669307903, + "grad_norm": 0.149408757686615, + "learning_rate": 4.96669393479023e-05, + "loss": 0.2018, + "step": 8500 + }, + { + "epoch": 0.2988920833041161, + "grad_norm": 0.22742848098278046, + "learning_rate": 4.981301858127849e-05, + "loss": 0.2899, + "step": 8525 + }, + { + "epoch": 0.29976859967744196, + "grad_norm": 0.0059156399220228195, + "learning_rate": 4.995909781465467e-05, + "loss": 0.4637, + "step": 8550 + }, + { + "epoch": 0.3006451160507678, + "grad_norm": 14.756769180297852, + "learning_rate": 4.998831305431833e-05, + "loss": 0.5531, + "step": 8575 + }, + { + "epoch": 0.3015216324240937, + "grad_norm": 14.086288452148438, + "learning_rate": 4.9972081185316006e-05, + "loss": 1.0543, + "step": 8600 + }, + { + "epoch": 0.30239814879741955, + "grad_norm": 7.949361324310303, + "learning_rate": 4.995584931631368e-05, + "loss": 0.398, + "step": 8625 + }, + { + "epoch": 0.3032746651707454, + "grad_norm": 13.500972747802734, + "learning_rate": 4.9939617447311356e-05, + "loss": 0.9319, + "step": 8650 + }, + { + "epoch": 0.3041511815440712, + "grad_norm": 0.021229546517133713, + "learning_rate": 4.992338557830903e-05, + "loss": 0.3569, + "step": 8675 + }, + { + "epoch": 0.3050276979173971, + "grad_norm": 0.1833747923374176, + "learning_rate": 4.990715370930671e-05, + "loss": 1.0179, + 
"step": 8700 + }, + { + "epoch": 0.30590421429072295, + "grad_norm": 0.22282759845256805, + "learning_rate": 4.9890921840304386e-05, + "loss": 0.6904, + "step": 8725 + }, + { + "epoch": 0.3067807306640488, + "grad_norm": 15.032511711120605, + "learning_rate": 4.987468997130206e-05, + "loss": 0.9661, + "step": 8750 + }, + { + "epoch": 0.3076572470373747, + "grad_norm": 0.16836312413215637, + "learning_rate": 4.985845810229973e-05, + "loss": 0.4056, + "step": 8775 + }, + { + "epoch": 0.3085337634107005, + "grad_norm": 0.4255472719669342, + "learning_rate": 4.9842226233297403e-05, + "loss": 0.8192, + "step": 8800 + }, + { + "epoch": 0.30941027978402635, + "grad_norm": 2.820171356201172, + "learning_rate": 4.9825994364295085e-05, + "loss": 0.8818, + "step": 8825 + }, + { + "epoch": 0.3102867961573522, + "grad_norm": 0.10632987320423126, + "learning_rate": 4.980976249529276e-05, + "loss": 0.4209, + "step": 8850 + }, + { + "epoch": 0.3111633125306781, + "grad_norm": 0.20116712152957916, + "learning_rate": 4.9793530626290434e-05, + "loss": 0.5169, + "step": 8875 + }, + { + "epoch": 0.31203982890400395, + "grad_norm": 0.05979451537132263, + "learning_rate": 4.977729875728811e-05, + "loss": 0.4462, + "step": 8900 + }, + { + "epoch": 0.31291634527732975, + "grad_norm": 0.14861252903938293, + "learning_rate": 4.9761066888285783e-05, + "loss": 0.3627, + "step": 8925 + }, + { + "epoch": 0.3137928616506556, + "grad_norm": 0.013700807467103004, + "learning_rate": 4.9744835019283465e-05, + "loss": 0.4792, + "step": 8950 + }, + { + "epoch": 0.3146693780239815, + "grad_norm": 0.42998039722442627, + "learning_rate": 4.972860315028114e-05, + "loss": 0.3772, + "step": 8975 + }, + { + "epoch": 0.31554589439730735, + "grad_norm": 0.3277534246444702, + "learning_rate": 4.9712371281278814e-05, + "loss": 0.3519, + "step": 9000 + }, + { + "epoch": 0.3164224107706332, + "grad_norm": 0.26477983593940735, + "learning_rate": 4.969613941227649e-05, + "loss": 0.2633, + "step": 9025 + }, + { + "epoch": 0.3172989271439591, + "grad_norm": 0.16454839706420898, + "learning_rate": 4.967990754327417e-05, + "loss": 0.1113, + "step": 9050 + }, + { + "epoch": 0.3181754435172849, + "grad_norm": 0.1323496252298355, + "learning_rate": 4.9663675674271845e-05, + "loss": 0.6257, + "step": 9075 + }, + { + "epoch": 0.31905195989061075, + "grad_norm": 0.12275713682174683, + "learning_rate": 4.964744380526952e-05, + "loss": 0.4408, + "step": 9100 + }, + { + "epoch": 0.3199284762639366, + "grad_norm": 0.039267197251319885, + "learning_rate": 4.9631211936267194e-05, + "loss": 0.4538, + "step": 9125 + }, + { + "epoch": 0.3208049926372625, + "grad_norm": 1.3261841535568237, + "learning_rate": 4.961498006726487e-05, + "loss": 0.225, + "step": 9150 + }, + { + "epoch": 0.32168150901058834, + "grad_norm": 0.016822071745991707, + "learning_rate": 4.9598748198262543e-05, + "loss": 0.3073, + "step": 9175 + }, + { + "epoch": 0.32255802538391415, + "grad_norm": 0.034095246344804764, + "learning_rate": 4.958251632926022e-05, + "loss": 1.0642, + "step": 9200 + }, + { + "epoch": 0.32343454175724, + "grad_norm": 2.720649242401123, + "learning_rate": 4.956628446025789e-05, + "loss": 0.8852, + "step": 9225 + }, + { + "epoch": 0.3243110581305659, + "grad_norm": 13.608190536499023, + "learning_rate": 4.955005259125557e-05, + "loss": 1.1679, + "step": 9250 + }, + { + "epoch": 0.32518757450389174, + "grad_norm": 0.05963896960020065, + "learning_rate": 4.953382072225324e-05, + "loss": 1.3481, + "step": 9275 + }, + { + "epoch": 0.3260640908772176, + "grad_norm": 
59.45439910888672, + "learning_rate": 4.9517588853250923e-05, + "loss": 0.4824, + "step": 9300 + }, + { + "epoch": 0.3269406072505434, + "grad_norm": 10.563567161560059, + "learning_rate": 4.95013569842486e-05, + "loss": 1.1724, + "step": 9325 + }, + { + "epoch": 0.3278171236238693, + "grad_norm": 183.80873107910156, + "learning_rate": 4.948512511524627e-05, + "loss": 1.0275, + "step": 9350 + }, + { + "epoch": 0.32869363999719514, + "grad_norm": 0.7987326383590698, + "learning_rate": 4.946889324624395e-05, + "loss": 0.6941, + "step": 9375 + }, + { + "epoch": 0.329570156370521, + "grad_norm": 0.9053282141685486, + "learning_rate": 4.945266137724162e-05, + "loss": 0.4942, + "step": 9400 + }, + { + "epoch": 0.33044667274384687, + "grad_norm": 0.010590286925435066, + "learning_rate": 4.9436429508239304e-05, + "loss": 0.3814, + "step": 9425 + }, + { + "epoch": 0.33132318911717273, + "grad_norm": 0.13249900937080383, + "learning_rate": 4.942019763923698e-05, + "loss": 0.346, + "step": 9450 + }, + { + "epoch": 0.33219970549049854, + "grad_norm": 0.006732940208166838, + "learning_rate": 4.940396577023465e-05, + "loss": 0.5689, + "step": 9475 + }, + { + "epoch": 0.3330762218638244, + "grad_norm": 14.143925666809082, + "learning_rate": 4.938773390123233e-05, + "loss": 0.4879, + "step": 9500 + }, + { + "epoch": 0.33395273823715027, + "grad_norm": 49.23941421508789, + "learning_rate": 4.937150203223e-05, + "loss": 0.4262, + "step": 9525 + }, + { + "epoch": 0.33482925461047613, + "grad_norm": 0.02923794463276863, + "learning_rate": 4.935527016322768e-05, + "loss": 1.2098, + "step": 9550 + }, + { + "epoch": 0.335705770983802, + "grad_norm": 13.448921203613281, + "learning_rate": 4.933903829422535e-05, + "loss": 0.2749, + "step": 9575 + }, + { + "epoch": 0.3365822873571278, + "grad_norm": 0.05454031005501747, + "learning_rate": 4.9322806425223026e-05, + "loss": 0.363, + "step": 9600 + }, + { + "epoch": 0.33745880373045367, + "grad_norm": 0.6278654932975769, + "learning_rate": 4.93065745562207e-05, + "loss": 0.512, + "step": 9625 + }, + { + "epoch": 0.33833532010377954, + "grad_norm": 0.02888045273721218, + "learning_rate": 4.9290342687218375e-05, + "loss": 0.229, + "step": 9650 + }, + { + "epoch": 0.3392118364771054, + "grad_norm": 0.02433536760509014, + "learning_rate": 4.927411081821606e-05, + "loss": 0.2321, + "step": 9675 + }, + { + "epoch": 0.34008835285043126, + "grad_norm": 0.009655815549194813, + "learning_rate": 4.925787894921373e-05, + "loss": 0.4536, + "step": 9700 + }, + { + "epoch": 0.34096486922375707, + "grad_norm": 14.470565795898438, + "learning_rate": 4.9241647080211406e-05, + "loss": 0.5404, + "step": 9725 + }, + { + "epoch": 0.34184138559708294, + "grad_norm": 13.976292610168457, + "learning_rate": 4.922541521120908e-05, + "loss": 0.6249, + "step": 9750 + }, + { + "epoch": 0.3427179019704088, + "grad_norm": 0.034647274762392044, + "learning_rate": 4.9209183342206755e-05, + "loss": 0.4655, + "step": 9775 + }, + { + "epoch": 0.34359441834373466, + "grad_norm": 51.79508590698242, + "learning_rate": 4.919295147320444e-05, + "loss": 0.3667, + "step": 9800 + }, + { + "epoch": 0.34447093471706053, + "grad_norm": 1.3426275253295898, + "learning_rate": 4.917671960420211e-05, + "loss": 0.5674, + "step": 9825 + }, + { + "epoch": 0.3453474510903864, + "grad_norm": 0.007003598380833864, + "learning_rate": 4.9160487735199786e-05, + "loss": 0.1587, + "step": 9850 + }, + { + "epoch": 0.3462239674637122, + "grad_norm": 14.689213752746582, + "learning_rate": 4.914425586619746e-05, + "loss": 0.4786, + 
"step": 9875 + }, + { + "epoch": 0.34710048383703807, + "grad_norm": 0.055759359151124954, + "learning_rate": 4.9128023997195135e-05, + "loss": 0.5082, + "step": 9900 + }, + { + "epoch": 0.34797700021036393, + "grad_norm": 0.004565467592328787, + "learning_rate": 4.911179212819281e-05, + "loss": 0.1805, + "step": 9925 + }, + { + "epoch": 0.3488535165836898, + "grad_norm": 0.004476895555853844, + "learning_rate": 4.9095560259190485e-05, + "loss": 0.9619, + "step": 9950 + }, + { + "epoch": 0.34973003295701566, + "grad_norm": 0.8587487936019897, + "learning_rate": 4.907932839018816e-05, + "loss": 0.1705, + "step": 9975 + }, + { + "epoch": 0.35060654933034147, + "grad_norm": 14.803383827209473, + "learning_rate": 4.9063096521185834e-05, + "loss": 0.5109, + "step": 10000 + }, + { + "epoch": 0.35148306570366733, + "grad_norm": 0.10086461156606674, + "learning_rate": 4.904686465218351e-05, + "loss": 0.4388, + "step": 10025 + }, + { + "epoch": 0.3523595820769932, + "grad_norm": 14.824682235717773, + "learning_rate": 4.903063278318119e-05, + "loss": 0.8135, + "step": 10050 + }, + { + "epoch": 0.35323609845031906, + "grad_norm": 0.21500328183174133, + "learning_rate": 4.9014400914178865e-05, + "loss": 0.6007, + "step": 10075 + }, + { + "epoch": 0.3541126148236449, + "grad_norm": 15.288132667541504, + "learning_rate": 4.899816904517654e-05, + "loss": 0.3193, + "step": 10100 + }, + { + "epoch": 0.3549891311969708, + "grad_norm": 0.030897876247763634, + "learning_rate": 4.8981937176174214e-05, + "loss": 0.5058, + "step": 10125 + }, + { + "epoch": 0.3558656475702966, + "grad_norm": 14.464731216430664, + "learning_rate": 4.8965705307171895e-05, + "loss": 0.3201, + "step": 10150 + }, + { + "epoch": 0.35674216394362246, + "grad_norm": 0.010546306148171425, + "learning_rate": 4.894947343816957e-05, + "loss": 0.6666, + "step": 10175 + }, + { + "epoch": 0.3576186803169483, + "grad_norm": 0.05457659810781479, + "learning_rate": 4.8933241569167245e-05, + "loss": 0.4809, + "step": 10200 + }, + { + "epoch": 0.3584951966902742, + "grad_norm": 0.07991427928209305, + "learning_rate": 4.891700970016492e-05, + "loss": 0.2861, + "step": 10225 + }, + { + "epoch": 0.35937171306360005, + "grad_norm": 0.009358255192637444, + "learning_rate": 4.8900777831162594e-05, + "loss": 0.1841, + "step": 10250 + }, + { + "epoch": 0.36024822943692586, + "grad_norm": 0.0575709193944931, + "learning_rate": 4.888454596216027e-05, + "loss": 0.4354, + "step": 10275 + }, + { + "epoch": 0.3611247458102517, + "grad_norm": 0.006699188146740198, + "learning_rate": 4.886831409315794e-05, + "loss": 0.3501, + "step": 10300 + }, + { + "epoch": 0.3620012621835776, + "grad_norm": 0.1408693492412567, + "learning_rate": 4.885208222415562e-05, + "loss": 0.6427, + "step": 10325 + }, + { + "epoch": 0.36287777855690345, + "grad_norm": 0.010215718299150467, + "learning_rate": 4.883585035515329e-05, + "loss": 0.3052, + "step": 10350 + }, + { + "epoch": 0.3637542949302293, + "grad_norm": 0.3137216567993164, + "learning_rate": 4.881961848615097e-05, + "loss": 0.1702, + "step": 10375 + }, + { + "epoch": 0.3646308113035551, + "grad_norm": 0.03956151381134987, + "learning_rate": 4.880338661714865e-05, + "loss": 0.0058, + "step": 10400 + }, + { + "epoch": 0.365507327676881, + "grad_norm": 0.006265764124691486, + "learning_rate": 4.878715474814632e-05, + "loss": 0.4848, + "step": 10425 + }, + { + "epoch": 0.36638384405020685, + "grad_norm": 14.88607120513916, + "learning_rate": 4.8770922879144e-05, + "loss": 0.679, + "step": 10450 + }, + { + "epoch": 
0.3672603604235327, + "grad_norm": 0.44252628087997437, + "learning_rate": 4.875469101014167e-05, + "loss": 0.8053, + "step": 10475 + }, + { + "epoch": 0.3681368767968586, + "grad_norm": 1791.34814453125, + "learning_rate": 4.873845914113935e-05, + "loss": 0.1936, + "step": 10500 + }, + { + "epoch": 0.36901339317018444, + "grad_norm": 0.018540937453508377, + "learning_rate": 4.872222727213703e-05, + "loss": 0.2979, + "step": 10525 + }, + { + "epoch": 0.36988990954351025, + "grad_norm": 0.18954648077487946, + "learning_rate": 4.87059954031347e-05, + "loss": 0.3048, + "step": 10550 + }, + { + "epoch": 0.3707664259168361, + "grad_norm": 0.20199595391750336, + "learning_rate": 4.868976353413238e-05, + "loss": 0.3595, + "step": 10575 + }, + { + "epoch": 0.371642942290162, + "grad_norm": 16.719966888427734, + "learning_rate": 4.867353166513005e-05, + "loss": 0.7209, + "step": 10600 + }, + { + "epoch": 0.37251945866348785, + "grad_norm": 0.040200598537921906, + "learning_rate": 4.865729979612773e-05, + "loss": 0.4157, + "step": 10625 + }, + { + "epoch": 0.3733959750368137, + "grad_norm": 0.25922831892967224, + "learning_rate": 4.864106792712541e-05, + "loss": 0.6697, + "step": 10650 + }, + { + "epoch": 0.3742724914101395, + "grad_norm": 0.00827852264046669, + "learning_rate": 4.862483605812308e-05, + "loss": 0.5287, + "step": 10675 + }, + { + "epoch": 0.3751490077834654, + "grad_norm": 0.3032333254814148, + "learning_rate": 4.860860418912075e-05, + "loss": 0.4538, + "step": 10700 + }, + { + "epoch": 0.37602552415679125, + "grad_norm": 0.0896478071808815, + "learning_rate": 4.8592372320118426e-05, + "loss": 0.7145, + "step": 10725 + }, + { + "epoch": 0.3769020405301171, + "grad_norm": 0.3189281225204468, + "learning_rate": 4.85761404511161e-05, + "loss": 0.7671, + "step": 10750 + }, + { + "epoch": 0.377778556903443, + "grad_norm": 0.17828913033008575, + "learning_rate": 4.855990858211378e-05, + "loss": 0.3848, + "step": 10775 + }, + { + "epoch": 0.3786550732767688, + "grad_norm": 1.2941590547561646, + "learning_rate": 4.8543676713111456e-05, + "loss": 0.3306, + "step": 10800 + }, + { + "epoch": 0.37953158965009465, + "grad_norm": 0.11756742745637894, + "learning_rate": 4.852744484410913e-05, + "loss": 0.6647, + "step": 10825 + }, + { + "epoch": 0.3804081060234205, + "grad_norm": 0.7507511377334595, + "learning_rate": 4.8511212975106806e-05, + "loss": 0.683, + "step": 10850 + }, + { + "epoch": 0.3812846223967464, + "grad_norm": 0.015148717910051346, + "learning_rate": 4.849498110610448e-05, + "loss": 0.5559, + "step": 10875 + }, + { + "epoch": 0.38216113877007224, + "grad_norm": 0.2398933470249176, + "learning_rate": 4.847874923710216e-05, + "loss": 0.2771, + "step": 10900 + }, + { + "epoch": 0.3830376551433981, + "grad_norm": 0.16130314767360687, + "learning_rate": 4.8462517368099836e-05, + "loss": 0.2146, + "step": 10925 + }, + { + "epoch": 0.3839141715167239, + "grad_norm": 0.04635791853070259, + "learning_rate": 4.844628549909751e-05, + "loss": 0.3736, + "step": 10950 + }, + { + "epoch": 0.3847906878900498, + "grad_norm": 0.06686024367809296, + "learning_rate": 4.8430053630095186e-05, + "loss": 0.0994, + "step": 10975 + }, + { + "epoch": 0.38566720426337564, + "grad_norm": 15.290380477905273, + "learning_rate": 4.841382176109286e-05, + "loss": 0.4305, + "step": 11000 + }, + { + "epoch": 0.3865437206367015, + "grad_norm": 0.08941290527582169, + "learning_rate": 4.839758989209054e-05, + "loss": 0.2198, + "step": 11025 + }, + { + "epoch": 0.38742023701002737, + "grad_norm": 0.14174267649650574, + 
"learning_rate": 4.8381358023088216e-05, + "loss": 0.72, + "step": 11050 + }, + { + "epoch": 0.3882967533833532, + "grad_norm": 0.005024300422519445, + "learning_rate": 4.836512615408589e-05, + "loss": 0.3562, + "step": 11075 + }, + { + "epoch": 0.38917326975667904, + "grad_norm": 0.1760040521621704, + "learning_rate": 4.8348894285083566e-05, + "loss": 0.1132, + "step": 11100 + }, + { + "epoch": 0.3900497861300049, + "grad_norm": 0.03685943782329559, + "learning_rate": 4.8332662416081234e-05, + "loss": 0.1023, + "step": 11125 + }, + { + "epoch": 0.39092630250333077, + "grad_norm": 0.1700683832168579, + "learning_rate": 4.8316430547078915e-05, + "loss": 0.2582, + "step": 11150 + }, + { + "epoch": 0.39180281887665663, + "grad_norm": 0.2792922258377075, + "learning_rate": 4.830019867807659e-05, + "loss": 0.6008, + "step": 11175 + }, + { + "epoch": 0.3926793352499825, + "grad_norm": 0.1941952258348465, + "learning_rate": 4.8283966809074264e-05, + "loss": 0.1824, + "step": 11200 + }, + { + "epoch": 0.3935558516233083, + "grad_norm": 0.1627628356218338, + "learning_rate": 4.826773494007194e-05, + "loss": 0.2127, + "step": 11225 + }, + { + "epoch": 0.39443236799663417, + "grad_norm": 0.21417059004306793, + "learning_rate": 4.825150307106962e-05, + "loss": 0.7309, + "step": 11250 + }, + { + "epoch": 0.39530888436996003, + "grad_norm": 0.004717429168522358, + "learning_rate": 4.8235271202067295e-05, + "loss": 0.2734, + "step": 11275 + }, + { + "epoch": 0.3961854007432859, + "grad_norm": 0.15284979343414307, + "learning_rate": 4.821903933306497e-05, + "loss": 0.4629, + "step": 11300 + }, + { + "epoch": 0.39706191711661176, + "grad_norm": 0.11644934862852097, + "learning_rate": 4.8202807464062644e-05, + "loss": 0.4505, + "step": 11325 + }, + { + "epoch": 0.39793843348993757, + "grad_norm": 0.0647820308804512, + "learning_rate": 4.818657559506032e-05, + "loss": 0.2245, + "step": 11350 + }, + { + "epoch": 0.39881494986326343, + "grad_norm": 0.3612101376056671, + "learning_rate": 4.8170343726058e-05, + "loss": 1.0704, + "step": 11375 + }, + { + "epoch": 0.3996914662365893, + "grad_norm": 0.11998118460178375, + "learning_rate": 4.8154111857055675e-05, + "loss": 0.1631, + "step": 11400 + }, + { + "epoch": 0.40056798260991516, + "grad_norm": 0.003246456617489457, + "learning_rate": 4.813787998805335e-05, + "loss": 0.2056, + "step": 11425 + }, + { + "epoch": 0.401444498983241, + "grad_norm": 0.0031909309327602386, + "learning_rate": 4.8121648119051024e-05, + "loss": 0.4595, + "step": 11450 + }, + { + "epoch": 0.40232101535656684, + "grad_norm": 0.13521301746368408, + "learning_rate": 4.81054162500487e-05, + "loss": 0.4138, + "step": 11475 + }, + { + "epoch": 0.4031975317298927, + "grad_norm": 0.0632961094379425, + "learning_rate": 4.8089184381046374e-05, + "loss": 0.2016, + "step": 11500 + }, + { + "epoch": 0.40407404810321856, + "grad_norm": 0.005184983368963003, + "learning_rate": 4.807295251204405e-05, + "loss": 0.2797, + "step": 11525 + }, + { + "epoch": 0.40495056447654443, + "grad_norm": 0.45584532618522644, + "learning_rate": 4.805672064304172e-05, + "loss": 0.7037, + "step": 11550 + }, + { + "epoch": 0.4058270808498703, + "grad_norm": 18.1007137298584, + "learning_rate": 4.80404887740394e-05, + "loss": 2.1175, + "step": 11575 + }, + { + "epoch": 0.40670359722319616, + "grad_norm": 0.6630319356918335, + "learning_rate": 4.802425690503707e-05, + "loss": 0.8401, + "step": 11600 + }, + { + "epoch": 0.40758011359652196, + "grad_norm": 0.025692567229270935, + "learning_rate": 4.8008025036034754e-05, + 
"loss": 0.9184, + "step": 11625 + }, + { + "epoch": 0.40845662996984783, + "grad_norm": 0.3357422649860382, + "learning_rate": 4.799179316703243e-05, + "loss": 1.8606, + "step": 11650 + }, + { + "epoch": 0.4093331463431737, + "grad_norm": 17.335060119628906, + "learning_rate": 4.79755612980301e-05, + "loss": 2.4668, + "step": 11675 + }, + { + "epoch": 0.41020966271649956, + "grad_norm": 12.593079566955566, + "learning_rate": 4.795932942902778e-05, + "loss": 1.9899, + "step": 11700 + }, + { + "epoch": 0.4110861790898254, + "grad_norm": 5.3870134353637695, + "learning_rate": 4.794309756002545e-05, + "loss": 1.2847, + "step": 11725 + }, + { + "epoch": 0.41196269546315123, + "grad_norm": 4.067342758178711, + "learning_rate": 4.7926865691023134e-05, + "loss": 1.1512, + "step": 11750 + }, + { + "epoch": 0.4128392118364771, + "grad_norm": 4.22260046005249, + "learning_rate": 4.791063382202081e-05, + "loss": 0.8984, + "step": 11775 + }, + { + "epoch": 0.41371572820980296, + "grad_norm": 16.192333221435547, + "learning_rate": 4.789440195301848e-05, + "loss": 0.894, + "step": 11800 + }, + { + "epoch": 0.4145922445831288, + "grad_norm": 6.8901166915893555, + "learning_rate": 4.787817008401616e-05, + "loss": 0.9018, + "step": 11825 + }, + { + "epoch": 0.4154687609564547, + "grad_norm": 5.604538917541504, + "learning_rate": 4.786193821501383e-05, + "loss": 1.0841, + "step": 11850 + }, + { + "epoch": 0.4163452773297805, + "grad_norm": 9.760684967041016, + "learning_rate": 4.784570634601151e-05, + "loss": 0.7487, + "step": 11875 + }, + { + "epoch": 0.41722179370310636, + "grad_norm": 0.04392727091908455, + "learning_rate": 4.782947447700918e-05, + "loss": 0.896, + "step": 11900 + }, + { + "epoch": 0.4180983100764322, + "grad_norm": 13.067900657653809, + "learning_rate": 4.7813242608006856e-05, + "loss": 1.0921, + "step": 11925 + }, + { + "epoch": 0.4189748264497581, + "grad_norm": 8.497584342956543, + "learning_rate": 4.779701073900453e-05, + "loss": 0.9571, + "step": 11950 + }, + { + "epoch": 0.41985134282308395, + "grad_norm": 0.04684456065297127, + "learning_rate": 4.7780778870002205e-05, + "loss": 0.774, + "step": 11975 + }, + { + "epoch": 0.4207278591964098, + "grad_norm": 6.000108242034912, + "learning_rate": 4.776454700099989e-05, + "loss": 0.8765, + "step": 12000 + }, + { + "epoch": 0.4216043755697356, + "grad_norm": 4.214759349822998, + "learning_rate": 4.774831513199756e-05, + "loss": 0.6598, + "step": 12025 + }, + { + "epoch": 0.4224808919430615, + "grad_norm": 9.4307279586792, + "learning_rate": 4.7732083262995236e-05, + "loss": 0.86, + "step": 12050 + }, + { + "epoch": 0.42335740831638735, + "grad_norm": 2.7616958618164062, + "learning_rate": 4.771585139399291e-05, + "loss": 0.8047, + "step": 12075 + }, + { + "epoch": 0.4242339246897132, + "grad_norm": 0.020237023010849953, + "learning_rate": 4.7699619524990585e-05, + "loss": 0.813, + "step": 12100 + }, + { + "epoch": 0.4251104410630391, + "grad_norm": 21.838762283325195, + "learning_rate": 4.768338765598827e-05, + "loss": 1.0385, + "step": 12125 + }, + { + "epoch": 0.4259869574363649, + "grad_norm": 11.382061004638672, + "learning_rate": 4.766715578698594e-05, + "loss": 0.9955, + "step": 12150 + }, + { + "epoch": 0.42686347380969075, + "grad_norm": 7.27595853805542, + "learning_rate": 4.7650923917983616e-05, + "loss": 0.8412, + "step": 12175 + }, + { + "epoch": 0.4277399901830166, + "grad_norm": 0.6162340641021729, + "learning_rate": 4.763469204898129e-05, + "loss": 1.0952, + "step": 12200 + }, + { + "epoch": 0.4286165065563425, + 
"grad_norm": 3.1064302921295166, + "learning_rate": 4.7618460179978965e-05, + "loss": 0.9034, + "step": 12225 + }, + { + "epoch": 0.42949302292966834, + "grad_norm": 0.9773826599121094, + "learning_rate": 4.760222831097664e-05, + "loss": 1.2921, + "step": 12250 + }, + { + "epoch": 0.43036953930299415, + "grad_norm": 11.120121955871582, + "learning_rate": 4.7585996441974315e-05, + "loss": 0.9335, + "step": 12275 + }, + { + "epoch": 0.43124605567632, + "grad_norm": 5.1735405921936035, + "learning_rate": 4.756976457297199e-05, + "loss": 1.0394, + "step": 12300 + }, + { + "epoch": 0.4321225720496459, + "grad_norm": 0.014818266965448856, + "learning_rate": 4.7553532703969664e-05, + "loss": 0.9712, + "step": 12325 + }, + { + "epoch": 0.43299908842297175, + "grad_norm": 5.651721000671387, + "learning_rate": 4.753730083496734e-05, + "loss": 0.7598, + "step": 12350 + }, + { + "epoch": 0.4338756047962976, + "grad_norm": 0.011702095158398151, + "learning_rate": 4.752106896596502e-05, + "loss": 0.6975, + "step": 12375 + }, + { + "epoch": 0.4347521211696235, + "grad_norm": 11.252761840820312, + "learning_rate": 4.7504837096962695e-05, + "loss": 0.9566, + "step": 12400 + }, + { + "epoch": 0.4356286375429493, + "grad_norm": 4.146478652954102, + "learning_rate": 4.748860522796037e-05, + "loss": 0.9102, + "step": 12425 + }, + { + "epoch": 0.43650515391627515, + "grad_norm": 10.628090858459473, + "learning_rate": 4.7472373358958044e-05, + "loss": 0.7803, + "step": 12450 + }, + { + "epoch": 0.437381670289601, + "grad_norm": 5.878779888153076, + "learning_rate": 4.7456141489955725e-05, + "loss": 0.8749, + "step": 12475 + }, + { + "epoch": 0.4382581866629269, + "grad_norm": 0.009643251076340675, + "learning_rate": 4.74399096209534e-05, + "loss": 0.9128, + "step": 12500 + }, + { + "epoch": 0.43913470303625274, + "grad_norm": 7.3341522216796875, + "learning_rate": 4.7423677751951075e-05, + "loss": 0.6562, + "step": 12525 + }, + { + "epoch": 0.44001121940957855, + "grad_norm": 7.9861931800842285, + "learning_rate": 4.740744588294875e-05, + "loss": 0.9844, + "step": 12550 + }, + { + "epoch": 0.4408877357829044, + "grad_norm": 9.245299339294434, + "learning_rate": 4.7391214013946424e-05, + "loss": 0.6937, + "step": 12575 + }, + { + "epoch": 0.4417642521562303, + "grad_norm": 4.324703693389893, + "learning_rate": 4.7374982144944105e-05, + "loss": 0.7441, + "step": 12600 + }, + { + "epoch": 0.44264076852955614, + "grad_norm": 6.6869893074035645, + "learning_rate": 4.735875027594177e-05, + "loss": 0.7005, + "step": 12625 + }, + { + "epoch": 0.443517284902882, + "grad_norm": 5.858974456787109, + "learning_rate": 4.734251840693945e-05, + "loss": 0.6217, + "step": 12650 + }, + { + "epoch": 0.44439380127620787, + "grad_norm": 12.305220603942871, + "learning_rate": 4.732628653793712e-05, + "loss": 1.0873, + "step": 12675 + }, + { + "epoch": 0.4452703176495337, + "grad_norm": 2.54984188079834, + "learning_rate": 4.73100546689348e-05, + "loss": 0.619, + "step": 12700 + }, + { + "epoch": 0.44614683402285954, + "grad_norm": 1.5886402130126953, + "learning_rate": 4.729382279993248e-05, + "loss": 0.9263, + "step": 12725 + }, + { + "epoch": 0.4470233503961854, + "grad_norm": 0.008074513636529446, + "learning_rate": 4.727759093093015e-05, + "loss": 0.9176, + "step": 12750 + }, + { + "epoch": 0.44789986676951127, + "grad_norm": 6.999600410461426, + "learning_rate": 4.726135906192783e-05, + "loss": 1.2257, + "step": 12775 + }, + { + "epoch": 0.44877638314283713, + "grad_norm": 4.357109069824219, + "learning_rate": 
4.72451271929255e-05, + "loss": 0.7147, + "step": 12800 + }, + { + "epoch": 0.44965289951616294, + "grad_norm": 19.312379837036133, + "learning_rate": 4.722889532392318e-05, + "loss": 0.9763, + "step": 12825 + }, + { + "epoch": 0.4505294158894888, + "grad_norm": 9.641755104064941, + "learning_rate": 4.721266345492086e-05, + "loss": 1.2435, + "step": 12850 + }, + { + "epoch": 0.45140593226281467, + "grad_norm": 6.428452968597412, + "learning_rate": 4.719643158591853e-05, + "loss": 0.9649, + "step": 12875 + }, + { + "epoch": 0.45228244863614053, + "grad_norm": 9.004369735717773, + "learning_rate": 4.718019971691621e-05, + "loss": 0.9376, + "step": 12900 + }, + { + "epoch": 0.4531589650094664, + "grad_norm": 3.575873851776123, + "learning_rate": 4.716396784791388e-05, + "loss": 0.7963, + "step": 12925 + }, + { + "epoch": 0.4540354813827922, + "grad_norm": 8.474581718444824, + "learning_rate": 4.714773597891156e-05, + "loss": 0.7405, + "step": 12950 + }, + { + "epoch": 0.45491199775611807, + "grad_norm": 11.565832138061523, + "learning_rate": 4.713150410990924e-05, + "loss": 0.9037, + "step": 12975 + }, + { + "epoch": 0.45578851412944393, + "grad_norm": 0.008348888717591763, + "learning_rate": 4.711527224090691e-05, + "loss": 0.7464, + "step": 13000 + }, + { + "epoch": 0.4566650305027698, + "grad_norm": 12.881969451904297, + "learning_rate": 4.709904037190459e-05, + "loss": 0.825, + "step": 13025 + }, + { + "epoch": 0.45754154687609566, + "grad_norm": 13.01504898071289, + "learning_rate": 4.7082808502902256e-05, + "loss": 0.8277, + "step": 13050 + }, + { + "epoch": 0.4584180632494215, + "grad_norm": 10.522770881652832, + "learning_rate": 4.706657663389993e-05, + "loss": 0.5278, + "step": 13075 + }, + { + "epoch": 0.45929457962274733, + "grad_norm": 5.754170894622803, + "learning_rate": 4.705034476489761e-05, + "loss": 0.8384, + "step": 13100 + }, + { + "epoch": 0.4601710959960732, + "grad_norm": 14.059527397155762, + "learning_rate": 4.7034112895895287e-05, + "loss": 1.0784, + "step": 13125 + }, + { + "epoch": 0.46104761236939906, + "grad_norm": 6.517821788787842, + "learning_rate": 4.701788102689296e-05, + "loss": 0.7391, + "step": 13150 + }, + { + "epoch": 0.4619241287427249, + "grad_norm": 9.433304786682129, + "learning_rate": 4.7001649157890636e-05, + "loss": 0.6308, + "step": 13175 + }, + { + "epoch": 0.4628006451160508, + "grad_norm": 9.67560863494873, + "learning_rate": 4.698541728888831e-05, + "loss": 0.663, + "step": 13200 + }, + { + "epoch": 0.4636771614893766, + "grad_norm": 11.441699981689453, + "learning_rate": 4.696918541988599e-05, + "loss": 0.7438, + "step": 13225 + }, + { + "epoch": 0.46455367786270246, + "grad_norm": 8.98148250579834, + "learning_rate": 4.6952953550883667e-05, + "loss": 0.6593, + "step": 13250 + }, + { + "epoch": 0.4654301942360283, + "grad_norm": 9.426572799682617, + "learning_rate": 4.693672168188134e-05, + "loss": 0.8849, + "step": 13275 + }, + { + "epoch": 0.4663067106093542, + "grad_norm": 13.168689727783203, + "learning_rate": 4.6920489812879016e-05, + "loss": 0.8706, + "step": 13300 + }, + { + "epoch": 0.46718322698268006, + "grad_norm": 7.38999080657959, + "learning_rate": 4.690425794387669e-05, + "loss": 0.8829, + "step": 13325 + }, + { + "epoch": 0.46805974335600586, + "grad_norm": 8.570085525512695, + "learning_rate": 4.688802607487437e-05, + "loss": 0.8741, + "step": 13350 + }, + { + "epoch": 0.46893625972933173, + "grad_norm": 7.0719218254089355, + "learning_rate": 4.687179420587205e-05, + "loss": 0.8429, + "step": 13375 + }, + { + "epoch": 
0.4698127761026576, + "grad_norm": 7.8700456619262695, + "learning_rate": 4.685556233686972e-05, + "loss": 0.7048, + "step": 13400 + }, + { + "epoch": 0.47068929247598346, + "grad_norm": 6.611064910888672, + "learning_rate": 4.6839330467867396e-05, + "loss": 0.7332, + "step": 13425 + }, + { + "epoch": 0.4715658088493093, + "grad_norm": 0.007439002860337496, + "learning_rate": 4.682309859886507e-05, + "loss": 0.5985, + "step": 13450 + }, + { + "epoch": 0.4724423252226352, + "grad_norm": 0.007731274235993624, + "learning_rate": 4.6806866729862745e-05, + "loss": 0.7178, + "step": 13475 + }, + { + "epoch": 0.473318841595961, + "grad_norm": 18.222192764282227, + "learning_rate": 4.679063486086042e-05, + "loss": 0.7678, + "step": 13500 + }, + { + "epoch": 0.47419535796928686, + "grad_norm": 5.502414226531982, + "learning_rate": 4.6774402991858094e-05, + "loss": 0.9083, + "step": 13525 + }, + { + "epoch": 0.4750718743426127, + "grad_norm": 11.877302169799805, + "learning_rate": 4.675817112285577e-05, + "loss": 0.8099, + "step": 13550 + }, + { + "epoch": 0.4759483907159386, + "grad_norm": 7.975980758666992, + "learning_rate": 4.674193925385345e-05, + "loss": 1.2048, + "step": 13575 + }, + { + "epoch": 0.47682490708926445, + "grad_norm": 6.720562934875488, + "learning_rate": 4.6725707384851125e-05, + "loss": 0.6301, + "step": 13600 + }, + { + "epoch": 0.47770142346259026, + "grad_norm": 2.212615728378296, + "learning_rate": 4.67094755158488e-05, + "loss": 0.7361, + "step": 13625 + }, + { + "epoch": 0.4785779398359161, + "grad_norm": 7.376840591430664, + "learning_rate": 4.6693243646846474e-05, + "loss": 0.899, + "step": 13650 + }, + { + "epoch": 0.479454456209242, + "grad_norm": 9.050980567932129, + "learning_rate": 4.667701177784415e-05, + "loss": 1.0607, + "step": 13675 + }, + { + "epoch": 0.48033097258256785, + "grad_norm": 2.5061824321746826, + "learning_rate": 4.666077990884183e-05, + "loss": 0.7377, + "step": 13700 + }, + { + "epoch": 0.4812074889558937, + "grad_norm": 14.498836517333984, + "learning_rate": 4.6644548039839505e-05, + "loss": 0.9932, + "step": 13725 + }, + { + "epoch": 0.4820840053292195, + "grad_norm": 5.407880783081055, + "learning_rate": 4.662831617083718e-05, + "loss": 0.7819, + "step": 13750 + }, + { + "epoch": 0.4829605217025454, + "grad_norm": 5.9832234382629395, + "learning_rate": 4.6612084301834854e-05, + "loss": 0.7796, + "step": 13775 + }, + { + "epoch": 0.48383703807587125, + "grad_norm": 7.125918388366699, + "learning_rate": 4.659585243283253e-05, + "loss": 0.912, + "step": 13800 + }, + { + "epoch": 0.4847135544491971, + "grad_norm": 3.132843255996704, + "learning_rate": 4.6579620563830204e-05, + "loss": 0.7696, + "step": 13825 + }, + { + "epoch": 0.485590070822523, + "grad_norm": 8.794737815856934, + "learning_rate": 4.656338869482788e-05, + "loss": 0.8038, + "step": 13850 + }, + { + "epoch": 0.48646658719584884, + "grad_norm": 2.969339370727539, + "learning_rate": 4.654715682582555e-05, + "loss": 1.173, + "step": 13875 + }, + { + "epoch": 0.48734310356917465, + "grad_norm": 7.546872138977051, + "learning_rate": 4.653092495682323e-05, + "loss": 0.7024, + "step": 13900 + }, + { + "epoch": 0.4882196199425005, + "grad_norm": 11.537336349487305, + "learning_rate": 4.65146930878209e-05, + "loss": 0.7928, + "step": 13925 + }, + { + "epoch": 0.4890961363158264, + "grad_norm": 0.005619999952614307, + "learning_rate": 4.6498461218818584e-05, + "loss": 0.7802, + "step": 13950 + }, + { + "epoch": 0.48997265268915224, + "grad_norm": 9.538442611694336, + "learning_rate": 
4.648222934981626e-05, + "loss": 0.9079, + "step": 13975 + }, + { + "epoch": 0.4908491690624781, + "grad_norm": 12.42443561553955, + "learning_rate": 4.646599748081393e-05, + "loss": 0.7065, + "step": 14000 + }, + { + "epoch": 0.4917256854358039, + "grad_norm": 6.318347454071045, + "learning_rate": 4.644976561181161e-05, + "loss": 0.6327, + "step": 14025 + }, + { + "epoch": 0.4926022018091298, + "grad_norm": 0.0048320600762963295, + "learning_rate": 4.643353374280928e-05, + "loss": 0.9488, + "step": 14050 + }, + { + "epoch": 0.49347871818245564, + "grad_norm": 7.790452480316162, + "learning_rate": 4.6417301873806964e-05, + "loss": 0.9586, + "step": 14075 + }, + { + "epoch": 0.4943552345557815, + "grad_norm": 5.275586128234863, + "learning_rate": 4.640107000480464e-05, + "loss": 0.9722, + "step": 14100 + }, + { + "epoch": 0.4952317509291074, + "grad_norm": 11.852282524108887, + "learning_rate": 4.638483813580231e-05, + "loss": 1.3176, + "step": 14125 + }, + { + "epoch": 0.49610826730243324, + "grad_norm": 3.87994384765625, + "learning_rate": 4.636860626679999e-05, + "loss": 0.9594, + "step": 14150 + }, + { + "epoch": 0.49698478367575905, + "grad_norm": 2.6835203170776367, + "learning_rate": 4.635237439779766e-05, + "loss": 1.3925, + "step": 14175 + }, + { + "epoch": 0.4978613000490849, + "grad_norm": 8.791675567626953, + "learning_rate": 4.633614252879534e-05, + "loss": 1.5515, + "step": 14200 + }, + { + "epoch": 0.4987378164224108, + "grad_norm": 7.096665859222412, + "learning_rate": 4.631991065979301e-05, + "loss": 1.3706, + "step": 14225 + }, + { + "epoch": 0.49961433279573664, + "grad_norm": 9.926222801208496, + "learning_rate": 4.6303678790790686e-05, + "loss": 1.1635, + "step": 14250 + }, + { + "epoch": 0.5004908491690625, + "grad_norm": 3.2490410804748535, + "learning_rate": 4.628744692178836e-05, + "loss": 1.0267, + "step": 14275 + }, + { + "epoch": 0.5013673655423884, + "grad_norm": 4.522827625274658, + "learning_rate": 4.6271215052786036e-05, + "loss": 1.3342, + "step": 14300 + }, + { + "epoch": 0.5022438819157142, + "grad_norm": 4.870907306671143, + "learning_rate": 4.625498318378372e-05, + "loss": 1.399, + "step": 14325 + }, + { + "epoch": 0.5031203982890401, + "grad_norm": 5.891185283660889, + "learning_rate": 4.623875131478139e-05, + "loss": 1.2098, + "step": 14350 + }, + { + "epoch": 0.5039969146623658, + "grad_norm": 10.622488021850586, + "learning_rate": 4.6222519445779066e-05, + "loss": 1.1266, + "step": 14375 + }, + { + "epoch": 0.5048734310356917, + "grad_norm": 0.006781752221286297, + "learning_rate": 4.620628757677674e-05, + "loss": 1.2043, + "step": 14400 + }, + { + "epoch": 0.5057499474090176, + "grad_norm": 4.930004596710205, + "learning_rate": 4.6190055707774416e-05, + "loss": 1.1662, + "step": 14425 + }, + { + "epoch": 0.5066264637823434, + "grad_norm": 9.217228889465332, + "learning_rate": 4.61738238387721e-05, + "loss": 1.195, + "step": 14450 + }, + { + "epoch": 0.5075029801556693, + "grad_norm": 4.175748825073242, + "learning_rate": 4.615759196976977e-05, + "loss": 1.2792, + "step": 14475 + }, + { + "epoch": 0.5083794965289952, + "grad_norm": 6.321438312530518, + "learning_rate": 4.6141360100767446e-05, + "loss": 1.1988, + "step": 14500 + }, + { + "epoch": 0.509256012902321, + "grad_norm": 4.776960372924805, + "learning_rate": 4.612512823176512e-05, + "loss": 1.309, + "step": 14525 + }, + { + "epoch": 0.5101325292756469, + "grad_norm": 5.556209564208984, + "learning_rate": 4.6108896362762796e-05, + "loss": 0.9723, + "step": 14550 + }, + { + "epoch": 
0.5110090456489728, + "grad_norm": 5.256525993347168, + "learning_rate": 4.609266449376047e-05, + "loss": 1.379, + "step": 14575 + }, + { + "epoch": 0.5118855620222986, + "grad_norm": 0.005491985473781824, + "learning_rate": 4.6076432624758145e-05, + "loss": 1.1296, + "step": 14600 + }, + { + "epoch": 0.5127620783956244, + "grad_norm": 5.322751045227051, + "learning_rate": 4.606020075575582e-05, + "loss": 1.1221, + "step": 14625 + }, + { + "epoch": 0.5136385947689502, + "grad_norm": 20.669130325317383, + "learning_rate": 4.6043968886753494e-05, + "loss": 1.282, + "step": 14650 + }, + { + "epoch": 0.5145151111422761, + "grad_norm": 8.551734924316406, + "learning_rate": 4.6027737017751176e-05, + "loss": 1.9343, + "step": 14675 + }, + { + "epoch": 0.515391627515602, + "grad_norm": 4.299094200134277, + "learning_rate": 4.601150514874885e-05, + "loss": 1.8772, + "step": 14700 + }, + { + "epoch": 0.5162681438889278, + "grad_norm": 4.1930718421936035, + "learning_rate": 4.5995273279746525e-05, + "loss": 1.4641, + "step": 14725 + }, + { + "epoch": 0.5171446602622537, + "grad_norm": 6.161220550537109, + "learning_rate": 4.59790414107442e-05, + "loss": 1.0833, + "step": 14750 + }, + { + "epoch": 0.5180211766355796, + "grad_norm": 8.075519561767578, + "learning_rate": 4.5962809541741874e-05, + "loss": 1.2873, + "step": 14775 + }, + { + "epoch": 0.5188976930089054, + "grad_norm": 3.982177495956421, + "learning_rate": 4.5946577672739556e-05, + "loss": 1.2467, + "step": 14800 + }, + { + "epoch": 0.5197742093822313, + "grad_norm": 5.434603691101074, + "learning_rate": 4.593034580373723e-05, + "loss": 0.8939, + "step": 14825 + }, + { + "epoch": 0.5206507257555572, + "grad_norm": 6.292034149169922, + "learning_rate": 4.5914113934734905e-05, + "loss": 1.3094, + "step": 14850 + }, + { + "epoch": 0.521527242128883, + "grad_norm": 15.706145286560059, + "learning_rate": 4.589788206573258e-05, + "loss": 1.9144, + "step": 14875 + }, + { + "epoch": 0.5224037585022088, + "grad_norm": 4.467050075531006, + "learning_rate": 4.5881650196730254e-05, + "loss": 1.2079, + "step": 14900 + }, + { + "epoch": 0.5232802748755346, + "grad_norm": 6.932319164276123, + "learning_rate": 4.5865418327727936e-05, + "loss": 0.9257, + "step": 14925 + }, + { + "epoch": 0.5241567912488605, + "grad_norm": 5.143194675445557, + "learning_rate": 4.584918645872561e-05, + "loss": 2.8129, + "step": 14950 + }, + { + "epoch": 0.5250333076221864, + "grad_norm": 7.185523986816406, + "learning_rate": 4.583295458972328e-05, + "loss": 2.3135, + "step": 14975 + }, + { + "epoch": 0.5259098239955122, + "grad_norm": 6.707859992980957, + "learning_rate": 4.581672272072095e-05, + "loss": 1.7409, + "step": 15000 + }, + { + "epoch": 0.5267863403688381, + "grad_norm": 4.802098751068115, + "learning_rate": 4.580049085171863e-05, + "loss": 1.7744, + "step": 15025 + }, + { + "epoch": 0.527662856742164, + "grad_norm": 7.939062595367432, + "learning_rate": 4.578425898271631e-05, + "loss": 1.7169, + "step": 15050 + }, + { + "epoch": 0.5285393731154898, + "grad_norm": 7.36226749420166, + "learning_rate": 4.5768027113713984e-05, + "loss": 1.6652, + "step": 15075 + }, + { + "epoch": 0.5294158894888157, + "grad_norm": 6.879636764526367, + "learning_rate": 4.575179524471166e-05, + "loss": 1.954, + "step": 15100 + }, + { + "epoch": 0.5302924058621415, + "grad_norm": 6.151422023773193, + "learning_rate": 4.573556337570933e-05, + "loss": 1.8467, + "step": 15125 + }, + { + "epoch": 0.5311689222354674, + "grad_norm": 5.700201511383057, + "learning_rate": 4.571933150670701e-05, 
+ "loss": 1.7295, + "step": 15150 + }, + { + "epoch": 0.5320454386087932, + "grad_norm": 5.273646831512451, + "learning_rate": 4.570309963770469e-05, + "loss": 1.6805, + "step": 15175 + }, + { + "epoch": 0.532921954982119, + "grad_norm": 8.710776329040527, + "learning_rate": 4.5686867768702364e-05, + "loss": 1.7392, + "step": 15200 + }, + { + "epoch": 0.5337984713554449, + "grad_norm": 6.670529365539551, + "learning_rate": 4.567063589970004e-05, + "loss": 1.7321, + "step": 15225 + }, + { + "epoch": 0.5346749877287708, + "grad_norm": 7.633786678314209, + "learning_rate": 4.565440403069771e-05, + "loss": 1.7001, + "step": 15250 + }, + { + "epoch": 0.5355515041020966, + "grad_norm": 11.721328735351562, + "learning_rate": 4.563817216169539e-05, + "loss": 1.6775, + "step": 15275 + }, + { + "epoch": 0.5364280204754225, + "grad_norm": 6.974327564239502, + "learning_rate": 4.562194029269307e-05, + "loss": 1.6548, + "step": 15300 + }, + { + "epoch": 0.5373045368487483, + "grad_norm": 6.553997993469238, + "learning_rate": 4.5605708423690744e-05, + "loss": 1.5673, + "step": 15325 + }, + { + "epoch": 0.5381810532220742, + "grad_norm": 9.465036392211914, + "learning_rate": 4.558947655468842e-05, + "loss": 1.9434, + "step": 15350 + }, + { + "epoch": 0.5390575695954001, + "grad_norm": 4.727696895599365, + "learning_rate": 4.557324468568609e-05, + "loss": 1.5181, + "step": 15375 + }, + { + "epoch": 0.5399340859687259, + "grad_norm": 4.43351936340332, + "learning_rate": 4.555701281668376e-05, + "loss": 1.9304, + "step": 15400 + }, + { + "epoch": 0.5408106023420518, + "grad_norm": 13.859894752502441, + "learning_rate": 4.554078094768144e-05, + "loss": 1.7938, + "step": 15425 + }, + { + "epoch": 0.5416871187153776, + "grad_norm": 5.851466178894043, + "learning_rate": 4.552454907867912e-05, + "loss": 1.7435, + "step": 15450 + }, + { + "epoch": 0.5425636350887034, + "grad_norm": 7.271807670593262, + "learning_rate": 4.550831720967679e-05, + "loss": 1.7852, + "step": 15475 + }, + { + "epoch": 0.5434401514620293, + "grad_norm": 4.827552318572998, + "learning_rate": 4.5492085340674466e-05, + "loss": 1.7299, + "step": 15500 + }, + { + "epoch": 0.5443166678353552, + "grad_norm": 5.547919750213623, + "learning_rate": 4.547585347167214e-05, + "loss": 1.7872, + "step": 15525 + }, + { + "epoch": 0.545193184208681, + "grad_norm": 7.546894073486328, + "learning_rate": 4.545962160266982e-05, + "loss": 1.6203, + "step": 15550 + }, + { + "epoch": 0.5460697005820069, + "grad_norm": 4.652714729309082, + "learning_rate": 4.54433897336675e-05, + "loss": 1.6302, + "step": 15575 + }, + { + "epoch": 0.5469462169553327, + "grad_norm": 7.520934581756592, + "learning_rate": 4.542715786466517e-05, + "loss": 1.6264, + "step": 15600 + }, + { + "epoch": 0.5478227333286586, + "grad_norm": 11.276270866394043, + "learning_rate": 4.5410925995662846e-05, + "loss": 1.5159, + "step": 15625 + }, + { + "epoch": 0.5486992497019845, + "grad_norm": 12.868476867675781, + "learning_rate": 4.539469412666052e-05, + "loss": 1.7946, + "step": 15650 + }, + { + "epoch": 0.5495757660753103, + "grad_norm": 7.865349769592285, + "learning_rate": 4.53784622576582e-05, + "loss": 1.8599, + "step": 15675 + }, + { + "epoch": 0.5504522824486361, + "grad_norm": 6.379178524017334, + "learning_rate": 4.536223038865588e-05, + "loss": 1.9412, + "step": 15700 + }, + { + "epoch": 0.551328798821962, + "grad_norm": 5.303730010986328, + "learning_rate": 4.534599851965355e-05, + "loss": 1.6397, + "step": 15725 + }, + { + "epoch": 0.5522053151952878, + "grad_norm": 
6.336709022521973, + "learning_rate": 4.5329766650651226e-05, + "loss": 1.6109, + "step": 15750 + }, + { + "epoch": 0.5530818315686137, + "grad_norm": 10.06227970123291, + "learning_rate": 4.53135347816489e-05, + "loss": 1.9047, + "step": 15775 + }, + { + "epoch": 0.5539583479419395, + "grad_norm": 6.412175178527832, + "learning_rate": 4.5297302912646575e-05, + "loss": 1.8481, + "step": 15800 + }, + { + "epoch": 0.5548348643152654, + "grad_norm": 5.124109745025635, + "learning_rate": 4.528107104364425e-05, + "loss": 1.6639, + "step": 15825 + }, + { + "epoch": 0.5557113806885913, + "grad_norm": 11.274688720703125, + "learning_rate": 4.5264839174641925e-05, + "loss": 1.7984, + "step": 15850 + }, + { + "epoch": 0.5565878970619171, + "grad_norm": 8.960264205932617, + "learning_rate": 4.52486073056396e-05, + "loss": 1.6157, + "step": 15875 + }, + { + "epoch": 0.557464413435243, + "grad_norm": 6.758230209350586, + "learning_rate": 4.523237543663728e-05, + "loss": 1.799, + "step": 15900 + }, + { + "epoch": 0.5583409298085689, + "grad_norm": 10.868471145629883, + "learning_rate": 4.5216143567634955e-05, + "loss": 1.9462, + "step": 15925 + }, + { + "epoch": 0.5592174461818947, + "grad_norm": 8.634478569030762, + "learning_rate": 4.519991169863263e-05, + "loss": 1.7245, + "step": 15950 + }, + { + "epoch": 0.5600939625552205, + "grad_norm": 16.205564498901367, + "learning_rate": 4.5183679829630305e-05, + "loss": 1.7381, + "step": 15975 + }, + { + "epoch": 0.5609704789285463, + "grad_norm": 7.925133228302002, + "learning_rate": 4.516744796062798e-05, + "loss": 1.7097, + "step": 16000 + }, + { + "epoch": 0.5618469953018722, + "grad_norm": 9.860912322998047, + "learning_rate": 4.515121609162566e-05, + "loss": 1.7215, + "step": 16025 + }, + { + "epoch": 0.5627235116751981, + "grad_norm": 9.497489929199219, + "learning_rate": 4.5134984222623335e-05, + "loss": 1.63, + "step": 16050 + }, + { + "epoch": 0.5636000280485239, + "grad_norm": 5.452057361602783, + "learning_rate": 4.511875235362101e-05, + "loss": 1.6042, + "step": 16075 + }, + { + "epoch": 0.5644765444218498, + "grad_norm": 8.061319351196289, + "learning_rate": 4.5102520484618685e-05, + "loss": 1.7619, + "step": 16100 + }, + { + "epoch": 0.5653530607951757, + "grad_norm": 7.426731109619141, + "learning_rate": 4.508628861561636e-05, + "loss": 1.6499, + "step": 16125 + }, + { + "epoch": 0.5662295771685015, + "grad_norm": 5.175726413726807, + "learning_rate": 4.5070056746614034e-05, + "loss": 1.5959, + "step": 16150 + }, + { + "epoch": 0.5671060935418274, + "grad_norm": 11.267877578735352, + "learning_rate": 4.505382487761171e-05, + "loss": 1.5972, + "step": 16175 + }, + { + "epoch": 0.5679826099151533, + "grad_norm": 5.426123142242432, + "learning_rate": 4.503759300860938e-05, + "loss": 1.679, + "step": 16200 + }, + { + "epoch": 0.5688591262884791, + "grad_norm": 4.787314414978027, + "learning_rate": 4.502136113960706e-05, + "loss": 1.7257, + "step": 16225 + }, + { + "epoch": 0.5697356426618049, + "grad_norm": 7.594196796417236, + "learning_rate": 4.500512927060473e-05, + "loss": 1.7046, + "step": 16250 + }, + { + "epoch": 0.5706121590351307, + "grad_norm": 7.340895175933838, + "learning_rate": 4.4988897401602414e-05, + "loss": 1.5493, + "step": 16275 + }, + { + "epoch": 0.5714886754084566, + "grad_norm": 11.057573318481445, + "learning_rate": 4.497266553260009e-05, + "loss": 1.6886, + "step": 16300 + }, + { + "epoch": 0.5723651917817825, + "grad_norm": 4.6211700439453125, + "learning_rate": 4.495643366359776e-05, + "loss": 1.7571, + "step": 16325 + 
}, + { + "epoch": 0.5732417081551083, + "grad_norm": 13.614513397216797, + "learning_rate": 4.494020179459544e-05, + "loss": 1.5971, + "step": 16350 + }, + { + "epoch": 0.5741182245284342, + "grad_norm": 6.6602935791015625, + "learning_rate": 4.492396992559311e-05, + "loss": 1.8366, + "step": 16375 + }, + { + "epoch": 0.5749947409017601, + "grad_norm": 6.244035720825195, + "learning_rate": 4.4907738056590794e-05, + "loss": 1.84, + "step": 16400 + }, + { + "epoch": 0.5758712572750859, + "grad_norm": 8.937885284423828, + "learning_rate": 4.489150618758847e-05, + "loss": 1.7698, + "step": 16425 + }, + { + "epoch": 0.5767477736484118, + "grad_norm": 7.110170364379883, + "learning_rate": 4.487527431858614e-05, + "loss": 1.7163, + "step": 16450 + }, + { + "epoch": 0.5776242900217377, + "grad_norm": 12.116111755371094, + "learning_rate": 4.485904244958382e-05, + "loss": 1.8528, + "step": 16475 + }, + { + "epoch": 0.5785008063950634, + "grad_norm": 7.487866401672363, + "learning_rate": 4.484281058058149e-05, + "loss": 1.553, + "step": 16500 + }, + { + "epoch": 0.5793773227683893, + "grad_norm": 5.478507995605469, + "learning_rate": 4.482657871157917e-05, + "loss": 1.8316, + "step": 16525 + }, + { + "epoch": 0.5802538391417151, + "grad_norm": 7.665471076965332, + "learning_rate": 4.481034684257684e-05, + "loss": 1.8116, + "step": 16550 + }, + { + "epoch": 0.581130355515041, + "grad_norm": 7.017566680908203, + "learning_rate": 4.4794114973574516e-05, + "loss": 1.6257, + "step": 16575 + }, + { + "epoch": 0.5820068718883669, + "grad_norm": 5.722063064575195, + "learning_rate": 4.477788310457219e-05, + "loss": 1.5587, + "step": 16600 + }, + { + "epoch": 0.5828833882616927, + "grad_norm": 10.578594207763672, + "learning_rate": 4.4761651235569866e-05, + "loss": 1.8873, + "step": 16625 + }, + { + "epoch": 0.5837599046350186, + "grad_norm": 5.816608428955078, + "learning_rate": 4.474541936656755e-05, + "loss": 1.5935, + "step": 16650 + }, + { + "epoch": 0.5846364210083445, + "grad_norm": 4.97550106048584, + "learning_rate": 4.472918749756522e-05, + "loss": 1.6838, + "step": 16675 + }, + { + "epoch": 0.5855129373816703, + "grad_norm": 10.621302604675293, + "learning_rate": 4.4712955628562896e-05, + "loss": 1.7974, + "step": 16700 + }, + { + "epoch": 0.5863894537549962, + "grad_norm": 7.426506519317627, + "learning_rate": 4.469672375956057e-05, + "loss": 1.7038, + "step": 16725 + }, + { + "epoch": 0.587265970128322, + "grad_norm": 5.472107410430908, + "learning_rate": 4.4680491890558246e-05, + "loss": 1.6955, + "step": 16750 + }, + { + "epoch": 0.5881424865016478, + "grad_norm": 7.1849541664123535, + "learning_rate": 4.466426002155593e-05, + "loss": 1.7232, + "step": 16775 + }, + { + "epoch": 0.5890190028749737, + "grad_norm": 6.121264457702637, + "learning_rate": 4.46480281525536e-05, + "loss": 1.7092, + "step": 16800 + }, + { + "epoch": 0.5898955192482995, + "grad_norm": 13.896688461303711, + "learning_rate": 4.4631796283551276e-05, + "loss": 1.7411, + "step": 16825 + }, + { + "epoch": 0.5907720356216254, + "grad_norm": 6.469836235046387, + "learning_rate": 4.461556441454895e-05, + "loss": 1.8019, + "step": 16850 + }, + { + "epoch": 0.5916485519949513, + "grad_norm": 6.170224189758301, + "learning_rate": 4.459933254554663e-05, + "loss": 1.7799, + "step": 16875 + }, + { + "epoch": 0.5925250683682771, + "grad_norm": 4.628929138183594, + "learning_rate": 4.45831006765443e-05, + "loss": 1.596, + "step": 16900 + }, + { + "epoch": 0.593401584741603, + "grad_norm": 7.409107685089111, + "learning_rate": 
4.4566868807541975e-05, + "loss": 1.6647, + "step": 16925 + }, + { + "epoch": 0.5942781011149288, + "grad_norm": 4.833291530609131, + "learning_rate": 4.455063693853965e-05, + "loss": 1.6346, + "step": 16950 + }, + { + "epoch": 0.5951546174882547, + "grad_norm": 3.914989471435547, + "learning_rate": 4.4534405069537324e-05, + "loss": 1.6983, + "step": 16975 + }, + { + "epoch": 0.5960311338615806, + "grad_norm": 13.422329902648926, + "learning_rate": 4.4518173200535006e-05, + "loss": 2.0415, + "step": 17000 + }, + { + "epoch": 0.5969076502349064, + "grad_norm": 7.7497076988220215, + "learning_rate": 4.450194133153268e-05, + "loss": 1.9194, + "step": 17025 + }, + { + "epoch": 0.5977841666082322, + "grad_norm": 6.6223015785217285, + "learning_rate": 4.4485709462530355e-05, + "loss": 1.663, + "step": 17050 + }, + { + "epoch": 0.5986606829815581, + "grad_norm": 4.527822494506836, + "learning_rate": 4.446947759352803e-05, + "loss": 1.8353, + "step": 17075 + }, + { + "epoch": 0.5995371993548839, + "grad_norm": 9.909481048583984, + "learning_rate": 4.4453245724525704e-05, + "loss": 1.5654, + "step": 17100 + }, + { + "epoch": 0.6004137157282098, + "grad_norm": 4.866239070892334, + "learning_rate": 4.4437013855523386e-05, + "loss": 1.7691, + "step": 17125 + }, + { + "epoch": 0.6012902321015356, + "grad_norm": 9.599346160888672, + "learning_rate": 4.442078198652106e-05, + "loss": 1.6088, + "step": 17150 + }, + { + "epoch": 0.6021667484748615, + "grad_norm": 5.112902641296387, + "learning_rate": 4.4404550117518735e-05, + "loss": 1.664, + "step": 17175 + }, + { + "epoch": 0.6030432648481874, + "grad_norm": 9.690934181213379, + "learning_rate": 4.438831824851641e-05, + "loss": 1.5636, + "step": 17200 + }, + { + "epoch": 0.6039197812215132, + "grad_norm": 7.4422478675842285, + "learning_rate": 4.4372086379514084e-05, + "loss": 1.7974, + "step": 17225 + }, + { + "epoch": 0.6047962975948391, + "grad_norm": 4.296697616577148, + "learning_rate": 4.4355854510511766e-05, + "loss": 1.8195, + "step": 17250 + }, + { + "epoch": 0.605672813968165, + "grad_norm": 4.885331153869629, + "learning_rate": 4.433962264150944e-05, + "loss": 1.6713, + "step": 17275 + }, + { + "epoch": 0.6065493303414908, + "grad_norm": 4.312460899353027, + "learning_rate": 4.4323390772507115e-05, + "loss": 1.8646, + "step": 17300 + }, + { + "epoch": 0.6074258467148166, + "grad_norm": 13.629843711853027, + "learning_rate": 4.430715890350478e-05, + "loss": 1.7689, + "step": 17325 + }, + { + "epoch": 0.6083023630881425, + "grad_norm": 9.86801528930664, + "learning_rate": 4.429092703450246e-05, + "loss": 1.7393, + "step": 17350 + }, + { + "epoch": 0.6091788794614683, + "grad_norm": 9.650004386901855, + "learning_rate": 4.427469516550014e-05, + "loss": 1.6448, + "step": 17375 + }, + { + "epoch": 0.6100553958347942, + "grad_norm": 4.611039161682129, + "learning_rate": 4.4258463296497814e-05, + "loss": 1.7145, + "step": 17400 + }, + { + "epoch": 0.61093191220812, + "grad_norm": 9.571714401245117, + "learning_rate": 4.424223142749549e-05, + "loss": 1.8565, + "step": 17425 + }, + { + "epoch": 0.6118084285814459, + "grad_norm": 5.638747692108154, + "learning_rate": 4.422599955849316e-05, + "loss": 1.8378, + "step": 17450 + }, + { + "epoch": 0.6126849449547718, + "grad_norm": 6.91680383682251, + "learning_rate": 4.420976768949084e-05, + "loss": 1.7915, + "step": 17475 + }, + { + "epoch": 0.6135614613280976, + "grad_norm": 6.8779425621032715, + "learning_rate": 4.419353582048852e-05, + "loss": 1.7398, + "step": 17500 + }, + { + "epoch": 
0.6144379777014235, + "grad_norm": 6.73306941986084, + "learning_rate": 4.4177303951486194e-05, + "loss": 1.7923, + "step": 17525 + }, + { + "epoch": 0.6153144940747494, + "grad_norm": 4.347649097442627, + "learning_rate": 4.416107208248387e-05, + "loss": 1.6621, + "step": 17550 + }, + { + "epoch": 0.6161910104480751, + "grad_norm": 19.843761444091797, + "learning_rate": 4.414484021348154e-05, + "loss": 1.8452, + "step": 17575 + }, + { + "epoch": 0.617067526821401, + "grad_norm": 6.9806952476501465, + "learning_rate": 4.412860834447922e-05, + "loss": 1.7109, + "step": 17600 + }, + { + "epoch": 0.6179440431947268, + "grad_norm": 4.492630958557129, + "learning_rate": 4.41123764754769e-05, + "loss": 1.5401, + "step": 17625 + }, + { + "epoch": 0.6188205595680527, + "grad_norm": 9.098654747009277, + "learning_rate": 4.4096144606474574e-05, + "loss": 1.8705, + "step": 17650 + }, + { + "epoch": 0.6196970759413786, + "grad_norm": 5.076759338378906, + "learning_rate": 4.407991273747225e-05, + "loss": 1.7745, + "step": 17675 + }, + { + "epoch": 0.6205735923147044, + "grad_norm": 6.41923713684082, + "learning_rate": 4.406368086846992e-05, + "loss": 1.5924, + "step": 17700 + }, + { + "epoch": 0.6214501086880303, + "grad_norm": 7.299322605133057, + "learning_rate": 4.40474489994676e-05, + "loss": 1.879, + "step": 17725 + }, + { + "epoch": 0.6223266250613562, + "grad_norm": 4.2683424949646, + "learning_rate": 4.403121713046527e-05, + "loss": 1.6311, + "step": 17750 + }, + { + "epoch": 0.623203141434682, + "grad_norm": 7.15070104598999, + "learning_rate": 4.401498526146295e-05, + "loss": 1.6602, + "step": 17775 + }, + { + "epoch": 0.6240796578080079, + "grad_norm": 7.759368419647217, + "learning_rate": 4.399875339246062e-05, + "loss": 1.7454, + "step": 17800 + }, + { + "epoch": 0.6249561741813338, + "grad_norm": 11.27694320678711, + "learning_rate": 4.3982521523458296e-05, + "loss": 1.865, + "step": 17825 + }, + { + "epoch": 0.6258326905546595, + "grad_norm": 7.054596424102783, + "learning_rate": 4.396628965445597e-05, + "loss": 1.8221, + "step": 17850 + }, + { + "epoch": 0.6267092069279854, + "grad_norm": 3.856940269470215, + "learning_rate": 4.395005778545365e-05, + "loss": 1.6654, + "step": 17875 + }, + { + "epoch": 0.6275857233013112, + "grad_norm": 5.934255123138428, + "learning_rate": 4.393382591645133e-05, + "loss": 1.8049, + "step": 17900 + }, + { + "epoch": 0.6284622396746371, + "grad_norm": 6.989076137542725, + "learning_rate": 4.3917594047449e-05, + "loss": 1.697, + "step": 17925 + }, + { + "epoch": 0.629338756047963, + "grad_norm": 9.441452026367188, + "learning_rate": 4.3901362178446676e-05, + "loss": 1.5671, + "step": 17950 + }, + { + "epoch": 0.6302152724212888, + "grad_norm": 3.9008734226226807, + "learning_rate": 4.388513030944435e-05, + "loss": 1.7979, + "step": 17975 + }, + { + "epoch": 0.6310917887946147, + "grad_norm": 6.344481945037842, + "learning_rate": 4.386889844044203e-05, + "loss": 1.832, + "step": 18000 + }, + { + "epoch": 0.6319683051679406, + "grad_norm": 6.707412242889404, + "learning_rate": 4.385266657143971e-05, + "loss": 1.7573, + "step": 18025 + }, + { + "epoch": 0.6328448215412664, + "grad_norm": 4.940411567687988, + "learning_rate": 4.383643470243738e-05, + "loss": 1.7582, + "step": 18050 + }, + { + "epoch": 0.6337213379145923, + "grad_norm": 11.431363105773926, + "learning_rate": 4.3820202833435056e-05, + "loss": 1.72, + "step": 18075 + }, + { + "epoch": 0.6345978542879182, + "grad_norm": 6.1672844886779785, + "learning_rate": 4.380397096443273e-05, + "loss": 
1.7633, + "step": 18100 + }, + { + "epoch": 0.6354743706612439, + "grad_norm": 7.293442249298096, + "learning_rate": 4.3787739095430405e-05, + "loss": 1.6204, + "step": 18125 + }, + { + "epoch": 0.6363508870345698, + "grad_norm": 4.068375587463379, + "learning_rate": 4.377150722642808e-05, + "loss": 1.567, + "step": 18150 + }, + { + "epoch": 0.6372274034078956, + "grad_norm": 5.548999309539795, + "learning_rate": 4.3755275357425755e-05, + "loss": 1.8088, + "step": 18175 + }, + { + "epoch": 0.6381039197812215, + "grad_norm": 10.798421859741211, + "learning_rate": 4.373904348842343e-05, + "loss": 1.6528, + "step": 18200 + }, + { + "epoch": 0.6389804361545474, + "grad_norm": 3.632856607437134, + "learning_rate": 4.372281161942111e-05, + "loss": 1.9538, + "step": 18225 + }, + { + "epoch": 0.6398569525278732, + "grad_norm": 7.594532012939453, + "learning_rate": 4.3706579750418786e-05, + "loss": 1.7253, + "step": 18250 + }, + { + "epoch": 0.6407334689011991, + "grad_norm": 5.100186824798584, + "learning_rate": 4.369034788141646e-05, + "loss": 1.554, + "step": 18275 + }, + { + "epoch": 0.641609985274525, + "grad_norm": 4.151604175567627, + "learning_rate": 4.3674116012414135e-05, + "loss": 1.7545, + "step": 18300 + }, + { + "epoch": 0.6424865016478508, + "grad_norm": 6.535691261291504, + "learning_rate": 4.365788414341181e-05, + "loss": 1.7752, + "step": 18325 + }, + { + "epoch": 0.6433630180211767, + "grad_norm": 6.807957649230957, + "learning_rate": 4.364165227440949e-05, + "loss": 1.7609, + "step": 18350 + }, + { + "epoch": 0.6442395343945025, + "grad_norm": 5.976528167724609, + "learning_rate": 4.3625420405407166e-05, + "loss": 1.6624, + "step": 18375 + }, + { + "epoch": 0.6451160507678283, + "grad_norm": 10.592754364013672, + "learning_rate": 4.360918853640484e-05, + "loss": 1.6919, + "step": 18400 + }, + { + "epoch": 0.6459925671411542, + "grad_norm": 7.778469562530518, + "learning_rate": 4.3592956667402515e-05, + "loss": 1.7372, + "step": 18425 + }, + { + "epoch": 0.64686908351448, + "grad_norm": 12.807841300964355, + "learning_rate": 4.357672479840019e-05, + "loss": 1.7301, + "step": 18450 + }, + { + "epoch": 0.6477455998878059, + "grad_norm": 4.226706027984619, + "learning_rate": 4.3560492929397864e-05, + "loss": 1.7812, + "step": 18475 + }, + { + "epoch": 0.6486221162611318, + "grad_norm": 10.613983154296875, + "learning_rate": 4.354426106039554e-05, + "loss": 1.7824, + "step": 18500 + }, + { + "epoch": 0.6494986326344576, + "grad_norm": 4.369169235229492, + "learning_rate": 4.352802919139321e-05, + "loss": 1.6458, + "step": 18525 + }, + { + "epoch": 0.6503751490077835, + "grad_norm": 12.1525239944458, + "learning_rate": 4.351179732239089e-05, + "loss": 1.7787, + "step": 18550 + }, + { + "epoch": 0.6512516653811093, + "grad_norm": 7.556457042694092, + "learning_rate": 4.349556545338856e-05, + "loss": 1.6488, + "step": 18575 + }, + { + "epoch": 0.6521281817544352, + "grad_norm": 6.435143947601318, + "learning_rate": 4.3479333584386244e-05, + "loss": 1.8258, + "step": 18600 + }, + { + "epoch": 0.6530046981277611, + "grad_norm": 6.633813381195068, + "learning_rate": 4.346310171538392e-05, + "loss": 1.7239, + "step": 18625 + }, + { + "epoch": 0.6538812145010868, + "grad_norm": 8.040837287902832, + "learning_rate": 4.344686984638159e-05, + "loss": 1.6519, + "step": 18650 + }, + { + "epoch": 0.6547577308744127, + "grad_norm": 9.946712493896484, + "learning_rate": 4.343063797737927e-05, + "loss": 1.6875, + "step": 18675 + }, + { + "epoch": 0.6556342472477386, + "grad_norm": 7.115630626678467, 
+ "learning_rate": 4.341440610837694e-05, + "loss": 1.7565, + "step": 18700 + }, + { + "epoch": 0.6565107636210644, + "grad_norm": 6.3737568855285645, + "learning_rate": 4.3398174239374624e-05, + "loss": 1.7623, + "step": 18725 + }, + { + "epoch": 0.6573872799943903, + "grad_norm": 5.428982734680176, + "learning_rate": 4.33819423703723e-05, + "loss": 1.7648, + "step": 18750 + }, + { + "epoch": 0.6582637963677161, + "grad_norm": 6.730842113494873, + "learning_rate": 4.3365710501369973e-05, + "loss": 1.7908, + "step": 18775 + }, + { + "epoch": 0.659140312741042, + "grad_norm": 5.0466179847717285, + "learning_rate": 4.334947863236765e-05, + "loss": 1.7386, + "step": 18800 + }, + { + "epoch": 0.6600168291143679, + "grad_norm": 5.247219085693359, + "learning_rate": 4.333324676336532e-05, + "loss": 1.7344, + "step": 18825 + }, + { + "epoch": 0.6608933454876937, + "grad_norm": 4.591024398803711, + "learning_rate": 4.3317014894363e-05, + "loss": 1.5881, + "step": 18850 + }, + { + "epoch": 0.6617698618610196, + "grad_norm": 5.586825370788574, + "learning_rate": 4.330078302536067e-05, + "loss": 1.705, + "step": 18875 + }, + { + "epoch": 0.6626463782343455, + "grad_norm": 4.16548490524292, + "learning_rate": 4.328455115635835e-05, + "loss": 1.7746, + "step": 18900 + }, + { + "epoch": 0.6635228946076712, + "grad_norm": 11.44693660736084, + "learning_rate": 4.326831928735602e-05, + "loss": 1.919, + "step": 18925 + }, + { + "epoch": 0.6643994109809971, + "grad_norm": 5.815780162811279, + "learning_rate": 4.3252087418353696e-05, + "loss": 1.7288, + "step": 18950 + }, + { + "epoch": 0.665275927354323, + "grad_norm": 7.706365585327148, + "learning_rate": 4.323585554935138e-05, + "loss": 1.72, + "step": 18975 + }, + { + "epoch": 0.6661524437276488, + "grad_norm": 4.451009750366211, + "learning_rate": 4.321962368034905e-05, + "loss": 1.7754, + "step": 19000 + }, + { + "epoch": 0.6670289601009747, + "grad_norm": 7.924132823944092, + "learning_rate": 4.320339181134673e-05, + "loss": 1.7946, + "step": 19025 + }, + { + "epoch": 0.6679054764743005, + "grad_norm": 7.483951568603516, + "learning_rate": 4.31871599423444e-05, + "loss": 1.7719, + "step": 19050 + }, + { + "epoch": 0.6687819928476264, + "grad_norm": 5.420973777770996, + "learning_rate": 4.3170928073342076e-05, + "loss": 1.7183, + "step": 19075 + }, + { + "epoch": 0.6696585092209523, + "grad_norm": 3.946235179901123, + "learning_rate": 4.315469620433976e-05, + "loss": 1.6074, + "step": 19100 + }, + { + "epoch": 0.6705350255942781, + "grad_norm": 6.9826436042785645, + "learning_rate": 4.313846433533743e-05, + "loss": 1.74, + "step": 19125 + }, + { + "epoch": 0.671411541967604, + "grad_norm": 7.382415771484375, + "learning_rate": 4.312223246633511e-05, + "loss": 1.6607, + "step": 19150 + }, + { + "epoch": 0.6722880583409299, + "grad_norm": 6.221233367919922, + "learning_rate": 4.310600059733278e-05, + "loss": 1.8361, + "step": 19175 + }, + { + "epoch": 0.6731645747142556, + "grad_norm": 5.855953216552734, + "learning_rate": 4.308976872833046e-05, + "loss": 1.7511, + "step": 19200 + }, + { + "epoch": 0.6740410910875815, + "grad_norm": 4.484837532043457, + "learning_rate": 4.307353685932814e-05, + "loss": 1.6067, + "step": 19225 + }, + { + "epoch": 0.6749176074609073, + "grad_norm": 16.838096618652344, + "learning_rate": 4.3057304990325805e-05, + "loss": 1.7091, + "step": 19250 + }, + { + "epoch": 0.6757941238342332, + "grad_norm": 9.35261058807373, + "learning_rate": 4.304107312132348e-05, + "loss": 1.7382, + "step": 19275 + }, + { + "epoch": 
0.6766706402075591, + "grad_norm": 7.000728607177734, + "learning_rate": 4.3024841252321155e-05, + "loss": 1.7985, + "step": 19300 + }, + { + "epoch": 0.6775471565808849, + "grad_norm": 5.836925029754639, + "learning_rate": 4.3008609383318836e-05, + "loss": 1.8331, + "step": 19325 + }, + { + "epoch": 0.6784236729542108, + "grad_norm": 7.4478044509887695, + "learning_rate": 4.299237751431651e-05, + "loss": 1.8826, + "step": 19350 + }, + { + "epoch": 0.6793001893275367, + "grad_norm": 11.254827499389648, + "learning_rate": 4.2976145645314185e-05, + "loss": 1.7073, + "step": 19375 + }, + { + "epoch": 0.6801767057008625, + "grad_norm": 4.601868152618408, + "learning_rate": 4.295991377631186e-05, + "loss": 1.617, + "step": 19400 + }, + { + "epoch": 0.6810532220741884, + "grad_norm": 6.572701930999756, + "learning_rate": 4.2943681907309535e-05, + "loss": 1.7161, + "step": 19425 + }, + { + "epoch": 0.6819297384475141, + "grad_norm": 10.980940818786621, + "learning_rate": 4.2927450038307216e-05, + "loss": 1.7892, + "step": 19450 + }, + { + "epoch": 0.68280625482084, + "grad_norm": 5.6751556396484375, + "learning_rate": 4.291121816930489e-05, + "loss": 1.8079, + "step": 19475 + }, + { + "epoch": 0.6836827711941659, + "grad_norm": 4.236637115478516, + "learning_rate": 4.2894986300302565e-05, + "loss": 1.676, + "step": 19500 + }, + { + "epoch": 0.6845592875674917, + "grad_norm": 7.291186809539795, + "learning_rate": 4.287875443130024e-05, + "loss": 1.6425, + "step": 19525 + }, + { + "epoch": 0.6854358039408176, + "grad_norm": 5.5853071212768555, + "learning_rate": 4.2862522562297915e-05, + "loss": 1.9036, + "step": 19550 + }, + { + "epoch": 0.6863123203141435, + "grad_norm": 7.140960693359375, + "learning_rate": 4.2846290693295596e-05, + "loss": 1.6359, + "step": 19575 + }, + { + "epoch": 0.6871888366874693, + "grad_norm": 7.743915557861328, + "learning_rate": 4.283005882429327e-05, + "loss": 1.8102, + "step": 19600 + }, + { + "epoch": 0.6880653530607952, + "grad_norm": 4.859267234802246, + "learning_rate": 4.2813826955290945e-05, + "loss": 1.6274, + "step": 19625 + }, + { + "epoch": 0.6889418694341211, + "grad_norm": 7.44967794418335, + "learning_rate": 4.279759508628862e-05, + "loss": 1.7928, + "step": 19650 + }, + { + "epoch": 0.6898183858074469, + "grad_norm": 6.269384384155273, + "learning_rate": 4.278136321728629e-05, + "loss": 1.7393, + "step": 19675 + }, + { + "epoch": 0.6906949021807728, + "grad_norm": 7.331720352172852, + "learning_rate": 4.276513134828397e-05, + "loss": 1.6419, + "step": 19700 + }, + { + "epoch": 0.6915714185540985, + "grad_norm": 6.2278289794921875, + "learning_rate": 4.2748899479281644e-05, + "loss": 1.7463, + "step": 19725 + }, + { + "epoch": 0.6924479349274244, + "grad_norm": 7.479672431945801, + "learning_rate": 4.273266761027932e-05, + "loss": 1.7973, + "step": 19750 + }, + { + "epoch": 0.6933244513007503, + "grad_norm": 5.915968418121338, + "learning_rate": 4.271643574127699e-05, + "loss": 1.7913, + "step": 19775 + }, + { + "epoch": 0.6942009676740761, + "grad_norm": 5.5712738037109375, + "learning_rate": 4.270020387227467e-05, + "loss": 1.59, + "step": 19800 + }, + { + "epoch": 0.695077484047402, + "grad_norm": 6.120835304260254, + "learning_rate": 4.268397200327235e-05, + "loss": 1.7413, + "step": 19825 + }, + { + "epoch": 0.6959540004207279, + "grad_norm": 6.860760688781738, + "learning_rate": 4.2667740134270024e-05, + "loss": 1.9318, + "step": 19850 + }, + { + "epoch": 0.6968305167940537, + "grad_norm": 5.246811389923096, + "learning_rate": 4.26515082652677e-05, 
+ "loss": 1.699, + "step": 19875 + }, + { + "epoch": 0.6977070331673796, + "grad_norm": 4.009196758270264, + "learning_rate": 4.263527639626537e-05, + "loss": 1.7433, + "step": 19900 + }, + { + "epoch": 0.6985835495407055, + "grad_norm": 7.8316779136657715, + "learning_rate": 4.261904452726305e-05, + "loss": 1.8383, + "step": 19925 + }, + { + "epoch": 0.6994600659140313, + "grad_norm": 3.893549680709839, + "learning_rate": 4.260281265826073e-05, + "loss": 1.6662, + "step": 19950 + }, + { + "epoch": 0.7003365822873572, + "grad_norm": 6.709279537200928, + "learning_rate": 4.2586580789258404e-05, + "loss": 1.769, + "step": 19975 + }, + { + "epoch": 0.7012130986606829, + "grad_norm": 7.843245506286621, + "learning_rate": 4.257034892025608e-05, + "loss": 1.7916, + "step": 20000 + }, + { + "epoch": 0.7020896150340088, + "grad_norm": 5.607316970825195, + "learning_rate": 4.255411705125375e-05, + "loss": 1.7728, + "step": 20025 + }, + { + "epoch": 0.7029661314073347, + "grad_norm": 5.706740856170654, + "learning_rate": 4.253788518225143e-05, + "loss": 1.6467, + "step": 20050 + }, + { + "epoch": 0.7038426477806605, + "grad_norm": 5.685728073120117, + "learning_rate": 4.25216533132491e-05, + "loss": 1.8098, + "step": 20075 + }, + { + "epoch": 0.7047191641539864, + "grad_norm": 6.995488166809082, + "learning_rate": 4.250542144424678e-05, + "loss": 1.7416, + "step": 20100 + }, + { + "epoch": 0.7055956805273123, + "grad_norm": 4.5367536544799805, + "learning_rate": 4.248918957524445e-05, + "loss": 1.7156, + "step": 20125 + }, + { + "epoch": 0.7064721969006381, + "grad_norm": 7.05874490737915, + "learning_rate": 4.2472957706242126e-05, + "loss": 1.7512, + "step": 20150 + }, + { + "epoch": 0.707348713273964, + "grad_norm": 5.468486785888672, + "learning_rate": 4.24567258372398e-05, + "loss": 1.8021, + "step": 20175 + }, + { + "epoch": 0.7082252296472898, + "grad_norm": 5.993373870849609, + "learning_rate": 4.244049396823748e-05, + "loss": 1.9221, + "step": 20200 + }, + { + "epoch": 0.7091017460206157, + "grad_norm": 5.789038181304932, + "learning_rate": 4.242426209923516e-05, + "loss": 1.7136, + "step": 20225 + }, + { + "epoch": 0.7099782623939416, + "grad_norm": 5.740288734436035, + "learning_rate": 4.240803023023283e-05, + "loss": 1.7435, + "step": 20250 + }, + { + "epoch": 0.7108547787672673, + "grad_norm": 6.291687488555908, + "learning_rate": 4.2391798361230506e-05, + "loss": 1.8127, + "step": 20275 + }, + { + "epoch": 0.7117312951405932, + "grad_norm": 7.5563459396362305, + "learning_rate": 4.237556649222819e-05, + "loss": 1.7582, + "step": 20300 + }, + { + "epoch": 0.712607811513919, + "grad_norm": 6.041136264801025, + "learning_rate": 4.235933462322586e-05, + "loss": 1.7449, + "step": 20325 + }, + { + "epoch": 0.7134843278872449, + "grad_norm": 3.424799919128418, + "learning_rate": 4.234310275422354e-05, + "loss": 1.6464, + "step": 20350 + }, + { + "epoch": 0.7143608442605708, + "grad_norm": 10.520652770996094, + "learning_rate": 4.232687088522121e-05, + "loss": 1.7125, + "step": 20375 + }, + { + "epoch": 0.7152373606338966, + "grad_norm": 3.8908426761627197, + "learning_rate": 4.2310639016218886e-05, + "loss": 1.8317, + "step": 20400 + }, + { + "epoch": 0.7161138770072225, + "grad_norm": 4.765612602233887, + "learning_rate": 4.229440714721656e-05, + "loss": 1.8033, + "step": 20425 + }, + { + "epoch": 0.7169903933805484, + "grad_norm": 6.563026428222656, + "learning_rate": 4.2278175278214236e-05, + "loss": 1.6358, + "step": 20450 + }, + { + "epoch": 0.7178669097538742, + "grad_norm": 
5.47117805480957, + "learning_rate": 4.226194340921191e-05, + "loss": 1.658, + "step": 20475 + }, + { + "epoch": 0.7187434261272001, + "grad_norm": 4.793690204620361, + "learning_rate": 4.2245711540209585e-05, + "loss": 1.7826, + "step": 20500 + }, + { + "epoch": 0.7196199425005259, + "grad_norm": 4.5695600509643555, + "learning_rate": 4.222947967120726e-05, + "loss": 1.8256, + "step": 20525 + }, + { + "epoch": 0.7204964588738517, + "grad_norm": 6.47720193862915, + "learning_rate": 4.221324780220494e-05, + "loss": 1.7959, + "step": 20550 + }, + { + "epoch": 0.7213729752471776, + "grad_norm": 7.561789035797119, + "learning_rate": 4.2197015933202616e-05, + "loss": 1.6868, + "step": 20575 + }, + { + "epoch": 0.7222494916205034, + "grad_norm": 6.490933418273926, + "learning_rate": 4.218078406420029e-05, + "loss": 1.5462, + "step": 20600 + }, + { + "epoch": 0.7231260079938293, + "grad_norm": 4.751588821411133, + "learning_rate": 4.2164552195197965e-05, + "loss": 1.6286, + "step": 20625 + }, + { + "epoch": 0.7240025243671552, + "grad_norm": 5.148170471191406, + "learning_rate": 4.214832032619564e-05, + "loss": 1.8162, + "step": 20650 + }, + { + "epoch": 0.724879040740481, + "grad_norm": 4.241532802581787, + "learning_rate": 4.213208845719332e-05, + "loss": 1.793, + "step": 20675 + }, + { + "epoch": 0.7257555571138069, + "grad_norm": 9.680534362792969, + "learning_rate": 4.2115856588190996e-05, + "loss": 1.7737, + "step": 20700 + }, + { + "epoch": 0.7266320734871328, + "grad_norm": 5.100157260894775, + "learning_rate": 4.209962471918867e-05, + "loss": 1.689, + "step": 20725 + }, + { + "epoch": 0.7275085898604586, + "grad_norm": 5.228930950164795, + "learning_rate": 4.2083392850186345e-05, + "loss": 1.745, + "step": 20750 + }, + { + "epoch": 0.7283851062337845, + "grad_norm": 6.12069034576416, + "learning_rate": 4.206716098118402e-05, + "loss": 1.5425, + "step": 20775 + }, + { + "epoch": 0.7292616226071102, + "grad_norm": 5.3806586265563965, + "learning_rate": 4.2050929112181694e-05, + "loss": 1.6376, + "step": 20800 + }, + { + "epoch": 0.7301381389804361, + "grad_norm": 6.37677526473999, + "learning_rate": 4.203469724317937e-05, + "loss": 1.5472, + "step": 20825 + }, + { + "epoch": 0.731014655353762, + "grad_norm": 4.826788902282715, + "learning_rate": 4.2018465374177044e-05, + "loss": 1.8928, + "step": 20850 + }, + { + "epoch": 0.7318911717270878, + "grad_norm": 7.522418022155762, + "learning_rate": 4.200223350517472e-05, + "loss": 1.6903, + "step": 20875 + }, + { + "epoch": 0.7327676881004137, + "grad_norm": 11.619901657104492, + "learning_rate": 4.198600163617239e-05, + "loss": 1.9648, + "step": 20900 + }, + { + "epoch": 0.7336442044737396, + "grad_norm": 6.424306869506836, + "learning_rate": 4.1969769767170074e-05, + "loss": 1.6818, + "step": 20925 + }, + { + "epoch": 0.7345207208470654, + "grad_norm": 4.196927070617676, + "learning_rate": 4.195353789816775e-05, + "loss": 1.7506, + "step": 20950 + }, + { + "epoch": 0.7353972372203913, + "grad_norm": 9.285497665405273, + "learning_rate": 4.1937306029165424e-05, + "loss": 1.5119, + "step": 20975 + }, + { + "epoch": 0.7362737535937172, + "grad_norm": 3.7124249935150146, + "learning_rate": 4.19210741601631e-05, + "loss": 1.6679, + "step": 21000 + }, + { + "epoch": 0.737150269967043, + "grad_norm": 6.31929349899292, + "learning_rate": 4.190484229116077e-05, + "loss": 1.67, + "step": 21025 + }, + { + "epoch": 0.7380267863403689, + "grad_norm": 8.293059349060059, + "learning_rate": 4.1888610422158454e-05, + "loss": 1.82, + "step": 21050 + }, + { + 
"epoch": 0.7389033027136946, + "grad_norm": 6.662185192108154, + "learning_rate": 4.187237855315613e-05, + "loss": 1.6307, + "step": 21075 + }, + { + "epoch": 0.7397798190870205, + "grad_norm": 6.579734802246094, + "learning_rate": 4.1856146684153804e-05, + "loss": 1.7937, + "step": 21100 + }, + { + "epoch": 0.7406563354603464, + "grad_norm": 3.9326236248016357, + "learning_rate": 4.183991481515148e-05, + "loss": 1.5955, + "step": 21125 + }, + { + "epoch": 0.7415328518336722, + "grad_norm": 3.7121353149414062, + "learning_rate": 4.182368294614915e-05, + "loss": 1.6575, + "step": 21150 + }, + { + "epoch": 0.7424093682069981, + "grad_norm": 5.851113796234131, + "learning_rate": 4.180745107714683e-05, + "loss": 1.8399, + "step": 21175 + }, + { + "epoch": 0.743285884580324, + "grad_norm": 7.718535423278809, + "learning_rate": 4.17912192081445e-05, + "loss": 1.652, + "step": 21200 + }, + { + "epoch": 0.7441624009536498, + "grad_norm": 5.026419162750244, + "learning_rate": 4.177498733914218e-05, + "loss": 1.7191, + "step": 21225 + }, + { + "epoch": 0.7450389173269757, + "grad_norm": 3.967162609100342, + "learning_rate": 4.175875547013985e-05, + "loss": 1.6766, + "step": 21250 + }, + { + "epoch": 0.7459154337003016, + "grad_norm": 3.7013943195343018, + "learning_rate": 4.1742523601137526e-05, + "loss": 1.6532, + "step": 21275 + }, + { + "epoch": 0.7467919500736274, + "grad_norm": 6.533852577209473, + "learning_rate": 4.172629173213521e-05, + "loss": 1.7392, + "step": 21300 + }, + { + "epoch": 0.7476684664469533, + "grad_norm": 6.4201860427856445, + "learning_rate": 4.171005986313288e-05, + "loss": 1.767, + "step": 21325 + }, + { + "epoch": 0.748544982820279, + "grad_norm": 8.887445449829102, + "learning_rate": 4.169382799413056e-05, + "loss": 1.7162, + "step": 21350 + }, + { + "epoch": 0.7494214991936049, + "grad_norm": 4.656317710876465, + "learning_rate": 4.167759612512823e-05, + "loss": 1.5871, + "step": 21375 + }, + { + "epoch": 0.7502980155669308, + "grad_norm": 5.203045845031738, + "learning_rate": 4.166136425612591e-05, + "loss": 1.6938, + "step": 21400 + }, + { + "epoch": 0.7511745319402566, + "grad_norm": 6.296996593475342, + "learning_rate": 4.164513238712359e-05, + "loss": 1.8663, + "step": 21425 + }, + { + "epoch": 0.7520510483135825, + "grad_norm": 6.166327953338623, + "learning_rate": 4.162890051812126e-05, + "loss": 1.5665, + "step": 21450 + }, + { + "epoch": 0.7529275646869084, + "grad_norm": 5.769898891448975, + "learning_rate": 4.161266864911894e-05, + "loss": 1.5678, + "step": 21475 + }, + { + "epoch": 0.7538040810602342, + "grad_norm": 4.6604323387146, + "learning_rate": 4.159643678011661e-05, + "loss": 1.5042, + "step": 21500 + }, + { + "epoch": 0.7546805974335601, + "grad_norm": 5.1181960105896, + "learning_rate": 4.158020491111429e-05, + "loss": 1.8821, + "step": 21525 + }, + { + "epoch": 0.755557113806886, + "grad_norm": 10.204445838928223, + "learning_rate": 4.156397304211197e-05, + "loss": 1.6523, + "step": 21550 + }, + { + "epoch": 0.7564336301802118, + "grad_norm": 4.8304243087768555, + "learning_rate": 4.154774117310964e-05, + "loss": 1.6879, + "step": 21575 + }, + { + "epoch": 0.7573101465535376, + "grad_norm": 9.886143684387207, + "learning_rate": 4.153150930410731e-05, + "loss": 1.8337, + "step": 21600 + }, + { + "epoch": 0.7581866629268634, + "grad_norm": 7.994892597198486, + "learning_rate": 4.1515277435104985e-05, + "loss": 1.7282, + "step": 21625 + }, + { + "epoch": 0.7590631793001893, + "grad_norm": 3.663795232772827, + "learning_rate": 4.1499045566102666e-05, 
+ "loss": 1.6892, + "step": 21650 + }, + { + "epoch": 0.7599396956735152, + "grad_norm": 7.410410404205322, + "learning_rate": 4.148281369710034e-05, + "loss": 1.7226, + "step": 21675 + }, + { + "epoch": 0.760816212046841, + "grad_norm": 4.400084495544434, + "learning_rate": 4.1466581828098015e-05, + "loss": 1.6181, + "step": 21700 + }, + { + "epoch": 0.7616927284201669, + "grad_norm": 6.210215091705322, + "learning_rate": 4.145034995909569e-05, + "loss": 1.8128, + "step": 21725 + }, + { + "epoch": 0.7625692447934928, + "grad_norm": 5.65915584564209, + "learning_rate": 4.1434118090093365e-05, + "loss": 1.605, + "step": 21750 + }, + { + "epoch": 0.7634457611668186, + "grad_norm": 8.544163703918457, + "learning_rate": 4.1417886221091046e-05, + "loss": 1.7894, + "step": 21775 + }, + { + "epoch": 0.7643222775401445, + "grad_norm": 7.550364017486572, + "learning_rate": 4.140165435208872e-05, + "loss": 1.6386, + "step": 21800 + }, + { + "epoch": 0.7651987939134703, + "grad_norm": 6.033892631530762, + "learning_rate": 4.1385422483086395e-05, + "loss": 1.768, + "step": 21825 + }, + { + "epoch": 0.7660753102867962, + "grad_norm": 3.708367347717285, + "learning_rate": 4.136919061408407e-05, + "loss": 1.7507, + "step": 21850 + }, + { + "epoch": 0.766951826660122, + "grad_norm": 10.949930191040039, + "learning_rate": 4.1352958745081745e-05, + "loss": 1.5663, + "step": 21875 + }, + { + "epoch": 0.7678283430334478, + "grad_norm": 4.670827865600586, + "learning_rate": 4.1336726876079426e-05, + "loss": 1.7385, + "step": 21900 + }, + { + "epoch": 0.7687048594067737, + "grad_norm": 6.072994232177734, + "learning_rate": 4.13204950070771e-05, + "loss": 1.7353, + "step": 21925 + }, + { + "epoch": 0.7695813757800996, + "grad_norm": 7.770570755004883, + "learning_rate": 4.1304263138074775e-05, + "loss": 1.6107, + "step": 21950 + }, + { + "epoch": 0.7704578921534254, + "grad_norm": 10.8436918258667, + "learning_rate": 4.128803126907245e-05, + "loss": 1.8628, + "step": 21975 + }, + { + "epoch": 0.7713344085267513, + "grad_norm": 4.822438716888428, + "learning_rate": 4.1271799400070125e-05, + "loss": 1.6113, + "step": 22000 + }, + { + "epoch": 0.7722109249000771, + "grad_norm": 4.031903266906738, + "learning_rate": 4.12555675310678e-05, + "loss": 1.6433, + "step": 22025 + }, + { + "epoch": 0.773087441273403, + "grad_norm": 4.47920036315918, + "learning_rate": 4.1239335662065474e-05, + "loss": 1.67, + "step": 22050 + }, + { + "epoch": 0.7739639576467289, + "grad_norm": 5.633161544799805, + "learning_rate": 4.122310379306315e-05, + "loss": 1.7328, + "step": 22075 + }, + { + "epoch": 0.7748404740200547, + "grad_norm": 11.776056289672852, + "learning_rate": 4.120687192406082e-05, + "loss": 1.6073, + "step": 22100 + }, + { + "epoch": 0.7757169903933806, + "grad_norm": 4.781505584716797, + "learning_rate": 4.11906400550585e-05, + "loss": 1.6775, + "step": 22125 + }, + { + "epoch": 0.7765935067667064, + "grad_norm": 6.47687292098999, + "learning_rate": 4.117440818605618e-05, + "loss": 1.7925, + "step": 22150 + }, + { + "epoch": 0.7774700231400322, + "grad_norm": 5.653767108917236, + "learning_rate": 4.1158176317053854e-05, + "loss": 1.7733, + "step": 22175 + }, + { + "epoch": 0.7783465395133581, + "grad_norm": 13.190276145935059, + "learning_rate": 4.114194444805153e-05, + "loss": 1.8518, + "step": 22200 + }, + { + "epoch": 0.779223055886684, + "grad_norm": 8.045233726501465, + "learning_rate": 4.11257125790492e-05, + "loss": 1.6204, + "step": 22225 + }, + { + "epoch": 0.7800995722600098, + "grad_norm": 5.701691627502441, 
+ "learning_rate": 4.110948071004688e-05, + "loss": 1.845, + "step": 22250 + }, + { + "epoch": 0.7809760886333357, + "grad_norm": 5.760019779205322, + "learning_rate": 4.109324884104456e-05, + "loss": 1.6749, + "step": 22275 + }, + { + "epoch": 0.7818526050066615, + "grad_norm": 4.158820629119873, + "learning_rate": 4.1077016972042234e-05, + "loss": 1.5801, + "step": 22300 + }, + { + "epoch": 0.7827291213799874, + "grad_norm": 8.946185111999512, + "learning_rate": 4.106078510303991e-05, + "loss": 1.6659, + "step": 22325 + }, + { + "epoch": 0.7836056377533133, + "grad_norm": 11.368963241577148, + "learning_rate": 4.104455323403758e-05, + "loss": 1.6465, + "step": 22350 + }, + { + "epoch": 0.7844821541266391, + "grad_norm": 3.9160091876983643, + "learning_rate": 4.102832136503526e-05, + "loss": 1.7213, + "step": 22375 + }, + { + "epoch": 0.785358670499965, + "grad_norm": 3.65844988822937, + "learning_rate": 4.101208949603293e-05, + "loss": 1.7337, + "step": 22400 + }, + { + "epoch": 0.7862351868732907, + "grad_norm": 3.5482757091522217, + "learning_rate": 4.099585762703061e-05, + "loss": 1.6608, + "step": 22425 + }, + { + "epoch": 0.7871117032466166, + "grad_norm": 9.861228942871094, + "learning_rate": 4.097962575802828e-05, + "loss": 1.6978, + "step": 22450 + }, + { + "epoch": 0.7879882196199425, + "grad_norm": 4.560858726501465, + "learning_rate": 4.0963393889025956e-05, + "loss": 1.5343, + "step": 22475 + }, + { + "epoch": 0.7888647359932683, + "grad_norm": 6.0870795249938965, + "learning_rate": 4.094716202002364e-05, + "loss": 1.7077, + "step": 22500 + }, + { + "epoch": 0.7897412523665942, + "grad_norm": 8.08203411102295, + "learning_rate": 4.093093015102131e-05, + "loss": 1.6147, + "step": 22525 + }, + { + "epoch": 0.7906177687399201, + "grad_norm": 8.86173152923584, + "learning_rate": 4.091469828201899e-05, + "loss": 1.842, + "step": 22550 + }, + { + "epoch": 0.7914942851132459, + "grad_norm": 4.728313446044922, + "learning_rate": 4.089846641301666e-05, + "loss": 1.7031, + "step": 22575 + }, + { + "epoch": 0.7923708014865718, + "grad_norm": 4.312060356140137, + "learning_rate": 4.0882234544014336e-05, + "loss": 1.8647, + "step": 22600 + }, + { + "epoch": 0.7932473178598977, + "grad_norm": 3.3879785537719727, + "learning_rate": 4.086600267501202e-05, + "loss": 1.6167, + "step": 22625 + }, + { + "epoch": 0.7941238342332235, + "grad_norm": 3.9501001834869385, + "learning_rate": 4.084977080600969e-05, + "loss": 1.6691, + "step": 22650 + }, + { + "epoch": 0.7950003506065493, + "grad_norm": 4.937401294708252, + "learning_rate": 4.083353893700737e-05, + "loss": 1.8961, + "step": 22675 + }, + { + "epoch": 0.7958768669798751, + "grad_norm": 3.9664812088012695, + "learning_rate": 4.081730706800504e-05, + "loss": 1.6968, + "step": 22700 + }, + { + "epoch": 0.796753383353201, + "grad_norm": 4.068183898925781, + "learning_rate": 4.0801075199002717e-05, + "loss": 1.6924, + "step": 22725 + }, + { + "epoch": 0.7976298997265269, + "grad_norm": 5.394825458526611, + "learning_rate": 4.078484333000039e-05, + "loss": 1.6902, + "step": 22750 + }, + { + "epoch": 0.7985064160998527, + "grad_norm": 3.5094492435455322, + "learning_rate": 4.0768611460998066e-05, + "loss": 1.6971, + "step": 22775 + }, + { + "epoch": 0.7993829324731786, + "grad_norm": 7.298615455627441, + "learning_rate": 4.075237959199574e-05, + "loss": 1.7962, + "step": 22800 + }, + { + "epoch": 0.8002594488465045, + "grad_norm": 5.069077491760254, + "learning_rate": 4.0736147722993415e-05, + "loss": 1.7277, + "step": 22825 + }, + { + "epoch": 
0.8011359652198303, + "grad_norm": 4.073123931884766, + "learning_rate": 4.071991585399109e-05, + "loss": 1.5991, + "step": 22850 + }, + { + "epoch": 0.8020124815931562, + "grad_norm": 9.686578750610352, + "learning_rate": 4.070368398498877e-05, + "loss": 1.7117, + "step": 22875 + }, + { + "epoch": 0.802888997966482, + "grad_norm": 12.435944557189941, + "learning_rate": 4.0687452115986446e-05, + "loss": 1.769, + "step": 22900 + }, + { + "epoch": 0.8037655143398079, + "grad_norm": 4.414602756500244, + "learning_rate": 4.067122024698412e-05, + "loss": 1.6903, + "step": 22925 + }, + { + "epoch": 0.8046420307131337, + "grad_norm": 3.6309690475463867, + "learning_rate": 4.0654988377981795e-05, + "loss": 1.7635, + "step": 22950 + }, + { + "epoch": 0.8055185470864595, + "grad_norm": 5.075233459472656, + "learning_rate": 4.063875650897947e-05, + "loss": 1.616, + "step": 22975 + }, + { + "epoch": 0.8063950634597854, + "grad_norm": 4.076478004455566, + "learning_rate": 4.062252463997715e-05, + "loss": 1.7091, + "step": 23000 + }, + { + "epoch": 0.8072715798331113, + "grad_norm": 5.1065993309021, + "learning_rate": 4.0606292770974826e-05, + "loss": 1.8342, + "step": 23025 + }, + { + "epoch": 0.8081480962064371, + "grad_norm": 5.416407108306885, + "learning_rate": 4.05900609019725e-05, + "loss": 1.7586, + "step": 23050 + }, + { + "epoch": 0.809024612579763, + "grad_norm": 6.292265892028809, + "learning_rate": 4.0573829032970175e-05, + "loss": 1.5827, + "step": 23075 + }, + { + "epoch": 0.8099011289530889, + "grad_norm": 5.885228633880615, + "learning_rate": 4.055759716396785e-05, + "loss": 1.7654, + "step": 23100 + }, + { + "epoch": 0.8107776453264147, + "grad_norm": 8.876052856445312, + "learning_rate": 4.0541365294965524e-05, + "loss": 1.6244, + "step": 23125 + }, + { + "epoch": 0.8116541616997406, + "grad_norm": 7.380987644195557, + "learning_rate": 4.05251334259632e-05, + "loss": 1.7457, + "step": 23150 + }, + { + "epoch": 0.8125306780730664, + "grad_norm": 6.264214992523193, + "learning_rate": 4.0508901556960874e-05, + "loss": 1.7989, + "step": 23175 + }, + { + "epoch": 0.8134071944463923, + "grad_norm": 6.163475513458252, + "learning_rate": 4.049266968795855e-05, + "loss": 1.5388, + "step": 23200 + }, + { + "epoch": 0.8142837108197181, + "grad_norm": 6.278437614440918, + "learning_rate": 4.047643781895622e-05, + "loss": 1.8411, + "step": 23225 + }, + { + "epoch": 0.8151602271930439, + "grad_norm": 5.7069292068481445, + "learning_rate": 4.0460205949953904e-05, + "loss": 1.7087, + "step": 23250 + }, + { + "epoch": 0.8160367435663698, + "grad_norm": 5.672524452209473, + "learning_rate": 4.044397408095158e-05, + "loss": 1.6362, + "step": 23275 + }, + { + "epoch": 0.8169132599396957, + "grad_norm": 10.261502265930176, + "learning_rate": 4.0427742211949254e-05, + "loss": 1.7477, + "step": 23300 + }, + { + "epoch": 0.8177897763130215, + "grad_norm": 3.6466095447540283, + "learning_rate": 4.041151034294693e-05, + "loss": 1.6552, + "step": 23325 + }, + { + "epoch": 0.8186662926863474, + "grad_norm": 8.191649436950684, + "learning_rate": 4.03952784739446e-05, + "loss": 1.6656, + "step": 23350 + }, + { + "epoch": 0.8195428090596732, + "grad_norm": 5.6616010665893555, + "learning_rate": 4.0379046604942284e-05, + "loss": 1.636, + "step": 23375 + }, + { + "epoch": 0.8204193254329991, + "grad_norm": 5.754661560058594, + "learning_rate": 4.036281473593996e-05, + "loss": 1.6573, + "step": 23400 + }, + { + "epoch": 0.821295841806325, + "grad_norm": 6.4935688972473145, + "learning_rate": 4.0346582866937634e-05, + 
"loss": 1.6714, + "step": 23425 + }, + { + "epoch": 0.8221723581796508, + "grad_norm": 6.690324306488037, + "learning_rate": 4.033035099793531e-05, + "loss": 1.6292, + "step": 23450 + }, + { + "epoch": 0.8230488745529766, + "grad_norm": 4.199691295623779, + "learning_rate": 4.031411912893298e-05, + "loss": 1.7887, + "step": 23475 + }, + { + "epoch": 0.8239253909263025, + "grad_norm": 5.8543620109558105, + "learning_rate": 4.0297887259930664e-05, + "loss": 1.6381, + "step": 23500 + }, + { + "epoch": 0.8248019072996283, + "grad_norm": 5.246114253997803, + "learning_rate": 4.028165539092833e-05, + "loss": 1.8085, + "step": 23525 + }, + { + "epoch": 0.8256784236729542, + "grad_norm": 11.904260635375977, + "learning_rate": 4.026542352192601e-05, + "loss": 1.7441, + "step": 23550 + }, + { + "epoch": 0.82655494004628, + "grad_norm": 5.129090785980225, + "learning_rate": 4.024919165292368e-05, + "loss": 1.7711, + "step": 23575 + }, + { + "epoch": 0.8274314564196059, + "grad_norm": 6.554096221923828, + "learning_rate": 4.0232959783921356e-05, + "loss": 1.7544, + "step": 23600 + }, + { + "epoch": 0.8283079727929318, + "grad_norm": 3.182779312133789, + "learning_rate": 4.021672791491904e-05, + "loss": 1.6184, + "step": 23625 + }, + { + "epoch": 0.8291844891662576, + "grad_norm": 5.287519931793213, + "learning_rate": 4.020049604591671e-05, + "loss": 1.6867, + "step": 23650 + }, + { + "epoch": 0.8300610055395835, + "grad_norm": 4.40645170211792, + "learning_rate": 4.018426417691439e-05, + "loss": 1.8465, + "step": 23675 + }, + { + "epoch": 0.8309375219129094, + "grad_norm": 6.097541332244873, + "learning_rate": 4.016803230791206e-05, + "loss": 1.7283, + "step": 23700 + }, + { + "epoch": 0.8318140382862352, + "grad_norm": 6.722564697265625, + "learning_rate": 4.015180043890974e-05, + "loss": 1.6697, + "step": 23725 + }, + { + "epoch": 0.832690554659561, + "grad_norm": 11.033297538757324, + "learning_rate": 4.013556856990742e-05, + "loss": 1.6686, + "step": 23750 + }, + { + "epoch": 0.8335670710328869, + "grad_norm": 6.548190116882324, + "learning_rate": 4.011933670090509e-05, + "loss": 1.8156, + "step": 23775 + }, + { + "epoch": 0.8344435874062127, + "grad_norm": 6.099876403808594, + "learning_rate": 4.010310483190277e-05, + "loss": 1.7125, + "step": 23800 + }, + { + "epoch": 0.8353201037795386, + "grad_norm": 7.616811752319336, + "learning_rate": 4.008687296290044e-05, + "loss": 1.5136, + "step": 23825 + }, + { + "epoch": 0.8361966201528644, + "grad_norm": 3.4832310676574707, + "learning_rate": 4.007064109389812e-05, + "loss": 1.7756, + "step": 23850 + }, + { + "epoch": 0.8370731365261903, + "grad_norm": 3.3611369132995605, + "learning_rate": 4.00544092248958e-05, + "loss": 1.6058, + "step": 23875 + }, + { + "epoch": 0.8379496528995162, + "grad_norm": 3.0637598037719727, + "learning_rate": 4.003817735589347e-05, + "loss": 1.5916, + "step": 23900 + }, + { + "epoch": 0.838826169272842, + "grad_norm": 6.4877824783325195, + "learning_rate": 4.002194548689115e-05, + "loss": 1.6964, + "step": 23925 + }, + { + "epoch": 0.8397026856461679, + "grad_norm": 8.140599250793457, + "learning_rate": 4.0005713617888815e-05, + "loss": 1.5617, + "step": 23950 + }, + { + "epoch": 0.8405792020194938, + "grad_norm": 5.877359867095947, + "learning_rate": 3.9989481748886496e-05, + "loss": 1.85, + "step": 23975 + }, + { + "epoch": 0.8414557183928196, + "grad_norm": 5.481163501739502, + "learning_rate": 3.997324987988417e-05, + "loss": 1.6054, + "step": 24000 + }, + { + "epoch": 0.8423322347661454, + "grad_norm": 
5.000059604644775, + "learning_rate": 3.9957018010881846e-05, + "loss": 1.8101, + "step": 24025 + }, + { + "epoch": 0.8432087511394712, + "grad_norm": 5.197421550750732, + "learning_rate": 3.994078614187952e-05, + "loss": 1.8717, + "step": 24050 + }, + { + "epoch": 0.8440852675127971, + "grad_norm": 3.572495937347412, + "learning_rate": 3.9924554272877195e-05, + "loss": 1.8099, + "step": 24075 + }, + { + "epoch": 0.844961783886123, + "grad_norm": 6.445613861083984, + "learning_rate": 3.9908322403874876e-05, + "loss": 1.6258, + "step": 24100 + }, + { + "epoch": 0.8458383002594488, + "grad_norm": 5.585651397705078, + "learning_rate": 3.989209053487255e-05, + "loss": 1.7903, + "step": 24125 + }, + { + "epoch": 0.8467148166327747, + "grad_norm": 4.320014476776123, + "learning_rate": 3.9875858665870226e-05, + "loss": 1.839, + "step": 24150 + }, + { + "epoch": 0.8475913330061006, + "grad_norm": 3.598759412765503, + "learning_rate": 3.98596267968679e-05, + "loss": 1.6144, + "step": 24175 + }, + { + "epoch": 0.8484678493794264, + "grad_norm": 8.348057746887207, + "learning_rate": 3.9843394927865575e-05, + "loss": 1.6101, + "step": 24200 + }, + { + "epoch": 0.8493443657527523, + "grad_norm": 5.679026126861572, + "learning_rate": 3.9827163058863256e-05, + "loss": 1.8812, + "step": 24225 + }, + { + "epoch": 0.8502208821260782, + "grad_norm": 6.0741167068481445, + "learning_rate": 3.981093118986093e-05, + "loss": 1.6718, + "step": 24250 + }, + { + "epoch": 0.851097398499404, + "grad_norm": 4.0966644287109375, + "learning_rate": 3.9794699320858606e-05, + "loss": 1.5113, + "step": 24275 + }, + { + "epoch": 0.8519739148727298, + "grad_norm": 5.315481185913086, + "learning_rate": 3.977846745185628e-05, + "loss": 1.4744, + "step": 24300 + }, + { + "epoch": 0.8528504312460556, + "grad_norm": 4.046975135803223, + "learning_rate": 3.9762235582853955e-05, + "loss": 1.8346, + "step": 24325 + }, + { + "epoch": 0.8537269476193815, + "grad_norm": 4.235684871673584, + "learning_rate": 3.974600371385163e-05, + "loss": 1.515, + "step": 24350 + }, + { + "epoch": 0.8546034639927074, + "grad_norm": 6.774339199066162, + "learning_rate": 3.9729771844849304e-05, + "loss": 1.6111, + "step": 24375 + }, + { + "epoch": 0.8554799803660332, + "grad_norm": 6.070276260375977, + "learning_rate": 3.971353997584698e-05, + "loss": 1.675, + "step": 24400 + }, + { + "epoch": 0.8563564967393591, + "grad_norm": 5.62911319732666, + "learning_rate": 3.9697308106844653e-05, + "loss": 1.5753, + "step": 24425 + }, + { + "epoch": 0.857233013112685, + "grad_norm": 7.983538627624512, + "learning_rate": 3.968107623784233e-05, + "loss": 1.7055, + "step": 24450 + }, + { + "epoch": 0.8581095294860108, + "grad_norm": 5.610392093658447, + "learning_rate": 3.966484436884001e-05, + "loss": 2.037, + "step": 24475 + }, + { + "epoch": 0.8589860458593367, + "grad_norm": 4.11611270904541, + "learning_rate": 3.9648612499837684e-05, + "loss": 1.8026, + "step": 24500 + }, + { + "epoch": 0.8598625622326626, + "grad_norm": 7.9117231369018555, + "learning_rate": 3.963238063083536e-05, + "loss": 1.7502, + "step": 24525 + }, + { + "epoch": 0.8607390786059883, + "grad_norm": 4.177938938140869, + "learning_rate": 3.9616148761833033e-05, + "loss": 1.7778, + "step": 24550 + }, + { + "epoch": 0.8616155949793142, + "grad_norm": 3.3162825107574463, + "learning_rate": 3.959991689283071e-05, + "loss": 1.796, + "step": 24575 + }, + { + "epoch": 0.86249211135264, + "grad_norm": 6.337653160095215, + "learning_rate": 3.958368502382839e-05, + "loss": 1.619, + "step": 24600 + }, + 
{ + "epoch": 0.8633686277259659, + "grad_norm": 3.294485330581665, + "learning_rate": 3.9567453154826064e-05, + "loss": 1.772, + "step": 24625 + }, + { + "epoch": 0.8642451440992918, + "grad_norm": 12.015644073486328, + "learning_rate": 3.955122128582374e-05, + "loss": 1.6118, + "step": 24650 + }, + { + "epoch": 0.8651216604726176, + "grad_norm": 4.222651481628418, + "learning_rate": 3.9534989416821413e-05, + "loss": 1.7351, + "step": 24675 + }, + { + "epoch": 0.8659981768459435, + "grad_norm": 3.402078628540039, + "learning_rate": 3.951875754781909e-05, + "loss": 1.7946, + "step": 24700 + }, + { + "epoch": 0.8668746932192694, + "grad_norm": 11.347024917602539, + "learning_rate": 3.950252567881676e-05, + "loss": 1.8362, + "step": 24725 + }, + { + "epoch": 0.8677512095925952, + "grad_norm": 8.869746208190918, + "learning_rate": 3.948629380981444e-05, + "loss": 1.6991, + "step": 24750 + }, + { + "epoch": 0.8686277259659211, + "grad_norm": 11.733623504638672, + "learning_rate": 3.947006194081211e-05, + "loss": 1.6778, + "step": 24775 + }, + { + "epoch": 0.869504242339247, + "grad_norm": 6.4805145263671875, + "learning_rate": 3.945383007180979e-05, + "loss": 1.7548, + "step": 24800 + }, + { + "epoch": 0.8703807587125727, + "grad_norm": 5.961811065673828, + "learning_rate": 3.943759820280747e-05, + "loss": 1.7202, + "step": 24825 + }, + { + "epoch": 0.8712572750858986, + "grad_norm": 8.28621768951416, + "learning_rate": 3.942136633380514e-05, + "loss": 1.8271, + "step": 24850 + }, + { + "epoch": 0.8721337914592244, + "grad_norm": 8.547704696655273, + "learning_rate": 3.940513446480282e-05, + "loss": 1.5539, + "step": 24875 + }, + { + "epoch": 0.8730103078325503, + "grad_norm": 11.140161514282227, + "learning_rate": 3.938890259580049e-05, + "loss": 1.8055, + "step": 24900 + }, + { + "epoch": 0.8738868242058762, + "grad_norm": 4.1092987060546875, + "learning_rate": 3.937267072679817e-05, + "loss": 1.6724, + "step": 24925 + }, + { + "epoch": 0.874763340579202, + "grad_norm": 6.631948471069336, + "learning_rate": 3.935643885779585e-05, + "loss": 1.6115, + "step": 24950 + }, + { + "epoch": 0.8756398569525279, + "grad_norm": 5.139019966125488, + "learning_rate": 3.934020698879352e-05, + "loss": 1.4256, + "step": 24975 + }, + { + "epoch": 0.8765163733258537, + "grad_norm": 11.836568832397461, + "learning_rate": 3.93239751197912e-05, + "loss": 1.6693, + "step": 25000 + }, + { + "epoch": 0.8773928896991796, + "grad_norm": 3.3977880477905273, + "learning_rate": 3.930774325078887e-05, + "loss": 1.7354, + "step": 25025 + }, + { + "epoch": 0.8782694060725055, + "grad_norm": 5.082038402557373, + "learning_rate": 3.929151138178655e-05, + "loss": 1.8697, + "step": 25050 + }, + { + "epoch": 0.8791459224458313, + "grad_norm": 13.530316352844238, + "learning_rate": 3.927527951278422e-05, + "loss": 1.6598, + "step": 25075 + }, + { + "epoch": 0.8800224388191571, + "grad_norm": 4.111960411071777, + "learning_rate": 3.9259047643781896e-05, + "loss": 1.6237, + "step": 25100 + }, + { + "epoch": 0.880898955192483, + "grad_norm": 9.188414573669434, + "learning_rate": 3.924281577477957e-05, + "loss": 1.7045, + "step": 25125 + }, + { + "epoch": 0.8817754715658088, + "grad_norm": 4.251001358032227, + "learning_rate": 3.9226583905777245e-05, + "loss": 1.6431, + "step": 25150 + }, + { + "epoch": 0.8826519879391347, + "grad_norm": 9.804367065429688, + "learning_rate": 3.921035203677492e-05, + "loss": 1.8317, + "step": 25175 + }, + { + "epoch": 0.8835285043124605, + "grad_norm": 5.246570587158203, + "learning_rate": 
3.91941201677726e-05, + "loss": 1.6705, + "step": 25200 + }, + { + "epoch": 0.8844050206857864, + "grad_norm": 8.049766540527344, + "learning_rate": 3.9177888298770276e-05, + "loss": 1.6402, + "step": 25225 + }, + { + "epoch": 0.8852815370591123, + "grad_norm": 6.163084030151367, + "learning_rate": 3.916165642976795e-05, + "loss": 1.8092, + "step": 25250 + }, + { + "epoch": 0.8861580534324381, + "grad_norm": 11.213810920715332, + "learning_rate": 3.9145424560765625e-05, + "loss": 1.6166, + "step": 25275 + }, + { + "epoch": 0.887034569805764, + "grad_norm": 5.172678470611572, + "learning_rate": 3.91291926917633e-05, + "loss": 1.6832, + "step": 25300 + }, + { + "epoch": 0.8879110861790899, + "grad_norm": 4.120918273925781, + "learning_rate": 3.911296082276098e-05, + "loss": 1.8276, + "step": 25325 + }, + { + "epoch": 0.8887876025524157, + "grad_norm": 4.993307113647461, + "learning_rate": 3.9096728953758656e-05, + "loss": 1.7246, + "step": 25350 + }, + { + "epoch": 0.8896641189257415, + "grad_norm": 6.242663383483887, + "learning_rate": 3.908049708475633e-05, + "loss": 1.6609, + "step": 25375 + }, + { + "epoch": 0.8905406352990674, + "grad_norm": 7.1374030113220215, + "learning_rate": 3.9064265215754005e-05, + "loss": 1.6956, + "step": 25400 + }, + { + "epoch": 0.8914171516723932, + "grad_norm": 4.73626708984375, + "learning_rate": 3.904803334675168e-05, + "loss": 1.6741, + "step": 25425 + }, + { + "epoch": 0.8922936680457191, + "grad_norm": 5.495104789733887, + "learning_rate": 3.9031801477749355e-05, + "loss": 1.5751, + "step": 25450 + }, + { + "epoch": 0.8931701844190449, + "grad_norm": 3.4081802368164062, + "learning_rate": 3.901556960874703e-05, + "loss": 1.6877, + "step": 25475 + }, + { + "epoch": 0.8940467007923708, + "grad_norm": 5.610198974609375, + "learning_rate": 3.8999337739744704e-05, + "loss": 1.7978, + "step": 25500 + }, + { + "epoch": 0.8949232171656967, + "grad_norm": 3.583150625228882, + "learning_rate": 3.898310587074238e-05, + "loss": 1.7314, + "step": 25525 + }, + { + "epoch": 0.8957997335390225, + "grad_norm": 7.185346603393555, + "learning_rate": 3.896687400174005e-05, + "loss": 1.8497, + "step": 25550 + }, + { + "epoch": 0.8966762499123484, + "grad_norm": 4.677765369415283, + "learning_rate": 3.8950642132737735e-05, + "loss": 1.8665, + "step": 25575 + }, + { + "epoch": 0.8975527662856743, + "grad_norm": 3.7469446659088135, + "learning_rate": 3.893441026373541e-05, + "loss": 1.6843, + "step": 25600 + }, + { + "epoch": 0.898429282659, + "grad_norm": 7.992246627807617, + "learning_rate": 3.8918178394733084e-05, + "loss": 1.7642, + "step": 25625 + }, + { + "epoch": 0.8993057990323259, + "grad_norm": 3.8764231204986572, + "learning_rate": 3.890194652573076e-05, + "loss": 1.648, + "step": 25650 + }, + { + "epoch": 0.9001823154056517, + "grad_norm": 5.613622665405273, + "learning_rate": 3.888571465672843e-05, + "loss": 1.6324, + "step": 25675 + }, + { + "epoch": 0.9010588317789776, + "grad_norm": 7.483119487762451, + "learning_rate": 3.8869482787726115e-05, + "loss": 1.6302, + "step": 25700 + }, + { + "epoch": 0.9019353481523035, + "grad_norm": 5.353825092315674, + "learning_rate": 3.885325091872379e-05, + "loss": 1.8761, + "step": 25725 + }, + { + "epoch": 0.9028118645256293, + "grad_norm": 3.843681812286377, + "learning_rate": 3.8837019049721464e-05, + "loss": 1.6097, + "step": 25750 + }, + { + "epoch": 0.9036883808989552, + "grad_norm": 3.434091567993164, + "learning_rate": 3.882078718071914e-05, + "loss": 1.7949, + "step": 25775 + }, + { + "epoch": 0.9045648972722811, + 
"grad_norm": 3.8939411640167236, + "learning_rate": 3.880455531171681e-05, + "loss": 1.635, + "step": 25800 + }, + { + "epoch": 0.9054414136456069, + "grad_norm": 4.176934719085693, + "learning_rate": 3.8788323442714495e-05, + "loss": 1.5806, + "step": 25825 + }, + { + "epoch": 0.9063179300189328, + "grad_norm": 6.137138843536377, + "learning_rate": 3.877209157371217e-05, + "loss": 1.6245, + "step": 25850 + }, + { + "epoch": 0.9071944463922587, + "grad_norm": 4.446413040161133, + "learning_rate": 3.875585970470984e-05, + "loss": 1.8094, + "step": 25875 + }, + { + "epoch": 0.9080709627655844, + "grad_norm": 6.142608165740967, + "learning_rate": 3.873962783570751e-05, + "loss": 1.6006, + "step": 25900 + }, + { + "epoch": 0.9089474791389103, + "grad_norm": 6.71065092086792, + "learning_rate": 3.872339596670519e-05, + "loss": 1.7642, + "step": 25925 + }, + { + "epoch": 0.9098239955122361, + "grad_norm": 7.3341569900512695, + "learning_rate": 3.870716409770287e-05, + "loss": 1.6329, + "step": 25950 + }, + { + "epoch": 0.910700511885562, + "grad_norm": 5.601141452789307, + "learning_rate": 3.869093222870054e-05, + "loss": 1.897, + "step": 25975 + }, + { + "epoch": 0.9115770282588879, + "grad_norm": 6.154690742492676, + "learning_rate": 3.867470035969822e-05, + "loss": 1.6611, + "step": 26000 + }, + { + "epoch": 0.9124535446322137, + "grad_norm": 10.088637351989746, + "learning_rate": 3.865846849069589e-05, + "loss": 1.8263, + "step": 26025 + }, + { + "epoch": 0.9133300610055396, + "grad_norm": 6.172771453857422, + "learning_rate": 3.864223662169357e-05, + "loss": 1.8327, + "step": 26050 + }, + { + "epoch": 0.9142065773788655, + "grad_norm": 8.892468452453613, + "learning_rate": 3.862600475269125e-05, + "loss": 1.769, + "step": 26075 + }, + { + "epoch": 0.9150830937521913, + "grad_norm": 5.162864685058594, + "learning_rate": 3.860977288368892e-05, + "loss": 1.7529, + "step": 26100 + }, + { + "epoch": 0.9159596101255172, + "grad_norm": 5.840184211730957, + "learning_rate": 3.85935410146866e-05, + "loss": 1.7316, + "step": 26125 + }, + { + "epoch": 0.916836126498843, + "grad_norm": 4.895603179931641, + "learning_rate": 3.857730914568427e-05, + "loss": 1.6979, + "step": 26150 + }, + { + "epoch": 0.9177126428721688, + "grad_norm": 4.862699031829834, + "learning_rate": 3.856107727668195e-05, + "loss": 1.7594, + "step": 26175 + }, + { + "epoch": 0.9185891592454947, + "grad_norm": 10.821337699890137, + "learning_rate": 3.854484540767963e-05, + "loss": 1.7482, + "step": 26200 + }, + { + "epoch": 0.9194656756188205, + "grad_norm": 5.728857040405273, + "learning_rate": 3.85286135386773e-05, + "loss": 1.6495, + "step": 26225 + }, + { + "epoch": 0.9203421919921464, + "grad_norm": 10.813159942626953, + "learning_rate": 3.851238166967498e-05, + "loss": 1.6796, + "step": 26250 + }, + { + "epoch": 0.9212187083654723, + "grad_norm": 4.14129114151001, + "learning_rate": 3.849614980067265e-05, + "loss": 1.7122, + "step": 26275 + }, + { + "epoch": 0.9220952247387981, + "grad_norm": 3.8782107830047607, + "learning_rate": 3.8479917931670326e-05, + "loss": 1.618, + "step": 26300 + }, + { + "epoch": 0.922971741112124, + "grad_norm": 4.153103351593018, + "learning_rate": 3.8463686062668e-05, + "loss": 1.5587, + "step": 26325 + }, + { + "epoch": 0.9238482574854499, + "grad_norm": 4.528087139129639, + "learning_rate": 3.8447454193665676e-05, + "loss": 1.8929, + "step": 26350 + }, + { + "epoch": 0.9247247738587757, + "grad_norm": 7.691710948944092, + "learning_rate": 3.843122232466335e-05, + "loss": 1.7466, + "step": 26375 
+ }, + { + "epoch": 0.9256012902321016, + "grad_norm": 3.6844239234924316, + "learning_rate": 3.8414990455661025e-05, + "loss": 1.6805, + "step": 26400 + }, + { + "epoch": 0.9264778066054273, + "grad_norm": 5.356090068817139, + "learning_rate": 3.8398758586658706e-05, + "loss": 1.7968, + "step": 26425 + }, + { + "epoch": 0.9273543229787532, + "grad_norm": 5.705104351043701, + "learning_rate": 3.838252671765638e-05, + "loss": 1.6759, + "step": 26450 + }, + { + "epoch": 0.9282308393520791, + "grad_norm": 5.192763805389404, + "learning_rate": 3.8366294848654056e-05, + "loss": 1.691, + "step": 26475 + }, + { + "epoch": 0.9291073557254049, + "grad_norm": 5.05523157119751, + "learning_rate": 3.835006297965173e-05, + "loss": 1.8017, + "step": 26500 + }, + { + "epoch": 0.9299838720987308, + "grad_norm": 5.506353855133057, + "learning_rate": 3.8333831110649405e-05, + "loss": 1.6705, + "step": 26525 + }, + { + "epoch": 0.9308603884720567, + "grad_norm": 9.477849006652832, + "learning_rate": 3.8317599241647086e-05, + "loss": 1.9036, + "step": 26550 + }, + { + "epoch": 0.9317369048453825, + "grad_norm": 4.572815895080566, + "learning_rate": 3.830136737264476e-05, + "loss": 1.7842, + "step": 26575 + }, + { + "epoch": 0.9326134212187084, + "grad_norm": 5.646363258361816, + "learning_rate": 3.8285135503642436e-05, + "loss": 1.7733, + "step": 26600 + }, + { + "epoch": 0.9334899375920342, + "grad_norm": 4.405515670776367, + "learning_rate": 3.826890363464011e-05, + "loss": 1.7845, + "step": 26625 + }, + { + "epoch": 0.9343664539653601, + "grad_norm": 3.5077264308929443, + "learning_rate": 3.8252671765637785e-05, + "loss": 1.7319, + "step": 26650 + }, + { + "epoch": 0.935242970338686, + "grad_norm": 3.5029592514038086, + "learning_rate": 3.823643989663546e-05, + "loss": 1.6691, + "step": 26675 + }, + { + "epoch": 0.9361194867120117, + "grad_norm": 3.7130258083343506, + "learning_rate": 3.8220208027633134e-05, + "loss": 1.6843, + "step": 26700 + }, + { + "epoch": 0.9369960030853376, + "grad_norm": 5.1545891761779785, + "learning_rate": 3.820397615863081e-05, + "loss": 1.7591, + "step": 26725 + }, + { + "epoch": 0.9378725194586635, + "grad_norm": 5.694048881530762, + "learning_rate": 3.8187744289628484e-05, + "loss": 1.7401, + "step": 26750 + }, + { + "epoch": 0.9387490358319893, + "grad_norm": 4.995056629180908, + "learning_rate": 3.817151242062616e-05, + "loss": 1.7325, + "step": 26775 + }, + { + "epoch": 0.9396255522053152, + "grad_norm": 6.251009464263916, + "learning_rate": 3.815528055162384e-05, + "loss": 1.7418, + "step": 26800 + }, + { + "epoch": 0.940502068578641, + "grad_norm": 11.969161987304688, + "learning_rate": 3.8139048682621514e-05, + "loss": 1.6358, + "step": 26825 + }, + { + "epoch": 0.9413785849519669, + "grad_norm": 3.8903982639312744, + "learning_rate": 3.812281681361919e-05, + "loss": 1.4328, + "step": 26850 + }, + { + "epoch": 0.9422551013252928, + "grad_norm": 4.499454975128174, + "learning_rate": 3.8106584944616864e-05, + "loss": 1.6736, + "step": 26875 + }, + { + "epoch": 0.9431316176986186, + "grad_norm": 5.656023979187012, + "learning_rate": 3.809035307561454e-05, + "loss": 1.721, + "step": 26900 + }, + { + "epoch": 0.9440081340719445, + "grad_norm": 3.6959307193756104, + "learning_rate": 3.807412120661222e-05, + "loss": 1.6514, + "step": 26925 + }, + { + "epoch": 0.9448846504452704, + "grad_norm": 16.425174713134766, + "learning_rate": 3.8057889337609894e-05, + "loss": 1.5938, + "step": 26950 + }, + { + "epoch": 0.9457611668185961, + "grad_norm": 4.096930027008057, + 
"learning_rate": 3.804165746860757e-05, + "loss": 1.8624, + "step": 26975 + }, + { + "epoch": 0.946637683191922, + "grad_norm": 6.247106075286865, + "learning_rate": 3.8025425599605244e-05, + "loss": 1.7425, + "step": 27000 + }, + { + "epoch": 0.9475141995652478, + "grad_norm": 3.154416799545288, + "learning_rate": 3.800919373060292e-05, + "loss": 1.8109, + "step": 27025 + }, + { + "epoch": 0.9483907159385737, + "grad_norm": 3.2379212379455566, + "learning_rate": 3.799296186160059e-05, + "loss": 1.5155, + "step": 27050 + }, + { + "epoch": 0.9492672323118996, + "grad_norm": 4.131470680236816, + "learning_rate": 3.797672999259827e-05, + "loss": 1.6899, + "step": 27075 + }, + { + "epoch": 0.9501437486852254, + "grad_norm": 8.208736419677734, + "learning_rate": 3.796049812359594e-05, + "loss": 1.6251, + "step": 27100 + }, + { + "epoch": 0.9510202650585513, + "grad_norm": 5.311698913574219, + "learning_rate": 3.794426625459362e-05, + "loss": 1.8844, + "step": 27125 + }, + { + "epoch": 0.9518967814318772, + "grad_norm": 3.720669746398926, + "learning_rate": 3.79280343855913e-05, + "loss": 1.818, + "step": 27150 + }, + { + "epoch": 0.952773297805203, + "grad_norm": 4.133859157562256, + "learning_rate": 3.791180251658897e-05, + "loss": 1.5844, + "step": 27175 + }, + { + "epoch": 0.9536498141785289, + "grad_norm": 3.9013166427612305, + "learning_rate": 3.789557064758665e-05, + "loss": 1.956, + "step": 27200 + }, + { + "epoch": 0.9545263305518548, + "grad_norm": 4.4850873947143555, + "learning_rate": 3.787933877858432e-05, + "loss": 1.6774, + "step": 27225 + }, + { + "epoch": 0.9554028469251805, + "grad_norm": 6.449410915374756, + "learning_rate": 3.7863106909582e-05, + "loss": 1.7861, + "step": 27250 + }, + { + "epoch": 0.9562793632985064, + "grad_norm": 4.815369606018066, + "learning_rate": 3.784687504057968e-05, + "loss": 1.6417, + "step": 27275 + }, + { + "epoch": 0.9571558796718322, + "grad_norm": 3.214726686477661, + "learning_rate": 3.783064317157735e-05, + "loss": 1.7851, + "step": 27300 + }, + { + "epoch": 0.9580323960451581, + "grad_norm": 9.263312339782715, + "learning_rate": 3.781441130257503e-05, + "loss": 1.4841, + "step": 27325 + }, + { + "epoch": 0.958908912418484, + "grad_norm": 3.6658241748809814, + "learning_rate": 3.77981794335727e-05, + "loss": 1.932, + "step": 27350 + }, + { + "epoch": 0.9597854287918098, + "grad_norm": 4.093572616577148, + "learning_rate": 3.778194756457038e-05, + "loss": 1.8645, + "step": 27375 + }, + { + "epoch": 0.9606619451651357, + "grad_norm": 4.013779640197754, + "learning_rate": 3.776571569556805e-05, + "loss": 1.6717, + "step": 27400 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 4.258903980255127, + "learning_rate": 3.7749483826565726e-05, + "loss": 1.659, + "step": 27425 + }, + { + "epoch": 0.9624149779117874, + "grad_norm": 3.4645369052886963, + "learning_rate": 3.77332519575634e-05, + "loss": 1.4382, + "step": 27450 + }, + { + "epoch": 0.9632914942851133, + "grad_norm": 6.789366722106934, + "learning_rate": 3.7717020088561075e-05, + "loss": 1.8874, + "step": 27475 + }, + { + "epoch": 0.964168010658439, + "grad_norm": 3.8764424324035645, + "learning_rate": 3.770078821955875e-05, + "loss": 1.6542, + "step": 27500 + }, + { + "epoch": 0.9650445270317649, + "grad_norm": 10.170833587646484, + "learning_rate": 3.768455635055643e-05, + "loss": 1.9087, + "step": 27525 + }, + { + "epoch": 0.9659210434050908, + "grad_norm": 4.972094535827637, + "learning_rate": 3.7668324481554106e-05, + "loss": 1.6298, + "step": 27550 + }, + { + "epoch": 
0.9667975597784166, + "grad_norm": 5.272711277008057, + "learning_rate": 3.765209261255178e-05, + "loss": 1.7195, + "step": 27575 + }, + { + "epoch": 0.9676740761517425, + "grad_norm": 7.9466423988342285, + "learning_rate": 3.7635860743549455e-05, + "loss": 1.6217, + "step": 27600 + }, + { + "epoch": 0.9685505925250684, + "grad_norm": 4.728545665740967, + "learning_rate": 3.761962887454713e-05, + "loss": 1.7341, + "step": 27625 + }, + { + "epoch": 0.9694271088983942, + "grad_norm": 7.282612323760986, + "learning_rate": 3.760339700554481e-05, + "loss": 1.6776, + "step": 27650 + }, + { + "epoch": 0.9703036252717201, + "grad_norm": 4.635673999786377, + "learning_rate": 3.7587165136542486e-05, + "loss": 1.5534, + "step": 27675 + }, + { + "epoch": 0.971180141645046, + "grad_norm": 5.475913047790527, + "learning_rate": 3.757093326754016e-05, + "loss": 1.5635, + "step": 27700 + }, + { + "epoch": 0.9720566580183718, + "grad_norm": 6.214550018310547, + "learning_rate": 3.7554701398537835e-05, + "loss": 1.8633, + "step": 27725 + }, + { + "epoch": 0.9729331743916977, + "grad_norm": 5.960138320922852, + "learning_rate": 3.753846952953551e-05, + "loss": 1.6179, + "step": 27750 + }, + { + "epoch": 0.9738096907650234, + "grad_norm": 3.297053575515747, + "learning_rate": 3.752223766053319e-05, + "loss": 1.6428, + "step": 27775 + }, + { + "epoch": 0.9746862071383493, + "grad_norm": 8.335491180419922, + "learning_rate": 3.750600579153086e-05, + "loss": 1.7857, + "step": 27800 + }, + { + "epoch": 0.9755627235116752, + "grad_norm": 6.567666530609131, + "learning_rate": 3.7489773922528534e-05, + "loss": 1.9142, + "step": 27825 + }, + { + "epoch": 0.976439239885001, + "grad_norm": 7.466176509857178, + "learning_rate": 3.747354205352621e-05, + "loss": 1.7099, + "step": 27850 + }, + { + "epoch": 0.9773157562583269, + "grad_norm": 5.165148735046387, + "learning_rate": 3.745731018452388e-05, + "loss": 1.7446, + "step": 27875 + }, + { + "epoch": 0.9781922726316528, + "grad_norm": 6.873377799987793, + "learning_rate": 3.7441078315521565e-05, + "loss": 1.7814, + "step": 27900 + }, + { + "epoch": 0.9790687890049786, + "grad_norm": 3.175889730453491, + "learning_rate": 3.742484644651924e-05, + "loss": 1.495, + "step": 27925 + }, + { + "epoch": 0.9799453053783045, + "grad_norm": 7.226979732513428, + "learning_rate": 3.7408614577516914e-05, + "loss": 1.8832, + "step": 27950 + }, + { + "epoch": 0.9808218217516304, + "grad_norm": 4.444784164428711, + "learning_rate": 3.739238270851459e-05, + "loss": 1.6692, + "step": 27975 + }, + { + "epoch": 0.9816983381249562, + "grad_norm": 8.872031211853027, + "learning_rate": 3.737615083951226e-05, + "loss": 1.8261, + "step": 28000 + }, + { + "epoch": 0.9825748544982821, + "grad_norm": 5.315586566925049, + "learning_rate": 3.7359918970509945e-05, + "loss": 1.5923, + "step": 28025 + }, + { + "epoch": 0.9834513708716078, + "grad_norm": 5.07174015045166, + "learning_rate": 3.734368710150762e-05, + "loss": 1.7588, + "step": 28050 + }, + { + "epoch": 0.9843278872449337, + "grad_norm": 9.916199684143066, + "learning_rate": 3.7327455232505294e-05, + "loss": 1.8017, + "step": 28075 + }, + { + "epoch": 0.9852044036182596, + "grad_norm": 7.86973762512207, + "learning_rate": 3.731122336350297e-05, + "loss": 1.7693, + "step": 28100 + }, + { + "epoch": 0.9860809199915854, + "grad_norm": 5.744624614715576, + "learning_rate": 3.729499149450065e-05, + "loss": 1.6599, + "step": 28125 + }, + { + "epoch": 0.9869574363649113, + "grad_norm": 11.120006561279297, + "learning_rate": 3.7278759625498325e-05, + 
"loss": 1.6398, + "step": 28150 + }, + { + "epoch": 0.9878339527382372, + "grad_norm": 8.170663833618164, + "learning_rate": 3.7262527756496e-05, + "loss": 1.7629, + "step": 28175 + }, + { + "epoch": 0.988710469111563, + "grad_norm": 7.6870269775390625, + "learning_rate": 3.7246295887493674e-05, + "loss": 1.6981, + "step": 28200 + }, + { + "epoch": 0.9895869854848889, + "grad_norm": 13.430487632751465, + "learning_rate": 3.723006401849134e-05, + "loss": 1.6462, + "step": 28225 + }, + { + "epoch": 0.9904635018582147, + "grad_norm": 10.077984809875488, + "learning_rate": 3.721383214948902e-05, + "loss": 1.6344, + "step": 28250 + }, + { + "epoch": 0.9913400182315406, + "grad_norm": 4.5883708000183105, + "learning_rate": 3.71976002804867e-05, + "loss": 1.7009, + "step": 28275 + }, + { + "epoch": 0.9922165346048665, + "grad_norm": 4.229771614074707, + "learning_rate": 3.718136841148437e-05, + "loss": 1.8118, + "step": 28300 + }, + { + "epoch": 0.9930930509781922, + "grad_norm": 7.726940631866455, + "learning_rate": 3.716513654248205e-05, + "loss": 1.7501, + "step": 28325 + }, + { + "epoch": 0.9939695673515181, + "grad_norm": 5.5130486488342285, + "learning_rate": 3.714890467347972e-05, + "loss": 1.6985, + "step": 28350 + }, + { + "epoch": 0.994846083724844, + "grad_norm": 5.712647438049316, + "learning_rate": 3.71326728044774e-05, + "loss": 1.676, + "step": 28375 + }, + { + "epoch": 0.9957226000981698, + "grad_norm": 5.912980556488037, + "learning_rate": 3.711644093547508e-05, + "loss": 1.5149, + "step": 28400 + }, + { + "epoch": 0.9965991164714957, + "grad_norm": 6.120546340942383, + "learning_rate": 3.710020906647275e-05, + "loss": 1.5728, + "step": 28425 + }, + { + "epoch": 0.9974756328448215, + "grad_norm": 4.5899529457092285, + "learning_rate": 3.708397719747043e-05, + "loss": 1.872, + "step": 28450 + }, + { + "epoch": 0.9983521492181474, + "grad_norm": 2.925464153289795, + "learning_rate": 3.70677453284681e-05, + "loss": 1.5937, + "step": 28475 + }, + { + "epoch": 0.9992286655914733, + "grad_norm": 4.069231033325195, + "learning_rate": 3.705151345946578e-05, + "loss": 1.8188, + "step": 28500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.3333567071032887, + "eval_f1_macro": 0.07143232786146277, + "eval_f1_micro": 0.3333567071032887, + "eval_f1_weighted": 0.1666871191763381, + "eval_loss": 1.7088940143585205, + "eval_precision_macro": 0.04762238672904124, + "eval_precision_micro": 0.3333567071032887, + "eval_precision_weighted": 0.1111266941707478, + "eval_recall_macro": 0.14285714285714285, + "eval_recall_micro": 0.3333567071032887, + "eval_recall_weighted": 0.3333567071032887, + "eval_runtime": 3157.4909, + "eval_samples_per_second": 4.517, + "eval_steps_per_second": 1.129, + "step": 28522 + }, + { + "epoch": 1.000105181964799, + "grad_norm": 5.335381984710693, + "learning_rate": 3.703528159046346e-05, + "loss": 1.7839, + "step": 28525 + }, + { + "epoch": 1.000981698338125, + "grad_norm": 7.253268718719482, + "learning_rate": 3.701904972146113e-05, + "loss": 1.6862, + "step": 28550 + }, + { + "epoch": 1.0018582147114508, + "grad_norm": 3.468214988708496, + "learning_rate": 3.700281785245881e-05, + "loss": 1.7592, + "step": 28575 + }, + { + "epoch": 1.0027347310847767, + "grad_norm": 10.471994400024414, + "learning_rate": 3.698658598345648e-05, + "loss": 1.5346, + "step": 28600 + }, + { + "epoch": 1.0036112474581025, + "grad_norm": 4.353080749511719, + "learning_rate": 3.6970354114454157e-05, + "loss": 1.7461, + "step": 28625 + }, + { + "epoch": 1.0044877638314285, + "grad_norm": 
5.374229431152344, + "learning_rate": 3.695412224545183e-05, + "loss": 1.6585, + "step": 28650 + }, + { + "epoch": 1.0053642802047542, + "grad_norm": 5.316918849945068, + "learning_rate": 3.6937890376449506e-05, + "loss": 1.7114, + "step": 28675 + }, + { + "epoch": 1.0062407965780802, + "grad_norm": 5.652185916900635, + "learning_rate": 3.692165850744718e-05, + "loss": 1.5404, + "step": 28700 + }, + { + "epoch": 1.007117312951406, + "grad_norm": 10.088621139526367, + "learning_rate": 3.6905426638444855e-05, + "loss": 1.759, + "step": 28725 + }, + { + "epoch": 1.0079938293247317, + "grad_norm": 4.098371505737305, + "learning_rate": 3.6889194769442537e-05, + "loss": 1.4545, + "step": 28750 + }, + { + "epoch": 1.0088703456980577, + "grad_norm": 5.221556186676025, + "learning_rate": 3.687296290044021e-05, + "loss": 1.6535, + "step": 28775 + }, + { + "epoch": 1.0097468620713834, + "grad_norm": 11.520200729370117, + "learning_rate": 3.6856731031437886e-05, + "loss": 1.7442, + "step": 28800 + }, + { + "epoch": 1.0106233784447094, + "grad_norm": 7.037756443023682, + "learning_rate": 3.684049916243556e-05, + "loss": 1.7255, + "step": 28825 + }, + { + "epoch": 1.0114998948180352, + "grad_norm": 5.533506393432617, + "learning_rate": 3.6824267293433235e-05, + "loss": 1.8296, + "step": 28850 + }, + { + "epoch": 1.0123764111913611, + "grad_norm": 5.2918853759765625, + "learning_rate": 3.6808035424430917e-05, + "loss": 1.7898, + "step": 28875 + }, + { + "epoch": 1.0132529275646869, + "grad_norm": 3.164388418197632, + "learning_rate": 3.679180355542859e-05, + "loss": 1.8078, + "step": 28900 + }, + { + "epoch": 1.0141294439380129, + "grad_norm": 9.682405471801758, + "learning_rate": 3.6775571686426266e-05, + "loss": 1.6961, + "step": 28925 + }, + { + "epoch": 1.0150059603113386, + "grad_norm": 6.413182258605957, + "learning_rate": 3.675933981742394e-05, + "loss": 1.6179, + "step": 28950 + }, + { + "epoch": 1.0158824766846646, + "grad_norm": 3.6101913452148438, + "learning_rate": 3.6743107948421615e-05, + "loss": 1.421, + "step": 28975 + }, + { + "epoch": 1.0167589930579903, + "grad_norm": 4.515725612640381, + "learning_rate": 3.672687607941929e-05, + "loss": 1.8651, + "step": 29000 + }, + { + "epoch": 1.017635509431316, + "grad_norm": 5.395262718200684, + "learning_rate": 3.6710644210416964e-05, + "loss": 1.6026, + "step": 29025 + }, + { + "epoch": 1.018512025804642, + "grad_norm": 5.097382545471191, + "learning_rate": 3.669441234141464e-05, + "loss": 1.6963, + "step": 29050 + }, + { + "epoch": 1.0193885421779678, + "grad_norm": 6.951759338378906, + "learning_rate": 3.6678180472412314e-05, + "loss": 1.7316, + "step": 29075 + }, + { + "epoch": 1.0202650585512938, + "grad_norm": 5.087795257568359, + "learning_rate": 3.666194860340999e-05, + "loss": 1.6375, + "step": 29100 + }, + { + "epoch": 1.0211415749246195, + "grad_norm": 6.352906227111816, + "learning_rate": 3.664571673440767e-05, + "loss": 1.6392, + "step": 29125 + }, + { + "epoch": 1.0220180912979455, + "grad_norm": 5.554426193237305, + "learning_rate": 3.6629484865405344e-05, + "loss": 1.7243, + "step": 29150 + }, + { + "epoch": 1.0228946076712713, + "grad_norm": 5.37007999420166, + "learning_rate": 3.661325299640302e-05, + "loss": 1.7263, + "step": 29175 + }, + { + "epoch": 1.0237711240445972, + "grad_norm": 5.998013973236084, + "learning_rate": 3.6597021127400694e-05, + "loss": 1.6503, + "step": 29200 + }, + { + "epoch": 1.024647640417923, + "grad_norm": 5.650667190551758, + "learning_rate": 3.658078925839837e-05, + "loss": 1.6907, + "step": 29225 
+ }, + { + "epoch": 1.0255241567912488, + "grad_norm": 6.349676132202148, + "learning_rate": 3.656455738939605e-05, + "loss": 1.6273, + "step": 29250 + }, + { + "epoch": 1.0264006731645747, + "grad_norm": 12.927947998046875, + "learning_rate": 3.6548325520393724e-05, + "loss": 1.9068, + "step": 29275 + }, + { + "epoch": 1.0272771895379005, + "grad_norm": 7.229135036468506, + "learning_rate": 3.65320936513914e-05, + "loss": 1.8219, + "step": 29300 + }, + { + "epoch": 1.0281537059112265, + "grad_norm": 12.966187477111816, + "learning_rate": 3.6515861782389074e-05, + "loss": 1.7106, + "step": 29325 + }, + { + "epoch": 1.0290302222845522, + "grad_norm": 5.918170928955078, + "learning_rate": 3.649962991338675e-05, + "loss": 1.5912, + "step": 29350 + }, + { + "epoch": 1.0299067386578782, + "grad_norm": 3.668097734451294, + "learning_rate": 3.648339804438442e-05, + "loss": 1.7935, + "step": 29375 + }, + { + "epoch": 1.030783255031204, + "grad_norm": 5.520074844360352, + "learning_rate": 3.64671661753821e-05, + "loss": 1.6728, + "step": 29400 + }, + { + "epoch": 1.03165977140453, + "grad_norm": 4.5544586181640625, + "learning_rate": 3.645093430637977e-05, + "loss": 1.7493, + "step": 29425 + }, + { + "epoch": 1.0325362877778557, + "grad_norm": 3.630657196044922, + "learning_rate": 3.643470243737745e-05, + "loss": 1.6201, + "step": 29450 + }, + { + "epoch": 1.0334128041511816, + "grad_norm": 7.095573425292969, + "learning_rate": 3.641847056837513e-05, + "loss": 1.5221, + "step": 29475 + }, + { + "epoch": 1.0342893205245074, + "grad_norm": 3.620511770248413, + "learning_rate": 3.64022386993728e-05, + "loss": 1.5578, + "step": 29500 + }, + { + "epoch": 1.0351658368978331, + "grad_norm": 5.106364727020264, + "learning_rate": 3.638600683037048e-05, + "loss": 1.5539, + "step": 29525 + }, + { + "epoch": 1.0360423532711591, + "grad_norm": 7.728002071380615, + "learning_rate": 3.636977496136815e-05, + "loss": 1.6942, + "step": 29550 + }, + { + "epoch": 1.0369188696444849, + "grad_norm": 6.721397399902344, + "learning_rate": 3.635354309236583e-05, + "loss": 1.6173, + "step": 29575 + }, + { + "epoch": 1.0377953860178109, + "grad_norm": 5.89910364151001, + "learning_rate": 3.633731122336351e-05, + "loss": 1.7451, + "step": 29600 + }, + { + "epoch": 1.0386719023911366, + "grad_norm": 15.372925758361816, + "learning_rate": 3.632107935436118e-05, + "loss": 1.69, + "step": 29625 + }, + { + "epoch": 1.0395484187644626, + "grad_norm": 5.477090358734131, + "learning_rate": 3.630484748535886e-05, + "loss": 1.6859, + "step": 29650 + }, + { + "epoch": 1.0404249351377883, + "grad_norm": 6.049582004547119, + "learning_rate": 3.628861561635653e-05, + "loss": 1.9245, + "step": 29675 + }, + { + "epoch": 1.0413014515111143, + "grad_norm": 4.347839832305908, + "learning_rate": 3.627238374735421e-05, + "loss": 1.6193, + "step": 29700 + }, + { + "epoch": 1.04217796788444, + "grad_norm": 4.799275875091553, + "learning_rate": 3.625615187835188e-05, + "loss": 1.6824, + "step": 29725 + }, + { + "epoch": 1.043054484257766, + "grad_norm": 3.7621402740478516, + "learning_rate": 3.6239920009349556e-05, + "loss": 1.814, + "step": 29750 + }, + { + "epoch": 1.0439310006310918, + "grad_norm": 5.719250679016113, + "learning_rate": 3.622368814034723e-05, + "loss": 1.6933, + "step": 29775 + }, + { + "epoch": 1.0448075170044175, + "grad_norm": 5.274804592132568, + "learning_rate": 3.6207456271344906e-05, + "loss": 1.6477, + "step": 29800 + }, + { + "epoch": 1.0456840333777435, + "grad_norm": 10.45026969909668, + "learning_rate": 
3.619122440234258e-05, + "loss": 1.744, + "step": 29825 + }, + { + "epoch": 1.0465605497510693, + "grad_norm": 3.564901351928711, + "learning_rate": 3.617499253334026e-05, + "loss": 1.6961, + "step": 29850 + }, + { + "epoch": 1.0474370661243952, + "grad_norm": 5.247096061706543, + "learning_rate": 3.6158760664337936e-05, + "loss": 1.6063, + "step": 29875 + }, + { + "epoch": 1.048313582497721, + "grad_norm": 3.209669589996338, + "learning_rate": 3.614252879533561e-05, + "loss": 1.8551, + "step": 29900 + }, + { + "epoch": 1.049190098871047, + "grad_norm": 10.213878631591797, + "learning_rate": 3.6126296926333286e-05, + "loss": 1.8288, + "step": 29925 + }, + { + "epoch": 1.0500666152443727, + "grad_norm": 4.804134845733643, + "learning_rate": 3.611006505733096e-05, + "loss": 1.7273, + "step": 29950 + }, + { + "epoch": 1.0509431316176987, + "grad_norm": 5.757583141326904, + "learning_rate": 3.609383318832864e-05, + "loss": 1.6281, + "step": 29975 + }, + { + "epoch": 1.0518196479910245, + "grad_norm": 5.010678768157959, + "learning_rate": 3.6077601319326316e-05, + "loss": 1.5694, + "step": 30000 + }, + { + "epoch": 1.0526961643643504, + "grad_norm": 4.734757900238037, + "learning_rate": 3.606136945032399e-05, + "loss": 1.5895, + "step": 30025 + }, + { + "epoch": 1.0535726807376762, + "grad_norm": 3.7040040493011475, + "learning_rate": 3.6045137581321666e-05, + "loss": 1.762, + "step": 30050 + }, + { + "epoch": 1.054449197111002, + "grad_norm": 4.9582390785217285, + "learning_rate": 3.602890571231934e-05, + "loss": 1.7673, + "step": 30075 + }, + { + "epoch": 1.055325713484328, + "grad_norm": 5.232799530029297, + "learning_rate": 3.601267384331702e-05, + "loss": 1.6, + "step": 30100 + }, + { + "epoch": 1.0562022298576537, + "grad_norm": 7.785403728485107, + "learning_rate": 3.5996441974314696e-05, + "loss": 1.6609, + "step": 30125 + }, + { + "epoch": 1.0570787462309796, + "grad_norm": 9.67027473449707, + "learning_rate": 3.5980210105312364e-05, + "loss": 1.6752, + "step": 30150 + }, + { + "epoch": 1.0579552626043054, + "grad_norm": 5.535626411437988, + "learning_rate": 3.596397823631004e-05, + "loss": 1.6826, + "step": 30175 + }, + { + "epoch": 1.0588317789776314, + "grad_norm": 4.494045734405518, + "learning_rate": 3.5947746367307713e-05, + "loss": 1.6844, + "step": 30200 + }, + { + "epoch": 1.0597082953509571, + "grad_norm": 3.3252177238464355, + "learning_rate": 3.5931514498305395e-05, + "loss": 1.647, + "step": 30225 + }, + { + "epoch": 1.060584811724283, + "grad_norm": 4.626235008239746, + "learning_rate": 3.591528262930307e-05, + "loss": 1.7638, + "step": 30250 + }, + { + "epoch": 1.0614613280976088, + "grad_norm": 3.4336390495300293, + "learning_rate": 3.5899050760300744e-05, + "loss": 1.7329, + "step": 30275 + }, + { + "epoch": 1.0623378444709348, + "grad_norm": 9.149893760681152, + "learning_rate": 3.588281889129842e-05, + "loss": 1.6431, + "step": 30300 + }, + { + "epoch": 1.0632143608442606, + "grad_norm": 6.410059928894043, + "learning_rate": 3.5866587022296093e-05, + "loss": 1.7431, + "step": 30325 + }, + { + "epoch": 1.0640908772175863, + "grad_norm": 5.503985404968262, + "learning_rate": 3.5850355153293775e-05, + "loss": 1.6474, + "step": 30350 + }, + { + "epoch": 1.0649673935909123, + "grad_norm": 5.691886901855469, + "learning_rate": 3.583412328429145e-05, + "loss": 1.6912, + "step": 30375 + }, + { + "epoch": 1.065843909964238, + "grad_norm": 5.767693996429443, + "learning_rate": 3.5817891415289124e-05, + "loss": 1.8614, + "step": 30400 + }, + { + "epoch": 1.066720426337564, + 
"grad_norm": 4.068127632141113, + "learning_rate": 3.58016595462868e-05, + "loss": 1.8714, + "step": 30425 + }, + { + "epoch": 1.0675969427108898, + "grad_norm": 7.2136664390563965, + "learning_rate": 3.578542767728448e-05, + "loss": 1.7466, + "step": 30450 + }, + { + "epoch": 1.0684734590842158, + "grad_norm": 4.33587646484375, + "learning_rate": 3.5769195808282155e-05, + "loss": 1.6371, + "step": 30475 + }, + { + "epoch": 1.0693499754575415, + "grad_norm": 3.6857829093933105, + "learning_rate": 3.575296393927983e-05, + "loss": 1.6379, + "step": 30500 + }, + { + "epoch": 1.0702264918308675, + "grad_norm": 8.978949546813965, + "learning_rate": 3.5736732070277504e-05, + "loss": 1.693, + "step": 30525 + }, + { + "epoch": 1.0711030082041932, + "grad_norm": 8.010087966918945, + "learning_rate": 3.572050020127518e-05, + "loss": 1.5837, + "step": 30550 + }, + { + "epoch": 1.071979524577519, + "grad_norm": 5.713315486907959, + "learning_rate": 3.5704268332272853e-05, + "loss": 1.6981, + "step": 30575 + }, + { + "epoch": 1.072856040950845, + "grad_norm": 3.71869158744812, + "learning_rate": 3.568803646327053e-05, + "loss": 1.6543, + "step": 30600 + }, + { + "epoch": 1.0737325573241707, + "grad_norm": 4.090463638305664, + "learning_rate": 3.56718045942682e-05, + "loss": 1.5949, + "step": 30625 + }, + { + "epoch": 1.0746090736974967, + "grad_norm": 5.542525291442871, + "learning_rate": 3.565557272526588e-05, + "loss": 1.7036, + "step": 30650 + }, + { + "epoch": 1.0754855900708225, + "grad_norm": 6.0791144371032715, + "learning_rate": 3.563934085626355e-05, + "loss": 1.8513, + "step": 30675 + }, + { + "epoch": 1.0763621064441484, + "grad_norm": 5.432562351226807, + "learning_rate": 3.5623108987261233e-05, + "loss": 1.8513, + "step": 30700 + }, + { + "epoch": 1.0772386228174742, + "grad_norm": 11.250377655029297, + "learning_rate": 3.560687711825891e-05, + "loss": 1.6625, + "step": 30725 + }, + { + "epoch": 1.0781151391908002, + "grad_norm": 4.239922523498535, + "learning_rate": 3.559064524925658e-05, + "loss": 1.6776, + "step": 30750 + }, + { + "epoch": 1.078991655564126, + "grad_norm": 8.958422660827637, + "learning_rate": 3.557441338025426e-05, + "loss": 1.8314, + "step": 30775 + }, + { + "epoch": 1.0798681719374519, + "grad_norm": 3.4840445518493652, + "learning_rate": 3.555818151125193e-05, + "loss": 1.5383, + "step": 30800 + }, + { + "epoch": 1.0807446883107776, + "grad_norm": 12.310514450073242, + "learning_rate": 3.5541949642249613e-05, + "loss": 1.6062, + "step": 30825 + }, + { + "epoch": 1.0816212046841036, + "grad_norm": 7.644445896148682, + "learning_rate": 3.552571777324729e-05, + "loss": 1.7062, + "step": 30850 + }, + { + "epoch": 1.0824977210574294, + "grad_norm": 5.11651086807251, + "learning_rate": 3.550948590424496e-05, + "loss": 1.549, + "step": 30875 + }, + { + "epoch": 1.0833742374307551, + "grad_norm": 3.996373176574707, + "learning_rate": 3.549325403524264e-05, + "loss": 1.9282, + "step": 30900 + }, + { + "epoch": 1.084250753804081, + "grad_norm": 4.042839527130127, + "learning_rate": 3.547702216624031e-05, + "loss": 1.6247, + "step": 30925 + }, + { + "epoch": 1.0851272701774068, + "grad_norm": 8.541868209838867, + "learning_rate": 3.546079029723799e-05, + "loss": 1.7063, + "step": 30950 + }, + { + "epoch": 1.0860037865507328, + "grad_norm": 4.966433525085449, + "learning_rate": 3.544455842823566e-05, + "loss": 1.7093, + "step": 30975 + }, + { + "epoch": 1.0868803029240586, + "grad_norm": 5.076407432556152, + "learning_rate": 3.5428326559233336e-05, + "loss": 1.7818, + "step": 
31000 + }, + { + "epoch": 1.0877568192973845, + "grad_norm": 4.799564838409424, + "learning_rate": 3.541209469023101e-05, + "loss": 1.5923, + "step": 31025 + }, + { + "epoch": 1.0886333356707103, + "grad_norm": 6.721904754638672, + "learning_rate": 3.5395862821228685e-05, + "loss": 1.8551, + "step": 31050 + }, + { + "epoch": 1.0895098520440363, + "grad_norm": 3.9764297008514404, + "learning_rate": 3.537963095222637e-05, + "loss": 1.5838, + "step": 31075 + }, + { + "epoch": 1.090386368417362, + "grad_norm": 5.141238689422607, + "learning_rate": 3.536339908322404e-05, + "loss": 1.6595, + "step": 31100 + }, + { + "epoch": 1.0912628847906878, + "grad_norm": 6.730457782745361, + "learning_rate": 3.5347167214221716e-05, + "loss": 1.7734, + "step": 31125 + }, + { + "epoch": 1.0921394011640138, + "grad_norm": 5.204188346862793, + "learning_rate": 3.533093534521939e-05, + "loss": 1.6945, + "step": 31150 + }, + { + "epoch": 1.0930159175373395, + "grad_norm": 6.801941871643066, + "learning_rate": 3.5314703476217065e-05, + "loss": 1.7232, + "step": 31175 + }, + { + "epoch": 1.0938924339106655, + "grad_norm": 3.5017759799957275, + "learning_rate": 3.529847160721475e-05, + "loss": 1.8137, + "step": 31200 + }, + { + "epoch": 1.0947689502839912, + "grad_norm": 6.291198253631592, + "learning_rate": 3.528223973821242e-05, + "loss": 1.5843, + "step": 31225 + }, + { + "epoch": 1.0956454666573172, + "grad_norm": 4.303759574890137, + "learning_rate": 3.5266007869210096e-05, + "loss": 1.7177, + "step": 31250 + }, + { + "epoch": 1.096521983030643, + "grad_norm": 3.3595798015594482, + "learning_rate": 3.524977600020777e-05, + "loss": 1.5597, + "step": 31275 + }, + { + "epoch": 1.097398499403969, + "grad_norm": 3.9554760456085205, + "learning_rate": 3.5233544131205445e-05, + "loss": 1.7036, + "step": 31300 + }, + { + "epoch": 1.0982750157772947, + "grad_norm": 5.867265224456787, + "learning_rate": 3.521731226220312e-05, + "loss": 1.723, + "step": 31325 + }, + { + "epoch": 1.0991515321506207, + "grad_norm": 3.930715799331665, + "learning_rate": 3.5201080393200795e-05, + "loss": 1.6957, + "step": 31350 + }, + { + "epoch": 1.1000280485239464, + "grad_norm": 4.514988422393799, + "learning_rate": 3.518484852419847e-05, + "loss": 1.5955, + "step": 31375 + }, + { + "epoch": 1.1009045648972724, + "grad_norm": 5.133600234985352, + "learning_rate": 3.5168616655196144e-05, + "loss": 1.7112, + "step": 31400 + }, + { + "epoch": 1.1017810812705982, + "grad_norm": 8.265213966369629, + "learning_rate": 3.515238478619382e-05, + "loss": 1.8393, + "step": 31425 + }, + { + "epoch": 1.102657597643924, + "grad_norm": 6.482197284698486, + "learning_rate": 3.51361529171915e-05, + "loss": 1.6632, + "step": 31450 + }, + { + "epoch": 1.1035341140172499, + "grad_norm": 3.543539047241211, + "learning_rate": 3.5119921048189175e-05, + "loss": 1.8107, + "step": 31475 + }, + { + "epoch": 1.1044106303905756, + "grad_norm": 3.3770008087158203, + "learning_rate": 3.510368917918685e-05, + "loss": 1.6576, + "step": 31500 + }, + { + "epoch": 1.1052871467639016, + "grad_norm": 4.631762504577637, + "learning_rate": 3.5087457310184524e-05, + "loss": 1.7479, + "step": 31525 + }, + { + "epoch": 1.1061636631372274, + "grad_norm": 3.8706533908843994, + "learning_rate": 3.5071225441182205e-05, + "loss": 1.527, + "step": 31550 + }, + { + "epoch": 1.1070401795105533, + "grad_norm": 11.3296537399292, + "learning_rate": 3.505499357217988e-05, + "loss": 1.6594, + "step": 31575 + }, + { + "epoch": 1.107916695883879, + "grad_norm": 7.44377326965332, + "learning_rate": 
3.5038761703177555e-05, + "loss": 1.5948, + "step": 31600 + }, + { + "epoch": 1.108793212257205, + "grad_norm": 13.166386604309082, + "learning_rate": 3.502252983417523e-05, + "loss": 1.641, + "step": 31625 + }, + { + "epoch": 1.1096697286305308, + "grad_norm": 10.759387016296387, + "learning_rate": 3.5006297965172904e-05, + "loss": 1.7051, + "step": 31650 + }, + { + "epoch": 1.1105462450038566, + "grad_norm": 3.2466704845428467, + "learning_rate": 3.499006609617058e-05, + "loss": 1.8131, + "step": 31675 + }, + { + "epoch": 1.1114227613771825, + "grad_norm": 3.6874353885650635, + "learning_rate": 3.497383422716825e-05, + "loss": 1.6049, + "step": 31700 + }, + { + "epoch": 1.1122992777505083, + "grad_norm": 5.307967662811279, + "learning_rate": 3.495760235816593e-05, + "loss": 1.697, + "step": 31725 + }, + { + "epoch": 1.1131757941238343, + "grad_norm": 3.618786096572876, + "learning_rate": 3.49413704891636e-05, + "loss": 1.748, + "step": 31750 + }, + { + "epoch": 1.11405231049716, + "grad_norm": 8.764957427978516, + "learning_rate": 3.492513862016128e-05, + "loss": 1.6971, + "step": 31775 + }, + { + "epoch": 1.114928826870486, + "grad_norm": 4.955912113189697, + "learning_rate": 3.490890675115896e-05, + "loss": 1.8434, + "step": 31800 + }, + { + "epoch": 1.1158053432438118, + "grad_norm": 4.0199151039123535, + "learning_rate": 3.489267488215663e-05, + "loss": 1.6708, + "step": 31825 + }, + { + "epoch": 1.1166818596171377, + "grad_norm": 3.6608777046203613, + "learning_rate": 3.487644301315431e-05, + "loss": 1.6611, + "step": 31850 + }, + { + "epoch": 1.1175583759904635, + "grad_norm": 8.878351211547852, + "learning_rate": 3.486021114415198e-05, + "loss": 1.6864, + "step": 31875 + }, + { + "epoch": 1.1184348923637895, + "grad_norm": 3.889843225479126, + "learning_rate": 3.484397927514966e-05, + "loss": 1.5994, + "step": 31900 + }, + { + "epoch": 1.1193114087371152, + "grad_norm": 6.462010860443115, + "learning_rate": 3.482774740614734e-05, + "loss": 1.6124, + "step": 31925 + }, + { + "epoch": 1.120187925110441, + "grad_norm": 5.456188201904297, + "learning_rate": 3.481151553714501e-05, + "loss": 1.6048, + "step": 31950 + }, + { + "epoch": 1.121064441483767, + "grad_norm": 5.864627838134766, + "learning_rate": 3.479528366814269e-05, + "loss": 1.8091, + "step": 31975 + }, + { + "epoch": 1.1219409578570927, + "grad_norm": 4.878820419311523, + "learning_rate": 3.477905179914036e-05, + "loss": 1.6434, + "step": 32000 + }, + { + "epoch": 1.1228174742304187, + "grad_norm": 3.0742177963256836, + "learning_rate": 3.476281993013804e-05, + "loss": 1.5224, + "step": 32025 + }, + { + "epoch": 1.1236939906037444, + "grad_norm": 6.770904541015625, + "learning_rate": 3.474658806113572e-05, + "loss": 1.8105, + "step": 32050 + }, + { + "epoch": 1.1245705069770704, + "grad_norm": 3.3337113857269287, + "learning_rate": 3.4730356192133386e-05, + "loss": 1.7639, + "step": 32075 + }, + { + "epoch": 1.1254470233503961, + "grad_norm": 9.339410781860352, + "learning_rate": 3.471412432313106e-05, + "loss": 1.6251, + "step": 32100 + }, + { + "epoch": 1.1263235397237221, + "grad_norm": 3.3532233238220215, + "learning_rate": 3.4697892454128736e-05, + "loss": 1.5649, + "step": 32125 + }, + { + "epoch": 1.1272000560970479, + "grad_norm": 7.2148356437683105, + "learning_rate": 3.468166058512641e-05, + "loss": 1.777, + "step": 32150 + }, + { + "epoch": 1.1280765724703739, + "grad_norm": 3.936981201171875, + "learning_rate": 3.466542871612409e-05, + "loss": 1.5618, + "step": 32175 + }, + { + "epoch": 1.1289530888436996, + 
"grad_norm": 6.870660781860352, + "learning_rate": 3.4649196847121766e-05, + "loss": 1.6041, + "step": 32200 + }, + { + "epoch": 1.1298296052170254, + "grad_norm": 9.602493286132812, + "learning_rate": 3.463296497811944e-05, + "loss": 1.5622, + "step": 32225 + }, + { + "epoch": 1.1307061215903513, + "grad_norm": 4.943943500518799, + "learning_rate": 3.4616733109117116e-05, + "loss": 2.0084, + "step": 32250 + }, + { + "epoch": 1.131582637963677, + "grad_norm": 5.241890907287598, + "learning_rate": 3.460050124011479e-05, + "loss": 1.6336, + "step": 32275 + }, + { + "epoch": 1.132459154337003, + "grad_norm": 9.62393856048584, + "learning_rate": 3.458426937111247e-05, + "loss": 1.8858, + "step": 32300 + }, + { + "epoch": 1.1333356707103288, + "grad_norm": 4.486839771270752, + "learning_rate": 3.4568037502110146e-05, + "loss": 1.7246, + "step": 32325 + }, + { + "epoch": 1.1342121870836548, + "grad_norm": 9.51328182220459, + "learning_rate": 3.455180563310782e-05, + "loss": 1.6808, + "step": 32350 + }, + { + "epoch": 1.1350887034569805, + "grad_norm": 4.624545574188232, + "learning_rate": 3.4535573764105496e-05, + "loss": 1.8211, + "step": 32375 + }, + { + "epoch": 1.1359652198303065, + "grad_norm": 3.161308765411377, + "learning_rate": 3.451934189510317e-05, + "loss": 1.698, + "step": 32400 + }, + { + "epoch": 1.1368417362036323, + "grad_norm": 3.363229274749756, + "learning_rate": 3.450311002610085e-05, + "loss": 1.7004, + "step": 32425 + }, + { + "epoch": 1.137718252576958, + "grad_norm": 5.495654582977295, + "learning_rate": 3.4486878157098526e-05, + "loss": 1.6572, + "step": 32450 + }, + { + "epoch": 1.138594768950284, + "grad_norm": 3.2298738956451416, + "learning_rate": 3.44706462880962e-05, + "loss": 1.6383, + "step": 32475 + }, + { + "epoch": 1.1394712853236098, + "grad_norm": 11.661173820495605, + "learning_rate": 3.445441441909387e-05, + "loss": 1.7741, + "step": 32500 + }, + { + "epoch": 1.1403478016969357, + "grad_norm": 4.880887031555176, + "learning_rate": 3.4438182550091544e-05, + "loss": 1.7686, + "step": 32525 + }, + { + "epoch": 1.1412243180702615, + "grad_norm": 3.6941330432891846, + "learning_rate": 3.4421950681089225e-05, + "loss": 1.5982, + "step": 32550 + }, + { + "epoch": 1.1421008344435875, + "grad_norm": 5.9598822593688965, + "learning_rate": 3.44057188120869e-05, + "loss": 1.6074, + "step": 32575 + }, + { + "epoch": 1.1429773508169132, + "grad_norm": 14.139555931091309, + "learning_rate": 3.4389486943084574e-05, + "loss": 1.6946, + "step": 32600 + }, + { + "epoch": 1.1438538671902392, + "grad_norm": 5.2708821296691895, + "learning_rate": 3.437325507408225e-05, + "loss": 1.6888, + "step": 32625 + }, + { + "epoch": 1.144730383563565, + "grad_norm": 9.40365982055664, + "learning_rate": 3.435702320507993e-05, + "loss": 1.5312, + "step": 32650 + }, + { + "epoch": 1.145606899936891, + "grad_norm": 3.3270912170410156, + "learning_rate": 3.4340791336077605e-05, + "loss": 1.7828, + "step": 32675 + }, + { + "epoch": 1.1464834163102167, + "grad_norm": 9.106575965881348, + "learning_rate": 3.432455946707528e-05, + "loss": 1.7392, + "step": 32700 + }, + { + "epoch": 1.1473599326835426, + "grad_norm": 5.586655139923096, + "learning_rate": 3.4308327598072954e-05, + "loss": 1.6413, + "step": 32725 + }, + { + "epoch": 1.1482364490568684, + "grad_norm": 4.709921360015869, + "learning_rate": 3.429209572907063e-05, + "loss": 1.8373, + "step": 32750 + }, + { + "epoch": 1.1491129654301941, + "grad_norm": 7.205475807189941, + "learning_rate": 3.427586386006831e-05, + "loss": 1.724, + 
"step": 32775 + }, + { + "epoch": 1.1499894818035201, + "grad_norm": 8.06248950958252, + "learning_rate": 3.4259631991065985e-05, + "loss": 1.6261, + "step": 32800 + }, + { + "epoch": 1.1508659981768459, + "grad_norm": 3.235469341278076, + "learning_rate": 3.424340012206366e-05, + "loss": 1.7957, + "step": 32825 + }, + { + "epoch": 1.1517425145501718, + "grad_norm": 3.467836856842041, + "learning_rate": 3.4227168253061334e-05, + "loss": 1.8346, + "step": 32850 + }, + { + "epoch": 1.1526190309234976, + "grad_norm": 5.422499656677246, + "learning_rate": 3.421093638405901e-05, + "loss": 1.686, + "step": 32875 + }, + { + "epoch": 1.1534955472968236, + "grad_norm": 5.081619739532471, + "learning_rate": 3.4194704515056684e-05, + "loss": 1.8309, + "step": 32900 + }, + { + "epoch": 1.1543720636701493, + "grad_norm": 4.628939628601074, + "learning_rate": 3.417847264605436e-05, + "loss": 1.65, + "step": 32925 + }, + { + "epoch": 1.1552485800434753, + "grad_norm": 4.1766581535339355, + "learning_rate": 3.416224077705203e-05, + "loss": 1.5476, + "step": 32950 + }, + { + "epoch": 1.156125096416801, + "grad_norm": 3.8786728382110596, + "learning_rate": 3.414600890804971e-05, + "loss": 1.7169, + "step": 32975 + }, + { + "epoch": 1.1570016127901268, + "grad_norm": 4.876051902770996, + "learning_rate": 3.412977703904738e-05, + "loss": 1.6218, + "step": 33000 + }, + { + "epoch": 1.1578781291634528, + "grad_norm": 3.353567123413086, + "learning_rate": 3.4113545170045064e-05, + "loss": 1.6765, + "step": 33025 + }, + { + "epoch": 1.1587546455367785, + "grad_norm": 7.594830513000488, + "learning_rate": 3.409731330104274e-05, + "loss": 1.7358, + "step": 33050 + }, + { + "epoch": 1.1596311619101045, + "grad_norm": 7.927544593811035, + "learning_rate": 3.408108143204041e-05, + "loss": 1.8087, + "step": 33075 + }, + { + "epoch": 1.1605076782834303, + "grad_norm": 6.34503698348999, + "learning_rate": 3.406484956303809e-05, + "loss": 1.5737, + "step": 33100 + }, + { + "epoch": 1.1613841946567562, + "grad_norm": 3.7515461444854736, + "learning_rate": 3.404861769403576e-05, + "loss": 1.7698, + "step": 33125 + }, + { + "epoch": 1.162260711030082, + "grad_norm": 5.008838176727295, + "learning_rate": 3.4032385825033444e-05, + "loss": 1.7721, + "step": 33150 + }, + { + "epoch": 1.163137227403408, + "grad_norm": 6.798007488250732, + "learning_rate": 3.401615395603112e-05, + "loss": 1.6439, + "step": 33175 + }, + { + "epoch": 1.1640137437767337, + "grad_norm": 4.783827304840088, + "learning_rate": 3.399992208702879e-05, + "loss": 1.677, + "step": 33200 + }, + { + "epoch": 1.1648902601500597, + "grad_norm": 4.062950611114502, + "learning_rate": 3.398369021802647e-05, + "loss": 1.7497, + "step": 33225 + }, + { + "epoch": 1.1657667765233855, + "grad_norm": 9.965224266052246, + "learning_rate": 3.396745834902414e-05, + "loss": 1.7649, + "step": 33250 + }, + { + "epoch": 1.1666432928967114, + "grad_norm": 4.952746391296387, + "learning_rate": 3.395122648002182e-05, + "loss": 1.6265, + "step": 33275 + }, + { + "epoch": 1.1675198092700372, + "grad_norm": 16.904611587524414, + "learning_rate": 3.393499461101949e-05, + "loss": 1.6585, + "step": 33300 + }, + { + "epoch": 1.168396325643363, + "grad_norm": 4.481830596923828, + "learning_rate": 3.3918762742017166e-05, + "loss": 1.6709, + "step": 33325 + }, + { + "epoch": 1.169272842016689, + "grad_norm": 4.9990620613098145, + "learning_rate": 3.390253087301484e-05, + "loss": 1.7228, + "step": 33350 + }, + { + "epoch": 1.1701493583900147, + "grad_norm": 4.822597503662109, + 
"learning_rate": 3.3886299004012515e-05, + "loss": 1.9271, + "step": 33375 + }, + { + "epoch": 1.1710258747633406, + "grad_norm": 3.6993818283081055, + "learning_rate": 3.38700671350102e-05, + "loss": 1.8334, + "step": 33400 + }, + { + "epoch": 1.1719023911366664, + "grad_norm": 4.255173206329346, + "learning_rate": 3.385383526600787e-05, + "loss": 1.7132, + "step": 33425 + }, + { + "epoch": 1.1727789075099924, + "grad_norm": 3.04487943649292, + "learning_rate": 3.3837603397005546e-05, + "loss": 1.6647, + "step": 33450 + }, + { + "epoch": 1.1736554238833181, + "grad_norm": 4.672458648681641, + "learning_rate": 3.382137152800322e-05, + "loss": 1.7113, + "step": 33475 + }, + { + "epoch": 1.174531940256644, + "grad_norm": 3.572430372238159, + "learning_rate": 3.3805139659000895e-05, + "loss": 1.5845, + "step": 33500 + }, + { + "epoch": 1.1754084566299698, + "grad_norm": 3.120678424835205, + "learning_rate": 3.378890778999858e-05, + "loss": 1.6439, + "step": 33525 + }, + { + "epoch": 1.1762849730032956, + "grad_norm": 13.256449699401855, + "learning_rate": 3.377267592099625e-05, + "loss": 1.71, + "step": 33550 + }, + { + "epoch": 1.1771614893766216, + "grad_norm": 4.686601161956787, + "learning_rate": 3.3756444051993926e-05, + "loss": 1.7474, + "step": 33575 + }, + { + "epoch": 1.1780380057499473, + "grad_norm": 3.3477165699005127, + "learning_rate": 3.37402121829916e-05, + "loss": 1.7016, + "step": 33600 + }, + { + "epoch": 1.1789145221232733, + "grad_norm": 3.1460652351379395, + "learning_rate": 3.3723980313989275e-05, + "loss": 1.6835, + "step": 33625 + }, + { + "epoch": 1.179791038496599, + "grad_norm": 5.0962419509887695, + "learning_rate": 3.370774844498695e-05, + "loss": 1.5733, + "step": 33650 + }, + { + "epoch": 1.180667554869925, + "grad_norm": 4.109542369842529, + "learning_rate": 3.3691516575984625e-05, + "loss": 1.6835, + "step": 33675 + }, + { + "epoch": 1.1815440712432508, + "grad_norm": 3.727200508117676, + "learning_rate": 3.36752847069823e-05, + "loss": 1.7619, + "step": 33700 + }, + { + "epoch": 1.1824205876165768, + "grad_norm": 5.048363208770752, + "learning_rate": 3.3659052837979974e-05, + "loss": 1.7022, + "step": 33725 + }, + { + "epoch": 1.1832971039899025, + "grad_norm": 3.313615083694458, + "learning_rate": 3.3642820968977655e-05, + "loss": 1.7035, + "step": 33750 + }, + { + "epoch": 1.1841736203632283, + "grad_norm": 10.365707397460938, + "learning_rate": 3.362658909997533e-05, + "loss": 1.7919, + "step": 33775 + }, + { + "epoch": 1.1850501367365542, + "grad_norm": 5.409087181091309, + "learning_rate": 3.3610357230973005e-05, + "loss": 1.7984, + "step": 33800 + }, + { + "epoch": 1.1859266531098802, + "grad_norm": 4.895572662353516, + "learning_rate": 3.359412536197068e-05, + "loss": 1.8113, + "step": 33825 + }, + { + "epoch": 1.186803169483206, + "grad_norm": 16.20047950744629, + "learning_rate": 3.3577893492968354e-05, + "loss": 1.6642, + "step": 33850 + }, + { + "epoch": 1.1876796858565317, + "grad_norm": 5.052983283996582, + "learning_rate": 3.3561661623966035e-05, + "loss": 1.5651, + "step": 33875 + }, + { + "epoch": 1.1885562022298577, + "grad_norm": 4.995593070983887, + "learning_rate": 3.354542975496371e-05, + "loss": 1.7602, + "step": 33900 + }, + { + "epoch": 1.1894327186031834, + "grad_norm": 6.724891662597656, + "learning_rate": 3.3529197885961385e-05, + "loss": 1.7842, + "step": 33925 + }, + { + "epoch": 1.1903092349765094, + "grad_norm": 4.982700347900391, + "learning_rate": 3.351296601695906e-05, + "loss": 1.8149, + "step": 33950 + }, + { + "epoch": 
1.1911857513498352, + "grad_norm": 6.445688247680664, + "learning_rate": 3.3496734147956734e-05, + "loss": 1.8338, + "step": 33975 + }, + { + "epoch": 1.1920622677231612, + "grad_norm": 3.735886812210083, + "learning_rate": 3.3480502278954415e-05, + "loss": 1.6479, + "step": 34000 + }, + { + "epoch": 1.192938784096487, + "grad_norm": 3.1526427268981934, + "learning_rate": 3.346427040995208e-05, + "loss": 1.5918, + "step": 34025 + }, + { + "epoch": 1.1938153004698129, + "grad_norm": 3.9770755767822266, + "learning_rate": 3.344803854094976e-05, + "loss": 1.7223, + "step": 34050 + }, + { + "epoch": 1.1946918168431386, + "grad_norm": 9.136998176574707, + "learning_rate": 3.343180667194743e-05, + "loss": 1.6722, + "step": 34075 + }, + { + "epoch": 1.1955683332164644, + "grad_norm": 4.92293119430542, + "learning_rate": 3.341557480294511e-05, + "loss": 1.605, + "step": 34100 + }, + { + "epoch": 1.1964448495897904, + "grad_norm": 4.763546943664551, + "learning_rate": 3.339934293394279e-05, + "loss": 1.7378, + "step": 34125 + }, + { + "epoch": 1.1973213659631161, + "grad_norm": 4.108649253845215, + "learning_rate": 3.338311106494046e-05, + "loss": 1.6126, + "step": 34150 + }, + { + "epoch": 1.198197882336442, + "grad_norm": 3.8493666648864746, + "learning_rate": 3.336687919593814e-05, + "loss": 1.7237, + "step": 34175 + }, + { + "epoch": 1.1990743987097678, + "grad_norm": 3.2094647884368896, + "learning_rate": 3.335064732693581e-05, + "loss": 1.6956, + "step": 34200 + }, + { + "epoch": 1.1999509150830938, + "grad_norm": 4.852198600769043, + "learning_rate": 3.333441545793349e-05, + "loss": 1.7864, + "step": 34225 + }, + { + "epoch": 1.2008274314564196, + "grad_norm": 3.222156286239624, + "learning_rate": 3.331818358893117e-05, + "loss": 1.7479, + "step": 34250 + }, + { + "epoch": 1.2017039478297455, + "grad_norm": 3.614089250564575, + "learning_rate": 3.330195171992884e-05, + "loss": 1.6359, + "step": 34275 + }, + { + "epoch": 1.2025804642030713, + "grad_norm": 7.783929824829102, + "learning_rate": 3.328571985092652e-05, + "loss": 1.7065, + "step": 34300 + }, + { + "epoch": 1.203456980576397, + "grad_norm": 8.550490379333496, + "learning_rate": 3.326948798192419e-05, + "loss": 1.6505, + "step": 34325 + }, + { + "epoch": 1.204333496949723, + "grad_norm": 3.916203022003174, + "learning_rate": 3.325325611292187e-05, + "loss": 1.6618, + "step": 34350 + }, + { + "epoch": 1.2052100133230488, + "grad_norm": 5.686614036560059, + "learning_rate": 3.323702424391955e-05, + "loss": 1.7421, + "step": 34375 + }, + { + "epoch": 1.2060865296963748, + "grad_norm": 6.222768306732178, + "learning_rate": 3.322079237491722e-05, + "loss": 1.8617, + "step": 34400 + }, + { + "epoch": 1.2069630460697005, + "grad_norm": 4.665896415710449, + "learning_rate": 3.320456050591489e-05, + "loss": 1.6593, + "step": 34425 + }, + { + "epoch": 1.2078395624430265, + "grad_norm": 5.070887565612793, + "learning_rate": 3.3188328636912566e-05, + "loss": 1.5352, + "step": 34450 + }, + { + "epoch": 1.2087160788163522, + "grad_norm": 5.212978363037109, + "learning_rate": 3.317209676791024e-05, + "loss": 1.9507, + "step": 34475 + }, + { + "epoch": 1.2095925951896782, + "grad_norm": 5.295562744140625, + "learning_rate": 3.315586489890792e-05, + "loss": 1.5529, + "step": 34500 + }, + { + "epoch": 1.210469111563004, + "grad_norm": 5.466311454772949, + "learning_rate": 3.3139633029905597e-05, + "loss": 1.9003, + "step": 34525 + }, + { + "epoch": 1.21134562793633, + "grad_norm": 7.4343109130859375, + "learning_rate": 3.312340116090327e-05, + 
"loss": 1.6257, + "step": 34550 + }, + { + "epoch": 1.2122221443096557, + "grad_norm": 5.824229717254639, + "learning_rate": 3.3107169291900946e-05, + "loss": 1.7455, + "step": 34575 + }, + { + "epoch": 1.2130986606829817, + "grad_norm": 3.014615774154663, + "learning_rate": 3.309093742289862e-05, + "loss": 1.7217, + "step": 34600 + }, + { + "epoch": 1.2139751770563074, + "grad_norm": 4.862308025360107, + "learning_rate": 3.30747055538963e-05, + "loss": 1.7304, + "step": 34625 + }, + { + "epoch": 1.2148516934296332, + "grad_norm": 5.247799873352051, + "learning_rate": 3.3058473684893977e-05, + "loss": 1.8311, + "step": 34650 + }, + { + "epoch": 1.2157282098029591, + "grad_norm": 5.521081447601318, + "learning_rate": 3.304224181589165e-05, + "loss": 1.5175, + "step": 34675 + }, + { + "epoch": 1.216604726176285, + "grad_norm": 6.532599449157715, + "learning_rate": 3.3026009946889326e-05, + "loss": 1.827, + "step": 34700 + }, + { + "epoch": 1.2174812425496109, + "grad_norm": 6.034439563751221, + "learning_rate": 3.3009778077887e-05, + "loss": 1.6376, + "step": 34725 + }, + { + "epoch": 1.2183577589229366, + "grad_norm": 3.610398530960083, + "learning_rate": 3.299354620888468e-05, + "loss": 1.7943, + "step": 34750 + }, + { + "epoch": 1.2192342752962626, + "grad_norm": 3.7602903842926025, + "learning_rate": 3.2977314339882357e-05, + "loss": 1.6396, + "step": 34775 + }, + { + "epoch": 1.2201107916695884, + "grad_norm": 3.107238292694092, + "learning_rate": 3.296108247088003e-05, + "loss": 1.8522, + "step": 34800 + }, + { + "epoch": 1.2209873080429143, + "grad_norm": 9.370879173278809, + "learning_rate": 3.2944850601877706e-05, + "loss": 1.6341, + "step": 34825 + }, + { + "epoch": 1.22186382441624, + "grad_norm": 9.671378135681152, + "learning_rate": 3.2928618732875374e-05, + "loss": 1.6219, + "step": 34850 + }, + { + "epoch": 1.2227403407895658, + "grad_norm": 11.739994049072266, + "learning_rate": 3.2912386863873055e-05, + "loss": 1.8407, + "step": 34875 + }, + { + "epoch": 1.2236168571628918, + "grad_norm": 3.7292745113372803, + "learning_rate": 3.289615499487073e-05, + "loss": 1.7742, + "step": 34900 + }, + { + "epoch": 1.2244933735362176, + "grad_norm": 3.558182716369629, + "learning_rate": 3.2879923125868404e-05, + "loss": 1.804, + "step": 34925 + }, + { + "epoch": 1.2253698899095435, + "grad_norm": 5.023896217346191, + "learning_rate": 3.286369125686608e-05, + "loss": 1.6568, + "step": 34950 + }, + { + "epoch": 1.2262464062828693, + "grad_norm": 6.200103759765625, + "learning_rate": 3.284745938786376e-05, + "loss": 1.6657, + "step": 34975 + }, + { + "epoch": 1.2271229226561953, + "grad_norm": 10.985984802246094, + "learning_rate": 3.2831227518861435e-05, + "loss": 1.7314, + "step": 35000 + }, + { + "epoch": 1.227999439029521, + "grad_norm": 4.904130935668945, + "learning_rate": 3.281499564985911e-05, + "loss": 1.7107, + "step": 35025 + }, + { + "epoch": 1.228875955402847, + "grad_norm": 3.054142713546753, + "learning_rate": 3.2798763780856784e-05, + "loss": 1.6412, + "step": 35050 + }, + { + "epoch": 1.2297524717761728, + "grad_norm": 3.8094217777252197, + "learning_rate": 3.278253191185446e-05, + "loss": 1.6666, + "step": 35075 + }, + { + "epoch": 1.2306289881494987, + "grad_norm": 4.930931568145752, + "learning_rate": 3.276630004285214e-05, + "loss": 1.6518, + "step": 35100 + }, + { + "epoch": 1.2315055045228245, + "grad_norm": 5.187088489532471, + "learning_rate": 3.2750068173849815e-05, + "loss": 1.7733, + "step": 35125 + }, + { + "epoch": 1.2323820208961505, + "grad_norm": 
3.6691765785217285, + "learning_rate": 3.273383630484749e-05, + "loss": 1.7567, + "step": 35150 + }, + { + "epoch": 1.2332585372694762, + "grad_norm": 3.145847797393799, + "learning_rate": 3.2717604435845164e-05, + "loss": 1.4521, + "step": 35175 + }, + { + "epoch": 1.234135053642802, + "grad_norm": 3.1673848628997803, + "learning_rate": 3.270137256684284e-05, + "loss": 1.6333, + "step": 35200 + }, + { + "epoch": 1.235011570016128, + "grad_norm": 3.6136956214904785, + "learning_rate": 3.2685140697840514e-05, + "loss": 1.6081, + "step": 35225 + }, + { + "epoch": 1.2358880863894537, + "grad_norm": 3.493204116821289, + "learning_rate": 3.266890882883819e-05, + "loss": 1.8698, + "step": 35250 + }, + { + "epoch": 1.2367646027627797, + "grad_norm": 10.24739933013916, + "learning_rate": 3.265267695983586e-05, + "loss": 1.6844, + "step": 35275 + }, + { + "epoch": 1.2376411191361054, + "grad_norm": 5.707400321960449, + "learning_rate": 3.263644509083354e-05, + "loss": 1.6419, + "step": 35300 + }, + { + "epoch": 1.2385176355094314, + "grad_norm": 5.437505722045898, + "learning_rate": 3.262021322183121e-05, + "loss": 1.7568, + "step": 35325 + }, + { + "epoch": 1.2393941518827571, + "grad_norm": 4.80645227432251, + "learning_rate": 3.2603981352828894e-05, + "loss": 1.6643, + "step": 35350 + }, + { + "epoch": 1.2402706682560831, + "grad_norm": 5.4157209396362305, + "learning_rate": 3.258774948382657e-05, + "loss": 1.7528, + "step": 35375 + }, + { + "epoch": 1.2411471846294089, + "grad_norm": 7.365719795227051, + "learning_rate": 3.257151761482424e-05, + "loss": 1.9042, + "step": 35400 + }, + { + "epoch": 1.2420237010027346, + "grad_norm": 5.11865234375, + "learning_rate": 3.255528574582192e-05, + "loss": 1.7593, + "step": 35425 + }, + { + "epoch": 1.2429002173760606, + "grad_norm": 4.72868537902832, + "learning_rate": 3.253905387681959e-05, + "loss": 1.7653, + "step": 35450 + }, + { + "epoch": 1.2437767337493864, + "grad_norm": 8.486702919006348, + "learning_rate": 3.2522822007817274e-05, + "loss": 1.7297, + "step": 35475 + }, + { + "epoch": 1.2446532501227123, + "grad_norm": 5.47340202331543, + "learning_rate": 3.250659013881495e-05, + "loss": 1.5979, + "step": 35500 + }, + { + "epoch": 1.245529766496038, + "grad_norm": 5.425364971160889, + "learning_rate": 3.249035826981262e-05, + "loss": 1.644, + "step": 35525 + }, + { + "epoch": 1.246406282869364, + "grad_norm": 3.3704581260681152, + "learning_rate": 3.24741264008103e-05, + "loss": 1.8707, + "step": 35550 + }, + { + "epoch": 1.2472827992426898, + "grad_norm": 9.056315422058105, + "learning_rate": 3.245789453180797e-05, + "loss": 1.6564, + "step": 35575 + }, + { + "epoch": 1.2481593156160158, + "grad_norm": 3.6392366886138916, + "learning_rate": 3.244166266280565e-05, + "loss": 1.8565, + "step": 35600 + }, + { + "epoch": 1.2490358319893415, + "grad_norm": 4.045741558074951, + "learning_rate": 3.242543079380332e-05, + "loss": 1.793, + "step": 35625 + }, + { + "epoch": 1.2499123483626673, + "grad_norm": 3.674990177154541, + "learning_rate": 3.2409198924800996e-05, + "loss": 1.6945, + "step": 35650 + }, + { + "epoch": 1.2507888647359933, + "grad_norm": 3.0996217727661133, + "learning_rate": 3.239296705579867e-05, + "loss": 1.5686, + "step": 35675 + }, + { + "epoch": 1.2516653811093192, + "grad_norm": 7.521422863006592, + "learning_rate": 3.2376735186796346e-05, + "loss": 1.6186, + "step": 35700 + }, + { + "epoch": 1.252541897482645, + "grad_norm": 3.9716885089874268, + "learning_rate": 3.236050331779403e-05, + "loss": 1.9526, + "step": 35725 + }, + { + 
"epoch": 1.2534184138559707, + "grad_norm": 4.257104396820068, + "learning_rate": 3.23442714487917e-05, + "loss": 1.7416, + "step": 35750 + }, + { + "epoch": 1.2542949302292967, + "grad_norm": 4.128645420074463, + "learning_rate": 3.2328039579789376e-05, + "loss": 1.7035, + "step": 35775 + }, + { + "epoch": 1.2551714466026225, + "grad_norm": 3.894669771194458, + "learning_rate": 3.231180771078705e-05, + "loss": 1.7536, + "step": 35800 + }, + { + "epoch": 1.2560479629759485, + "grad_norm": 5.146975517272949, + "learning_rate": 3.2295575841784726e-05, + "loss": 1.8325, + "step": 35825 + }, + { + "epoch": 1.2569244793492742, + "grad_norm": 4.967433929443359, + "learning_rate": 3.227934397278241e-05, + "loss": 1.6782, + "step": 35850 + }, + { + "epoch": 1.2578009957226002, + "grad_norm": 3.9021224975585938, + "learning_rate": 3.226311210378008e-05, + "loss": 1.5777, + "step": 35875 + }, + { + "epoch": 1.258677512095926, + "grad_norm": 11.703023910522461, + "learning_rate": 3.2246880234777756e-05, + "loss": 1.7876, + "step": 35900 + }, + { + "epoch": 1.259554028469252, + "grad_norm": 3.11285138130188, + "learning_rate": 3.223064836577543e-05, + "loss": 1.6516, + "step": 35925 + }, + { + "epoch": 1.2604305448425777, + "grad_norm": 4.384336471557617, + "learning_rate": 3.2214416496773106e-05, + "loss": 1.7613, + "step": 35950 + }, + { + "epoch": 1.2613070612159034, + "grad_norm": 6.687605857849121, + "learning_rate": 3.219818462777078e-05, + "loss": 1.6128, + "step": 35975 + }, + { + "epoch": 1.2621835775892294, + "grad_norm": 4.054645538330078, + "learning_rate": 3.2181952758768455e-05, + "loss": 1.5905, + "step": 36000 + }, + { + "epoch": 1.2630600939625551, + "grad_norm": 3.265498638153076, + "learning_rate": 3.216572088976613e-05, + "loss": 1.7181, + "step": 36025 + }, + { + "epoch": 1.2639366103358811, + "grad_norm": 5.564462184906006, + "learning_rate": 3.2149489020763804e-05, + "loss": 1.892, + "step": 36050 + }, + { + "epoch": 1.2648131267092069, + "grad_norm": 3.955968141555786, + "learning_rate": 3.2133257151761486e-05, + "loss": 1.6216, + "step": 36075 + }, + { + "epoch": 1.2656896430825328, + "grad_norm": 7.751063823699951, + "learning_rate": 3.211702528275916e-05, + "loss": 1.8914, + "step": 36100 + }, + { + "epoch": 1.2665661594558586, + "grad_norm": 3.715543270111084, + "learning_rate": 3.2100793413756835e-05, + "loss": 1.6228, + "step": 36125 + }, + { + "epoch": 1.2674426758291846, + "grad_norm": 5.17460298538208, + "learning_rate": 3.208456154475451e-05, + "loss": 1.7523, + "step": 36150 + }, + { + "epoch": 1.2683191922025103, + "grad_norm": 4.992551803588867, + "learning_rate": 3.2068329675752184e-05, + "loss": 1.691, + "step": 36175 + }, + { + "epoch": 1.269195708575836, + "grad_norm": 4.834174633026123, + "learning_rate": 3.2052097806749866e-05, + "loss": 1.7013, + "step": 36200 + }, + { + "epoch": 1.270072224949162, + "grad_norm": 6.97681188583374, + "learning_rate": 3.203586593774754e-05, + "loss": 1.6583, + "step": 36225 + }, + { + "epoch": 1.270948741322488, + "grad_norm": 5.1908488273620605, + "learning_rate": 3.2019634068745215e-05, + "loss": 1.8719, + "step": 36250 + }, + { + "epoch": 1.2718252576958138, + "grad_norm": 4.424920082092285, + "learning_rate": 3.200340219974289e-05, + "loss": 1.7413, + "step": 36275 + }, + { + "epoch": 1.2727017740691395, + "grad_norm": 3.0645010471343994, + "learning_rate": 3.1987170330740564e-05, + "loss": 1.7423, + "step": 36300 + }, + { + "epoch": 1.2735782904424655, + "grad_norm": 3.6790196895599365, + "learning_rate": 
3.1970938461738246e-05, + "loss": 1.7315, + "step": 36325 + }, + { + "epoch": 1.2744548068157913, + "grad_norm": 2.9263851642608643, + "learning_rate": 3.1954706592735913e-05, + "loss": 1.6663, + "step": 36350 + }, + { + "epoch": 1.2753313231891172, + "grad_norm": 3.30521559715271, + "learning_rate": 3.193847472373359e-05, + "loss": 1.7088, + "step": 36375 + }, + { + "epoch": 1.276207839562443, + "grad_norm": 6.537885665893555, + "learning_rate": 3.192224285473126e-05, + "loss": 1.6143, + "step": 36400 + }, + { + "epoch": 1.2770843559357687, + "grad_norm": 4.29196834564209, + "learning_rate": 3.190601098572894e-05, + "loss": 2.0388, + "step": 36425 + }, + { + "epoch": 1.2779608723090947, + "grad_norm": 3.0255699157714844, + "learning_rate": 3.188977911672662e-05, + "loss": 1.7721, + "step": 36450 + }, + { + "epoch": 1.2788373886824207, + "grad_norm": 3.1006205081939697, + "learning_rate": 3.1873547247724293e-05, + "loss": 1.5647, + "step": 36475 + }, + { + "epoch": 1.2797139050557464, + "grad_norm": 4.409492015838623, + "learning_rate": 3.185731537872197e-05, + "loss": 1.6123, + "step": 36500 + }, + { + "epoch": 1.2805904214290722, + "grad_norm": 4.705583572387695, + "learning_rate": 3.184108350971964e-05, + "loss": 1.4906, + "step": 36525 + }, + { + "epoch": 1.2814669378023982, + "grad_norm": 3.095221519470215, + "learning_rate": 3.182485164071732e-05, + "loss": 1.637, + "step": 36550 + }, + { + "epoch": 1.282343454175724, + "grad_norm": 3.5929512977600098, + "learning_rate": 3.1808619771715e-05, + "loss": 1.9377, + "step": 36575 + }, + { + "epoch": 1.28321997054905, + "grad_norm": 4.935823440551758, + "learning_rate": 3.1792387902712673e-05, + "loss": 1.7357, + "step": 36600 + }, + { + "epoch": 1.2840964869223757, + "grad_norm": 9.214683532714844, + "learning_rate": 3.177615603371035e-05, + "loss": 1.7893, + "step": 36625 + }, + { + "epoch": 1.2849730032957016, + "grad_norm": 4.582629680633545, + "learning_rate": 3.175992416470802e-05, + "loss": 1.6171, + "step": 36650 + }, + { + "epoch": 1.2858495196690274, + "grad_norm": 3.2935731410980225, + "learning_rate": 3.17436922957057e-05, + "loss": 1.5171, + "step": 36675 + }, + { + "epoch": 1.2867260360423534, + "grad_norm": 5.536621570587158, + "learning_rate": 3.172746042670338e-05, + "loss": 1.745, + "step": 36700 + }, + { + "epoch": 1.2876025524156791, + "grad_norm": 4.8144612312316895, + "learning_rate": 3.1711228557701053e-05, + "loss": 1.674, + "step": 36725 + }, + { + "epoch": 1.2884790687890049, + "grad_norm": 3.510770082473755, + "learning_rate": 3.169499668869873e-05, + "loss": 1.6822, + "step": 36750 + }, + { + "epoch": 1.2893555851623308, + "grad_norm": 4.992552280426025, + "learning_rate": 3.1678764819696396e-05, + "loss": 1.9235, + "step": 36775 + }, + { + "epoch": 1.2902321015356568, + "grad_norm": 8.45614242553711, + "learning_rate": 3.166253295069407e-05, + "loss": 1.7391, + "step": 36800 + }, + { + "epoch": 1.2911086179089826, + "grad_norm": 3.498290538787842, + "learning_rate": 3.164630108169175e-05, + "loss": 1.8176, + "step": 36825 + }, + { + "epoch": 1.2919851342823083, + "grad_norm": 6.756962776184082, + "learning_rate": 3.163006921268943e-05, + "loss": 1.6397, + "step": 36850 + }, + { + "epoch": 1.2928616506556343, + "grad_norm": 7.894597053527832, + "learning_rate": 3.16138373436871e-05, + "loss": 1.6428, + "step": 36875 + }, + { + "epoch": 1.29373816702896, + "grad_norm": 3.734628200531006, + "learning_rate": 3.1597605474684776e-05, + "loss": 1.7813, + "step": 36900 + }, + { + "epoch": 1.294614683402286, + 
"grad_norm": 4.598880290985107, + "learning_rate": 3.158137360568245e-05, + "loss": 1.7328, + "step": 36925 + }, + { + "epoch": 1.2954911997756118, + "grad_norm": 4.250609397888184, + "learning_rate": 3.156514173668013e-05, + "loss": 1.7769, + "step": 36950 + }, + { + "epoch": 1.2963677161489375, + "grad_norm": 4.562142848968506, + "learning_rate": 3.154890986767781e-05, + "loss": 1.7571, + "step": 36975 + }, + { + "epoch": 1.2972442325222635, + "grad_norm": 3.832435369491577, + "learning_rate": 3.153267799867548e-05, + "loss": 1.5423, + "step": 37000 + }, + { + "epoch": 1.2981207488955895, + "grad_norm": 5.013363361358643, + "learning_rate": 3.1516446129673156e-05, + "loss": 1.5996, + "step": 37025 + }, + { + "epoch": 1.2989972652689152, + "grad_norm": 3.089940309524536, + "learning_rate": 3.150021426067083e-05, + "loss": 1.7358, + "step": 37050 + }, + { + "epoch": 1.299873781642241, + "grad_norm": 3.569154739379883, + "learning_rate": 3.148398239166851e-05, + "loss": 1.7705, + "step": 37075 + }, + { + "epoch": 1.300750298015567, + "grad_norm": 5.43502140045166, + "learning_rate": 3.146775052266619e-05, + "loss": 1.5084, + "step": 37100 + }, + { + "epoch": 1.3016268143888927, + "grad_norm": 3.063920021057129, + "learning_rate": 3.145151865366386e-05, + "loss": 1.7026, + "step": 37125 + }, + { + "epoch": 1.3025033307622187, + "grad_norm": 6.531482219696045, + "learning_rate": 3.1435286784661536e-05, + "loss": 1.6206, + "step": 37150 + }, + { + "epoch": 1.3033798471355444, + "grad_norm": 3.5023579597473145, + "learning_rate": 3.141905491565921e-05, + "loss": 1.7288, + "step": 37175 + }, + { + "epoch": 1.3042563635088704, + "grad_norm": 4.985012531280518, + "learning_rate": 3.1402823046656885e-05, + "loss": 1.869, + "step": 37200 + }, + { + "epoch": 1.3051328798821962, + "grad_norm": 4.442305088043213, + "learning_rate": 3.138659117765456e-05, + "loss": 1.4604, + "step": 37225 + }, + { + "epoch": 1.3060093962555221, + "grad_norm": 3.581800699234009, + "learning_rate": 3.1370359308652235e-05, + "loss": 1.6059, + "step": 37250 + }, + { + "epoch": 1.306885912628848, + "grad_norm": 2.902801513671875, + "learning_rate": 3.135412743964991e-05, + "loss": 1.7744, + "step": 37275 + }, + { + "epoch": 1.3077624290021737, + "grad_norm": 7.01090145111084, + "learning_rate": 3.133789557064759e-05, + "loss": 1.852, + "step": 37300 + }, + { + "epoch": 1.3086389453754996, + "grad_norm": 4.556249618530273, + "learning_rate": 3.1321663701645265e-05, + "loss": 1.7302, + "step": 37325 + }, + { + "epoch": 1.3095154617488256, + "grad_norm": 2.918569564819336, + "learning_rate": 3.130543183264294e-05, + "loss": 1.9352, + "step": 37350 + }, + { + "epoch": 1.3103919781221514, + "grad_norm": 4.562246799468994, + "learning_rate": 3.1289199963640615e-05, + "loss": 1.6425, + "step": 37375 + }, + { + "epoch": 1.311268494495477, + "grad_norm": 2.8795440196990967, + "learning_rate": 3.127296809463829e-05, + "loss": 1.6705, + "step": 37400 + }, + { + "epoch": 1.312145010868803, + "grad_norm": 3.769611358642578, + "learning_rate": 3.125673622563597e-05, + "loss": 1.6873, + "step": 37425 + }, + { + "epoch": 1.3130215272421288, + "grad_norm": 3.931591510772705, + "learning_rate": 3.1240504356633645e-05, + "loss": 1.799, + "step": 37450 + }, + { + "epoch": 1.3138980436154548, + "grad_norm": 6.639450550079346, + "learning_rate": 3.122427248763132e-05, + "loss": 1.5792, + "step": 37475 + }, + { + "epoch": 1.3147745599887806, + "grad_norm": 4.934543609619141, + "learning_rate": 3.1208040618628995e-05, + "loss": 1.6828, + "step": 
37500 + }, + { + "epoch": 1.3156510763621063, + "grad_norm": 3.531926155090332, + "learning_rate": 3.119180874962667e-05, + "loss": 1.6762, + "step": 37525 + }, + { + "epoch": 1.3165275927354323, + "grad_norm": 3.8164520263671875, + "learning_rate": 3.1175576880624344e-05, + "loss": 1.6359, + "step": 37550 + }, + { + "epoch": 1.3174041091087583, + "grad_norm": 4.766887187957764, + "learning_rate": 3.115934501162202e-05, + "loss": 1.76, + "step": 37575 + }, + { + "epoch": 1.318280625482084, + "grad_norm": 6.7767438888549805, + "learning_rate": 3.114311314261969e-05, + "loss": 1.7363, + "step": 37600 + }, + { + "epoch": 1.3191571418554098, + "grad_norm": 3.2926998138427734, + "learning_rate": 3.112688127361737e-05, + "loss": 1.701, + "step": 37625 + }, + { + "epoch": 1.3200336582287358, + "grad_norm": 11.002528190612793, + "learning_rate": 3.111064940461504e-05, + "loss": 1.7648, + "step": 37650 + }, + { + "epoch": 1.3209101746020615, + "grad_norm": 5.17566442489624, + "learning_rate": 3.1094417535612724e-05, + "loss": 1.8597, + "step": 37675 + }, + { + "epoch": 1.3217866909753875, + "grad_norm": 3.594583034515381, + "learning_rate": 3.10781856666104e-05, + "loss": 1.7517, + "step": 37700 + }, + { + "epoch": 1.3226632073487132, + "grad_norm": 5.364834308624268, + "learning_rate": 3.106195379760807e-05, + "loss": 1.7539, + "step": 37725 + }, + { + "epoch": 1.3235397237220392, + "grad_norm": 6.4243855476379395, + "learning_rate": 3.104572192860575e-05, + "loss": 1.5891, + "step": 37750 + }, + { + "epoch": 1.324416240095365, + "grad_norm": 8.934224128723145, + "learning_rate": 3.102949005960342e-05, + "loss": 1.7915, + "step": 37775 + }, + { + "epoch": 1.325292756468691, + "grad_norm": 5.169863224029541, + "learning_rate": 3.1013258190601104e-05, + "loss": 1.722, + "step": 37800 + }, + { + "epoch": 1.3261692728420167, + "grad_norm": 3.7404873371124268, + "learning_rate": 3.099702632159878e-05, + "loss": 1.8262, + "step": 37825 + }, + { + "epoch": 1.3270457892153424, + "grad_norm": 10.143453598022461, + "learning_rate": 3.098079445259645e-05, + "loss": 1.7412, + "step": 37850 + }, + { + "epoch": 1.3279223055886684, + "grad_norm": 7.068303108215332, + "learning_rate": 3.096456258359413e-05, + "loss": 1.658, + "step": 37875 + }, + { + "epoch": 1.3287988219619942, + "grad_norm": 3.198611259460449, + "learning_rate": 3.09483307145918e-05, + "loss": 1.6642, + "step": 37900 + }, + { + "epoch": 1.3296753383353201, + "grad_norm": 5.6031904220581055, + "learning_rate": 3.093209884558948e-05, + "loss": 1.7386, + "step": 37925 + }, + { + "epoch": 1.330551854708646, + "grad_norm": 3.828928232192993, + "learning_rate": 3.091586697658715e-05, + "loss": 1.6268, + "step": 37950 + }, + { + "epoch": 1.3314283710819719, + "grad_norm": 6.800772666931152, + "learning_rate": 3.0899635107584826e-05, + "loss": 1.7289, + "step": 37975 + }, + { + "epoch": 1.3323048874552976, + "grad_norm": 4.721773147583008, + "learning_rate": 3.08834032385825e-05, + "loss": 1.7285, + "step": 38000 + }, + { + "epoch": 1.3331814038286236, + "grad_norm": 4.706860065460205, + "learning_rate": 3.0867171369580176e-05, + "loss": 1.8497, + "step": 38025 + }, + { + "epoch": 1.3340579202019494, + "grad_norm": 5.310672283172607, + "learning_rate": 3.085093950057786e-05, + "loss": 1.5676, + "step": 38050 + }, + { + "epoch": 1.334934436575275, + "grad_norm": 3.6046810150146484, + "learning_rate": 3.083470763157553e-05, + "loss": 1.6568, + "step": 38075 + }, + { + "epoch": 1.335810952948601, + "grad_norm": 4.70573091506958, + "learning_rate": 
3.0818475762573206e-05, + "loss": 1.7224, + "step": 38100 + }, + { + "epoch": 1.336687469321927, + "grad_norm": 4.847733974456787, + "learning_rate": 3.080224389357088e-05, + "loss": 1.7071, + "step": 38125 + }, + { + "epoch": 1.3375639856952528, + "grad_norm": 4.663862228393555, + "learning_rate": 3.0786012024568556e-05, + "loss": 1.6887, + "step": 38150 + }, + { + "epoch": 1.3384405020685786, + "grad_norm": 4.871497631072998, + "learning_rate": 3.076978015556624e-05, + "loss": 1.6012, + "step": 38175 + }, + { + "epoch": 1.3393170184419045, + "grad_norm": 4.810940265655518, + "learning_rate": 3.075354828656391e-05, + "loss": 1.7756, + "step": 38200 + }, + { + "epoch": 1.3401935348152303, + "grad_norm": 3.9586503505706787, + "learning_rate": 3.0737316417561586e-05, + "loss": 1.7214, + "step": 38225 + }, + { + "epoch": 1.3410700511885563, + "grad_norm": 7.281888484954834, + "learning_rate": 3.072108454855926e-05, + "loss": 1.77, + "step": 38250 + }, + { + "epoch": 1.341946567561882, + "grad_norm": 10.831209182739258, + "learning_rate": 3.070485267955694e-05, + "loss": 1.7135, + "step": 38275 + }, + { + "epoch": 1.3428230839352078, + "grad_norm": 4.594773769378662, + "learning_rate": 3.068862081055461e-05, + "loss": 1.7768, + "step": 38300 + }, + { + "epoch": 1.3436996003085337, + "grad_norm": 4.028745651245117, + "learning_rate": 3.0672388941552285e-05, + "loss": 1.6291, + "step": 38325 + }, + { + "epoch": 1.3445761166818597, + "grad_norm": 3.7432808876037598, + "learning_rate": 3.065615707254996e-05, + "loss": 1.7671, + "step": 38350 + }, + { + "epoch": 1.3454526330551855, + "grad_norm": 5.106191635131836, + "learning_rate": 3.0639925203547634e-05, + "loss": 1.7181, + "step": 38375 + }, + { + "epoch": 1.3463291494285112, + "grad_norm": 3.416844129562378, + "learning_rate": 3.0623693334545316e-05, + "loss": 1.8201, + "step": 38400 + }, + { + "epoch": 1.3472056658018372, + "grad_norm": 3.3406858444213867, + "learning_rate": 3.060746146554299e-05, + "loss": 1.6306, + "step": 38425 + }, + { + "epoch": 1.348082182175163, + "grad_norm": 3.3163771629333496, + "learning_rate": 3.0591229596540665e-05, + "loss": 1.7693, + "step": 38450 + }, + { + "epoch": 1.348958698548489, + "grad_norm": 11.846861839294434, + "learning_rate": 3.057499772753834e-05, + "loss": 1.828, + "step": 38475 + }, + { + "epoch": 1.3498352149218147, + "grad_norm": 3.8585193157196045, + "learning_rate": 3.0558765858536014e-05, + "loss": 1.6569, + "step": 38500 + }, + { + "epoch": 1.3507117312951407, + "grad_norm": 3.1589348316192627, + "learning_rate": 3.0542533989533696e-05, + "loss": 1.7358, + "step": 38525 + }, + { + "epoch": 1.3515882476684664, + "grad_norm": 4.430388927459717, + "learning_rate": 3.052630212053137e-05, + "loss": 1.7459, + "step": 38550 + }, + { + "epoch": 1.3524647640417924, + "grad_norm": 9.75197982788086, + "learning_rate": 3.0510070251529045e-05, + "loss": 1.7558, + "step": 38575 + }, + { + "epoch": 1.3533412804151181, + "grad_norm": 3.5475902557373047, + "learning_rate": 3.049383838252672e-05, + "loss": 1.6, + "step": 38600 + }, + { + "epoch": 1.354217796788444, + "grad_norm": 4.8683929443359375, + "learning_rate": 3.0477606513524394e-05, + "loss": 1.6816, + "step": 38625 + }, + { + "epoch": 1.3550943131617699, + "grad_norm": 5.273770809173584, + "learning_rate": 3.0461374644522072e-05, + "loss": 1.7298, + "step": 38650 + }, + { + "epoch": 1.3559708295350958, + "grad_norm": 6.389270305633545, + "learning_rate": 3.0445142775519747e-05, + "loss": 1.6471, + "step": 38675 + }, + { + "epoch": 
1.3568473459084216, + "grad_norm": 4.977392673492432, + "learning_rate": 3.042891090651742e-05, + "loss": 1.7512, + "step": 38700 + }, + { + "epoch": 1.3577238622817474, + "grad_norm": 3.59269642829895, + "learning_rate": 3.0412679037515096e-05, + "loss": 1.5909, + "step": 38725 + }, + { + "epoch": 1.3586003786550733, + "grad_norm": 4.649091720581055, + "learning_rate": 3.039644716851277e-05, + "loss": 1.6975, + "step": 38750 + }, + { + "epoch": 1.359476895028399, + "grad_norm": 4.556885242462158, + "learning_rate": 3.038021529951045e-05, + "loss": 1.9217, + "step": 38775 + }, + { + "epoch": 1.360353411401725, + "grad_norm": 8.439022064208984, + "learning_rate": 3.0363983430508124e-05, + "loss": 1.7597, + "step": 38800 + }, + { + "epoch": 1.3612299277750508, + "grad_norm": 3.1670708656311035, + "learning_rate": 3.0347751561505798e-05, + "loss": 1.5974, + "step": 38825 + }, + { + "epoch": 1.3621064441483766, + "grad_norm": 7.222106456756592, + "learning_rate": 3.0331519692503473e-05, + "loss": 1.6073, + "step": 38850 + }, + { + "epoch": 1.3629829605217025, + "grad_norm": 9.27762222290039, + "learning_rate": 3.0315287823501148e-05, + "loss": 1.7686, + "step": 38875 + }, + { + "epoch": 1.3638594768950285, + "grad_norm": 5.2907395362854, + "learning_rate": 3.029905595449883e-05, + "loss": 1.7771, + "step": 38900 + }, + { + "epoch": 1.3647359932683543, + "grad_norm": 4.561214923858643, + "learning_rate": 3.0282824085496504e-05, + "loss": 1.7085, + "step": 38925 + }, + { + "epoch": 1.36561250964168, + "grad_norm": 3.287623405456543, + "learning_rate": 3.0266592216494178e-05, + "loss": 1.719, + "step": 38950 + }, + { + "epoch": 1.366489026015006, + "grad_norm": 4.685939788818359, + "learning_rate": 3.0250360347491853e-05, + "loss": 1.6378, + "step": 38975 + }, + { + "epoch": 1.3673655423883317, + "grad_norm": 8.011797904968262, + "learning_rate": 3.0234128478489528e-05, + "loss": 1.7297, + "step": 39000 + }, + { + "epoch": 1.3682420587616577, + "grad_norm": 6.5263237953186035, + "learning_rate": 3.0217896609487206e-05, + "loss": 1.5643, + "step": 39025 + }, + { + "epoch": 1.3691185751349835, + "grad_norm": 5.667575836181641, + "learning_rate": 3.020166474048488e-05, + "loss": 1.6656, + "step": 39050 + }, + { + "epoch": 1.3699950915083094, + "grad_norm": 10.147462844848633, + "learning_rate": 3.0185432871482555e-05, + "loss": 1.9281, + "step": 39075 + }, + { + "epoch": 1.3708716078816352, + "grad_norm": 4.23015832901001, + "learning_rate": 3.016920100248023e-05, + "loss": 1.5699, + "step": 39100 + }, + { + "epoch": 1.3717481242549612, + "grad_norm": 6.946453094482422, + "learning_rate": 3.0152969133477904e-05, + "loss": 1.6085, + "step": 39125 + }, + { + "epoch": 1.372624640628287, + "grad_norm": 3.689013719558716, + "learning_rate": 3.0136737264475586e-05, + "loss": 1.7291, + "step": 39150 + }, + { + "epoch": 1.3735011570016127, + "grad_norm": 10.221916198730469, + "learning_rate": 3.012050539547326e-05, + "loss": 1.6745, + "step": 39175 + }, + { + "epoch": 1.3743776733749387, + "grad_norm": 4.998299598693848, + "learning_rate": 3.010427352647093e-05, + "loss": 1.6017, + "step": 39200 + }, + { + "epoch": 1.3752541897482646, + "grad_norm": 4.031212329864502, + "learning_rate": 3.0088041657468606e-05, + "loss": 1.7051, + "step": 39225 + }, + { + "epoch": 1.3761307061215904, + "grad_norm": 3.0789272785186768, + "learning_rate": 3.007180978846628e-05, + "loss": 1.889, + "step": 39250 + }, + { + "epoch": 1.3770072224949161, + "grad_norm": 5.7297563552856445, + "learning_rate": 3.0055577919463962e-05, 
+ "loss": 1.6931, + "step": 39275 + }, + { + "epoch": 1.3778837388682421, + "grad_norm": 9.960515022277832, + "learning_rate": 3.0039346050461637e-05, + "loss": 1.5989, + "step": 39300 + }, + { + "epoch": 1.3787602552415679, + "grad_norm": 4.811001300811768, + "learning_rate": 3.002311418145931e-05, + "loss": 1.7638, + "step": 39325 + }, + { + "epoch": 1.3796367716148938, + "grad_norm": 8.660208702087402, + "learning_rate": 3.0006882312456986e-05, + "loss": 1.8072, + "step": 39350 + }, + { + "epoch": 1.3805132879882196, + "grad_norm": 5.4923930168151855, + "learning_rate": 2.9990650443454664e-05, + "loss": 1.6417, + "step": 39375 + }, + { + "epoch": 1.3813898043615453, + "grad_norm": 4.320521354675293, + "learning_rate": 2.997441857445234e-05, + "loss": 1.6107, + "step": 39400 + }, + { + "epoch": 1.3822663207348713, + "grad_norm": 4.352777004241943, + "learning_rate": 2.9958186705450013e-05, + "loss": 1.8236, + "step": 39425 + }, + { + "epoch": 1.3831428371081973, + "grad_norm": 4.426023483276367, + "learning_rate": 2.9941954836447688e-05, + "loss": 1.7996, + "step": 39450 + }, + { + "epoch": 1.384019353481523, + "grad_norm": 7.008428573608398, + "learning_rate": 2.9925722967445363e-05, + "loss": 1.5672, + "step": 39475 + }, + { + "epoch": 1.3848958698548488, + "grad_norm": 7.020242691040039, + "learning_rate": 2.9909491098443044e-05, + "loss": 1.6717, + "step": 39500 + }, + { + "epoch": 1.3857723862281748, + "grad_norm": 8.760161399841309, + "learning_rate": 2.989325922944072e-05, + "loss": 1.5697, + "step": 39525 + }, + { + "epoch": 1.3866489026015005, + "grad_norm": 3.2164878845214844, + "learning_rate": 2.9877027360438393e-05, + "loss": 1.7955, + "step": 39550 + }, + { + "epoch": 1.3875254189748265, + "grad_norm": 3.243166208267212, + "learning_rate": 2.9860795491436068e-05, + "loss": 1.6338, + "step": 39575 + }, + { + "epoch": 1.3884019353481523, + "grad_norm": 4.6208319664001465, + "learning_rate": 2.984456362243374e-05, + "loss": 1.6908, + "step": 39600 + }, + { + "epoch": 1.3892784517214782, + "grad_norm": 10.963808059692383, + "learning_rate": 2.982833175343142e-05, + "loss": 1.6109, + "step": 39625 + }, + { + "epoch": 1.390154968094804, + "grad_norm": 7.057652473449707, + "learning_rate": 2.9812099884429095e-05, + "loss": 1.8133, + "step": 39650 + }, + { + "epoch": 1.39103148446813, + "grad_norm": 4.457187652587891, + "learning_rate": 2.979586801542677e-05, + "loss": 1.7885, + "step": 39675 + }, + { + "epoch": 1.3919080008414557, + "grad_norm": 4.3722639083862305, + "learning_rate": 2.9779636146424445e-05, + "loss": 1.5911, + "step": 39700 + }, + { + "epoch": 1.3927845172147815, + "grad_norm": 3.315735101699829, + "learning_rate": 2.976340427742212e-05, + "loss": 1.7196, + "step": 39725 + }, + { + "epoch": 1.3936610335881074, + "grad_norm": 5.211250305175781, + "learning_rate": 2.9747172408419797e-05, + "loss": 1.6506, + "step": 39750 + }, + { + "epoch": 1.3945375499614332, + "grad_norm": 6.400726318359375, + "learning_rate": 2.9730940539417472e-05, + "loss": 1.772, + "step": 39775 + }, + { + "epoch": 1.3954140663347592, + "grad_norm": 6.720447540283203, + "learning_rate": 2.9714708670415147e-05, + "loss": 1.6433, + "step": 39800 + }, + { + "epoch": 1.396290582708085, + "grad_norm": 4.0358476638793945, + "learning_rate": 2.969847680141282e-05, + "loss": 1.6351, + "step": 39825 + }, + { + "epoch": 1.397167099081411, + "grad_norm": 5.438233375549316, + "learning_rate": 2.9682244932410496e-05, + "loss": 1.7891, + "step": 39850 + }, + { + "epoch": 1.3980436154547367, + "grad_norm": 
6.147374153137207, + "learning_rate": 2.9666013063408177e-05, + "loss": 1.6212, + "step": 39875 + }, + { + "epoch": 1.3989201318280626, + "grad_norm": 4.017621994018555, + "learning_rate": 2.9649781194405852e-05, + "loss": 1.7115, + "step": 39900 + }, + { + "epoch": 1.3997966482013884, + "grad_norm": 4.179771900177002, + "learning_rate": 2.9633549325403527e-05, + "loss": 1.8847, + "step": 39925 + }, + { + "epoch": 1.4006731645747141, + "grad_norm": 4.444271564483643, + "learning_rate": 2.96173174564012e-05, + "loss": 1.7979, + "step": 39950 + }, + { + "epoch": 1.40154968094804, + "grad_norm": 4.12730598449707, + "learning_rate": 2.9601085587398876e-05, + "loss": 1.8037, + "step": 39975 + }, + { + "epoch": 1.402426197321366, + "grad_norm": 5.099287033081055, + "learning_rate": 2.9584853718396554e-05, + "loss": 1.7097, + "step": 40000 + }, + { + "epoch": 1.4033027136946918, + "grad_norm": 5.896966457366943, + "learning_rate": 2.956862184939423e-05, + "loss": 1.6711, + "step": 40025 + }, + { + "epoch": 1.4041792300680176, + "grad_norm": 7.882509231567383, + "learning_rate": 2.9552389980391903e-05, + "loss": 1.878, + "step": 40050 + }, + { + "epoch": 1.4050557464413436, + "grad_norm": 5.627753734588623, + "learning_rate": 2.9536158111389578e-05, + "loss": 1.8922, + "step": 40075 + }, + { + "epoch": 1.4059322628146693, + "grad_norm": 4.8547163009643555, + "learning_rate": 2.9519926242387253e-05, + "loss": 1.6261, + "step": 40100 + }, + { + "epoch": 1.4068087791879953, + "grad_norm": 3.536393165588379, + "learning_rate": 2.9503694373384934e-05, + "loss": 1.9623, + "step": 40125 + }, + { + "epoch": 1.407685295561321, + "grad_norm": 4.4823222160339355, + "learning_rate": 2.9487462504382605e-05, + "loss": 1.7286, + "step": 40150 + }, + { + "epoch": 1.4085618119346468, + "grad_norm": 4.886867046356201, + "learning_rate": 2.947123063538028e-05, + "loss": 1.614, + "step": 40175 + }, + { + "epoch": 1.4094383283079728, + "grad_norm": 5.535654544830322, + "learning_rate": 2.9454998766377955e-05, + "loss": 1.6473, + "step": 40200 + }, + { + "epoch": 1.4103148446812988, + "grad_norm": 10.975579261779785, + "learning_rate": 2.943876689737563e-05, + "loss": 1.7515, + "step": 40225 + }, + { + "epoch": 1.4111913610546245, + "grad_norm": 8.717653274536133, + "learning_rate": 2.942253502837331e-05, + "loss": 1.6993, + "step": 40250 + }, + { + "epoch": 1.4120678774279503, + "grad_norm": 9.524510383605957, + "learning_rate": 2.9406303159370985e-05, + "loss": 1.8043, + "step": 40275 + }, + { + "epoch": 1.4129443938012762, + "grad_norm": 8.502878189086914, + "learning_rate": 2.939007129036866e-05, + "loss": 1.7534, + "step": 40300 + }, + { + "epoch": 1.413820910174602, + "grad_norm": 9.911563873291016, + "learning_rate": 2.9373839421366335e-05, + "loss": 1.6642, + "step": 40325 + }, + { + "epoch": 1.414697426547928, + "grad_norm": 5.0485734939575195, + "learning_rate": 2.935760755236401e-05, + "loss": 1.7002, + "step": 40350 + }, + { + "epoch": 1.4155739429212537, + "grad_norm": 5.639858722686768, + "learning_rate": 2.9341375683361687e-05, + "loss": 1.7726, + "step": 40375 + }, + { + "epoch": 1.4164504592945797, + "grad_norm": 3.90643572807312, + "learning_rate": 2.9325143814359362e-05, + "loss": 1.767, + "step": 40400 + }, + { + "epoch": 1.4173269756679054, + "grad_norm": 9.091129302978516, + "learning_rate": 2.9308911945357037e-05, + "loss": 1.7622, + "step": 40425 + }, + { + "epoch": 1.4182034920412314, + "grad_norm": 5.284657955169678, + "learning_rate": 2.929268007635471e-05, + "loss": 1.8105, + "step": 40450 + 
}, + { + "epoch": 1.4190800084145572, + "grad_norm": 3.5910520553588867, + "learning_rate": 2.9276448207352386e-05, + "loss": 1.8297, + "step": 40475 + }, + { + "epoch": 1.419956524787883, + "grad_norm": 5.045989990234375, + "learning_rate": 2.9260216338350067e-05, + "loss": 1.5524, + "step": 40500 + }, + { + "epoch": 1.420833041161209, + "grad_norm": 5.05369234085083, + "learning_rate": 2.9243984469347742e-05, + "loss": 1.8392, + "step": 40525 + }, + { + "epoch": 1.4217095575345349, + "grad_norm": 3.4966213703155518, + "learning_rate": 2.9227752600345417e-05, + "loss": 1.507, + "step": 40550 + }, + { + "epoch": 1.4225860739078606, + "grad_norm": 5.0198822021484375, + "learning_rate": 2.9211520731343088e-05, + "loss": 1.7399, + "step": 40575 + }, + { + "epoch": 1.4234625902811864, + "grad_norm": 13.08991813659668, + "learning_rate": 2.919528886234077e-05, + "loss": 1.727, + "step": 40600 + }, + { + "epoch": 1.4243391066545124, + "grad_norm": 3.279853343963623, + "learning_rate": 2.9179056993338444e-05, + "loss": 1.688, + "step": 40625 + }, + { + "epoch": 1.425215623027838, + "grad_norm": 5.380153179168701, + "learning_rate": 2.916282512433612e-05, + "loss": 1.6466, + "step": 40650 + }, + { + "epoch": 1.426092139401164, + "grad_norm": 3.2769737243652344, + "learning_rate": 2.9146593255333793e-05, + "loss": 1.7112, + "step": 40675 + }, + { + "epoch": 1.4269686557744898, + "grad_norm": 8.26187801361084, + "learning_rate": 2.9130361386331468e-05, + "loss": 1.7798, + "step": 40700 + }, + { + "epoch": 1.4278451721478156, + "grad_norm": 3.17978572845459, + "learning_rate": 2.9114129517329146e-05, + "loss": 1.7874, + "step": 40725 + }, + { + "epoch": 1.4287216885211416, + "grad_norm": 10.601489067077637, + "learning_rate": 2.909789764832682e-05, + "loss": 1.659, + "step": 40750 + }, + { + "epoch": 1.4295982048944675, + "grad_norm": 3.4151501655578613, + "learning_rate": 2.9081665779324495e-05, + "loss": 1.7061, + "step": 40775 + }, + { + "epoch": 1.4304747212677933, + "grad_norm": 6.118912696838379, + "learning_rate": 2.906543391032217e-05, + "loss": 1.7345, + "step": 40800 + }, + { + "epoch": 1.431351237641119, + "grad_norm": 5.693037986755371, + "learning_rate": 2.9049202041319844e-05, + "loss": 1.555, + "step": 40825 + }, + { + "epoch": 1.432227754014445, + "grad_norm": 4.426840305328369, + "learning_rate": 2.9032970172317526e-05, + "loss": 1.6014, + "step": 40850 + }, + { + "epoch": 1.4331042703877708, + "grad_norm": 3.1415064334869385, + "learning_rate": 2.90167383033152e-05, + "loss": 1.5327, + "step": 40875 + }, + { + "epoch": 1.4339807867610967, + "grad_norm": 8.058968544006348, + "learning_rate": 2.9000506434312875e-05, + "loss": 1.7449, + "step": 40900 + }, + { + "epoch": 1.4348573031344225, + "grad_norm": 4.121816158294678, + "learning_rate": 2.898427456531055e-05, + "loss": 1.7035, + "step": 40925 + }, + { + "epoch": 1.4357338195077485, + "grad_norm": 4.494134426116943, + "learning_rate": 2.8968042696308224e-05, + "loss": 1.6145, + "step": 40950 + }, + { + "epoch": 1.4366103358810742, + "grad_norm": 11.607808113098145, + "learning_rate": 2.8951810827305902e-05, + "loss": 1.8061, + "step": 40975 + }, + { + "epoch": 1.4374868522544002, + "grad_norm": 5.735234260559082, + "learning_rate": 2.8935578958303577e-05, + "loss": 1.6435, + "step": 41000 + }, + { + "epoch": 1.438363368627726, + "grad_norm": 5.897707462310791, + "learning_rate": 2.8919347089301252e-05, + "loss": 1.691, + "step": 41025 + }, + { + "epoch": 1.4392398850010517, + "grad_norm": 3.623082160949707, + "learning_rate": 
2.8903115220298926e-05, + "loss": 1.9323, + "step": 41050 + }, + { + "epoch": 1.4401164013743777, + "grad_norm": 4.9649248123168945, + "learning_rate": 2.88868833512966e-05, + "loss": 1.8059, + "step": 41075 + }, + { + "epoch": 1.4409929177477037, + "grad_norm": 4.807201862335205, + "learning_rate": 2.8870651482294282e-05, + "loss": 1.7652, + "step": 41100 + }, + { + "epoch": 1.4418694341210294, + "grad_norm": 7.252068042755127, + "learning_rate": 2.8854419613291954e-05, + "loss": 1.6797, + "step": 41125 + }, + { + "epoch": 1.4427459504943552, + "grad_norm": 4.438121795654297, + "learning_rate": 2.883818774428963e-05, + "loss": 1.6787, + "step": 41150 + }, + { + "epoch": 1.4436224668676811, + "grad_norm": 4.505495548248291, + "learning_rate": 2.8821955875287303e-05, + "loss": 1.7111, + "step": 41175 + }, + { + "epoch": 1.444498983241007, + "grad_norm": 5.1515350341796875, + "learning_rate": 2.8805724006284978e-05, + "loss": 1.8798, + "step": 41200 + }, + { + "epoch": 1.4453754996143329, + "grad_norm": 5.004429817199707, + "learning_rate": 2.878949213728266e-05, + "loss": 1.7224, + "step": 41225 + }, + { + "epoch": 1.4462520159876586, + "grad_norm": 4.963683128356934, + "learning_rate": 2.8773260268280334e-05, + "loss": 1.6061, + "step": 41250 + }, + { + "epoch": 1.4471285323609844, + "grad_norm": 5.645432949066162, + "learning_rate": 2.875702839927801e-05, + "loss": 1.6049, + "step": 41275 + }, + { + "epoch": 1.4480050487343104, + "grad_norm": 4.890880584716797, + "learning_rate": 2.8740796530275683e-05, + "loss": 1.4116, + "step": 41300 + }, + { + "epoch": 1.4488815651076363, + "grad_norm": 5.556877136230469, + "learning_rate": 2.8724564661273358e-05, + "loss": 1.5776, + "step": 41325 + }, + { + "epoch": 1.449758081480962, + "grad_norm": 4.7523908615112305, + "learning_rate": 2.8708332792271036e-05, + "loss": 1.6914, + "step": 41350 + }, + { + "epoch": 1.4506345978542878, + "grad_norm": 8.238377571105957, + "learning_rate": 2.869210092326871e-05, + "loss": 1.6216, + "step": 41375 + }, + { + "epoch": 1.4515111142276138, + "grad_norm": 5.427084445953369, + "learning_rate": 2.8675869054266385e-05, + "loss": 1.7796, + "step": 41400 + }, + { + "epoch": 1.4523876306009396, + "grad_norm": 5.702132701873779, + "learning_rate": 2.865963718526406e-05, + "loss": 1.6086, + "step": 41425 + }, + { + "epoch": 1.4532641469742655, + "grad_norm": 5.482133865356445, + "learning_rate": 2.8643405316261734e-05, + "loss": 1.7151, + "step": 41450 + }, + { + "epoch": 1.4541406633475913, + "grad_norm": 9.090470314025879, + "learning_rate": 2.8627173447259416e-05, + "loss": 1.7279, + "step": 41475 + }, + { + "epoch": 1.4550171797209173, + "grad_norm": 3.1185860633850098, + "learning_rate": 2.861094157825709e-05, + "loss": 1.6554, + "step": 41500 + }, + { + "epoch": 1.455893696094243, + "grad_norm": 5.439527988433838, + "learning_rate": 2.8594709709254765e-05, + "loss": 1.8765, + "step": 41525 + }, + { + "epoch": 1.456770212467569, + "grad_norm": 5.290731430053711, + "learning_rate": 2.8578477840252436e-05, + "loss": 1.6329, + "step": 41550 + }, + { + "epoch": 1.4576467288408947, + "grad_norm": 5.432644367218018, + "learning_rate": 2.856224597125011e-05, + "loss": 1.6579, + "step": 41575 + }, + { + "epoch": 1.4585232452142205, + "grad_norm": 3.1819963455200195, + "learning_rate": 2.8546014102247792e-05, + "loss": 1.6017, + "step": 41600 + }, + { + "epoch": 1.4593997615875465, + "grad_norm": 4.199990749359131, + "learning_rate": 2.8529782233245467e-05, + "loss": 1.8544, + "step": 41625 + }, + { + "epoch": 
1.4602762779608722, + "grad_norm": 3.1017990112304688, + "learning_rate": 2.851355036424314e-05, + "loss": 1.679, + "step": 41650 + }, + { + "epoch": 1.4611527943341982, + "grad_norm": 5.270884037017822, + "learning_rate": 2.8497318495240816e-05, + "loss": 1.9649, + "step": 41675 + }, + { + "epoch": 1.462029310707524, + "grad_norm": 4.49120569229126, + "learning_rate": 2.8481086626238494e-05, + "loss": 1.7546, + "step": 41700 + }, + { + "epoch": 1.46290582708085, + "grad_norm": 5.345789432525635, + "learning_rate": 2.846485475723617e-05, + "loss": 1.8028, + "step": 41725 + }, + { + "epoch": 1.4637823434541757, + "grad_norm": 4.959238052368164, + "learning_rate": 2.8448622888233844e-05, + "loss": 1.7176, + "step": 41750 + }, + { + "epoch": 1.4646588598275017, + "grad_norm": 8.672146797180176, + "learning_rate": 2.8432391019231518e-05, + "loss": 1.6508, + "step": 41775 + }, + { + "epoch": 1.4655353762008274, + "grad_norm": 3.2933592796325684, + "learning_rate": 2.8416159150229193e-05, + "loss": 1.634, + "step": 41800 + }, + { + "epoch": 1.4664118925741532, + "grad_norm": 5.783046722412109, + "learning_rate": 2.8399927281226874e-05, + "loss": 1.8738, + "step": 41825 + }, + { + "epoch": 1.4672884089474791, + "grad_norm": 8.762445449829102, + "learning_rate": 2.838369541222455e-05, + "loss": 1.8004, + "step": 41850 + }, + { + "epoch": 1.4681649253208051, + "grad_norm": 3.2268946170806885, + "learning_rate": 2.8367463543222224e-05, + "loss": 1.7397, + "step": 41875 + }, + { + "epoch": 1.4690414416941309, + "grad_norm": 3.827584981918335, + "learning_rate": 2.8351231674219898e-05, + "loss": 1.7112, + "step": 41900 + }, + { + "epoch": 1.4699179580674566, + "grad_norm": 7.226800441741943, + "learning_rate": 2.8334999805217573e-05, + "loss": 1.7481, + "step": 41925 + }, + { + "epoch": 1.4707944744407826, + "grad_norm": 5.120868682861328, + "learning_rate": 2.831876793621525e-05, + "loss": 1.6177, + "step": 41950 + }, + { + "epoch": 1.4716709908141083, + "grad_norm": 8.19483470916748, + "learning_rate": 2.8302536067212926e-05, + "loss": 1.734, + "step": 41975 + }, + { + "epoch": 1.4725475071874343, + "grad_norm": 4.128667831420898, + "learning_rate": 2.82863041982106e-05, + "loss": 1.895, + "step": 42000 + }, + { + "epoch": 1.47342402356076, + "grad_norm": 7.316489219665527, + "learning_rate": 2.8270072329208275e-05, + "loss": 1.6182, + "step": 42025 + }, + { + "epoch": 1.474300539934086, + "grad_norm": 5.4408721923828125, + "learning_rate": 2.825384046020595e-05, + "loss": 1.6662, + "step": 42050 + }, + { + "epoch": 1.4751770563074118, + "grad_norm": 5.16735315322876, + "learning_rate": 2.8237608591203628e-05, + "loss": 1.6254, + "step": 42075 + }, + { + "epoch": 1.4760535726807378, + "grad_norm": 7.734886169433594, + "learning_rate": 2.8221376722201302e-05, + "loss": 1.8601, + "step": 42100 + }, + { + "epoch": 1.4769300890540635, + "grad_norm": 6.003991603851318, + "learning_rate": 2.8205144853198977e-05, + "loss": 1.741, + "step": 42125 + }, + { + "epoch": 1.4778066054273893, + "grad_norm": 5.665590286254883, + "learning_rate": 2.818891298419665e-05, + "loss": 1.7316, + "step": 42150 + }, + { + "epoch": 1.4786831218007153, + "grad_norm": 6.747347831726074, + "learning_rate": 2.8172681115194326e-05, + "loss": 1.739, + "step": 42175 + }, + { + "epoch": 1.479559638174041, + "grad_norm": 5.4637556076049805, + "learning_rate": 2.8156449246192008e-05, + "loss": 1.6445, + "step": 42200 + }, + { + "epoch": 1.480436154547367, + "grad_norm": 6.445901393890381, + "learning_rate": 2.8140217377189682e-05, + 
"loss": 1.6278, + "step": 42225 + }, + { + "epoch": 1.4813126709206927, + "grad_norm": 5.169373035430908, + "learning_rate": 2.8123985508187357e-05, + "loss": 1.6091, + "step": 42250 + }, + { + "epoch": 1.4821891872940187, + "grad_norm": 5.670196056365967, + "learning_rate": 2.810775363918503e-05, + "loss": 1.5129, + "step": 42275 + }, + { + "epoch": 1.4830657036673445, + "grad_norm": 4.358458995819092, + "learning_rate": 2.8091521770182706e-05, + "loss": 1.6393, + "step": 42300 + }, + { + "epoch": 1.4839422200406704, + "grad_norm": 3.971757411956787, + "learning_rate": 2.8075289901180384e-05, + "loss": 1.8666, + "step": 42325 + }, + { + "epoch": 1.4848187364139962, + "grad_norm": 10.5786714553833, + "learning_rate": 2.805905803217806e-05, + "loss": 1.7051, + "step": 42350 + }, + { + "epoch": 1.485695252787322, + "grad_norm": 3.25716495513916, + "learning_rate": 2.8042826163175733e-05, + "loss": 1.5943, + "step": 42375 + }, + { + "epoch": 1.486571769160648, + "grad_norm": 4.715320110321045, + "learning_rate": 2.8026594294173408e-05, + "loss": 1.9286, + "step": 42400 + }, + { + "epoch": 1.487448285533974, + "grad_norm": 9.6753568649292, + "learning_rate": 2.8010362425171083e-05, + "loss": 1.5303, + "step": 42425 + }, + { + "epoch": 1.4883248019072997, + "grad_norm": 8.707253456115723, + "learning_rate": 2.7994130556168764e-05, + "loss": 1.7325, + "step": 42450 + }, + { + "epoch": 1.4892013182806254, + "grad_norm": 6.7807087898254395, + "learning_rate": 2.797789868716644e-05, + "loss": 1.5329, + "step": 42475 + }, + { + "epoch": 1.4900778346539514, + "grad_norm": 4.58537483215332, + "learning_rate": 2.796166681816411e-05, + "loss": 1.8713, + "step": 42500 + }, + { + "epoch": 1.4909543510272771, + "grad_norm": 11.407867431640625, + "learning_rate": 2.7945434949161785e-05, + "loss": 1.8328, + "step": 42525 + }, + { + "epoch": 1.4918308674006031, + "grad_norm": 4.452264308929443, + "learning_rate": 2.792920308015946e-05, + "loss": 1.7238, + "step": 42550 + }, + { + "epoch": 1.4927073837739289, + "grad_norm": 5.456773281097412, + "learning_rate": 2.791297121115714e-05, + "loss": 1.5382, + "step": 42575 + }, + { + "epoch": 1.4935839001472546, + "grad_norm": 3.829904794692993, + "learning_rate": 2.7896739342154815e-05, + "loss": 1.7836, + "step": 42600 + }, + { + "epoch": 1.4944604165205806, + "grad_norm": 4.590965270996094, + "learning_rate": 2.788050747315249e-05, + "loss": 1.7986, + "step": 42625 + }, + { + "epoch": 1.4953369328939066, + "grad_norm": 7.6305155754089355, + "learning_rate": 2.7864275604150165e-05, + "loss": 1.83, + "step": 42650 + }, + { + "epoch": 1.4962134492672323, + "grad_norm": 5.5344557762146, + "learning_rate": 2.784804373514784e-05, + "loss": 1.52, + "step": 42675 + }, + { + "epoch": 1.497089965640558, + "grad_norm": 4.056852340698242, + "learning_rate": 2.7831811866145517e-05, + "loss": 1.7033, + "step": 42700 + }, + { + "epoch": 1.497966482013884, + "grad_norm": 6.037247180938721, + "learning_rate": 2.7815579997143192e-05, + "loss": 1.7253, + "step": 42725 + }, + { + "epoch": 1.4988429983872098, + "grad_norm": 5.510132789611816, + "learning_rate": 2.7799348128140867e-05, + "loss": 1.6842, + "step": 42750 + }, + { + "epoch": 1.4997195147605358, + "grad_norm": 9.735057830810547, + "learning_rate": 2.778311625913854e-05, + "loss": 1.7134, + "step": 42775 + }, + { + "epoch": 1.5005960311338615, + "grad_norm": 4.701155185699463, + "learning_rate": 2.7766884390136223e-05, + "loss": 1.7, + "step": 42800 + }, + { + "epoch": 1.5014725475071873, + "grad_norm": 3.894798755645752, 
+ "learning_rate": 2.7750652521133897e-05, + "loss": 1.6177, + "step": 42825 + }, + { + "epoch": 1.5023490638805133, + "grad_norm": 7.776621341705322, + "learning_rate": 2.7734420652131572e-05, + "loss": 1.5908, + "step": 42850 + }, + { + "epoch": 1.5032255802538392, + "grad_norm": 4.043931484222412, + "learning_rate": 2.7718188783129247e-05, + "loss": 1.7605, + "step": 42875 + }, + { + "epoch": 1.504102096627165, + "grad_norm": 4.0506157875061035, + "learning_rate": 2.770195691412692e-05, + "loss": 1.6597, + "step": 42900 + }, + { + "epoch": 1.5049786130004907, + "grad_norm": 6.529386520385742, + "learning_rate": 2.76857250451246e-05, + "loss": 1.8009, + "step": 42925 + }, + { + "epoch": 1.5058551293738167, + "grad_norm": 5.745316982269287, + "learning_rate": 2.7669493176122274e-05, + "loss": 1.647, + "step": 42950 + }, + { + "epoch": 1.5067316457471427, + "grad_norm": 9.816146850585938, + "learning_rate": 2.765326130711995e-05, + "loss": 1.7391, + "step": 42975 + }, + { + "epoch": 1.5076081621204684, + "grad_norm": 4.046548843383789, + "learning_rate": 2.7637029438117623e-05, + "loss": 1.6439, + "step": 43000 + }, + { + "epoch": 1.5084846784937942, + "grad_norm": 4.897729396820068, + "learning_rate": 2.7620797569115298e-05, + "loss": 1.7959, + "step": 43025 + }, + { + "epoch": 1.5093611948671202, + "grad_norm": 5.10048770904541, + "learning_rate": 2.7604565700112976e-05, + "loss": 1.7039, + "step": 43050 + }, + { + "epoch": 1.510237711240446, + "grad_norm": 4.402742862701416, + "learning_rate": 2.758833383111065e-05, + "loss": 1.7855, + "step": 43075 + }, + { + "epoch": 1.511114227613772, + "grad_norm": 3.5327882766723633, + "learning_rate": 2.7572101962108325e-05, + "loss": 1.719, + "step": 43100 + }, + { + "epoch": 1.5119907439870977, + "grad_norm": 4.636297702789307, + "learning_rate": 2.7555870093106e-05, + "loss": 1.8062, + "step": 43125 + }, + { + "epoch": 1.5128672603604234, + "grad_norm": 6.225801944732666, + "learning_rate": 2.7539638224103675e-05, + "loss": 1.7192, + "step": 43150 + }, + { + "epoch": 1.5137437767337494, + "grad_norm": 3.7468364238739014, + "learning_rate": 2.7523406355101356e-05, + "loss": 1.6527, + "step": 43175 + }, + { + "epoch": 1.5146202931070754, + "grad_norm": 3.919508457183838, + "learning_rate": 2.750717448609903e-05, + "loss": 1.7773, + "step": 43200 + }, + { + "epoch": 1.515496809480401, + "grad_norm": 7.054482936859131, + "learning_rate": 2.7490942617096705e-05, + "loss": 1.4854, + "step": 43225 + }, + { + "epoch": 1.5163733258537269, + "grad_norm": 3.211181402206421, + "learning_rate": 2.747471074809438e-05, + "loss": 1.7407, + "step": 43250 + }, + { + "epoch": 1.5172498422270528, + "grad_norm": 4.979792594909668, + "learning_rate": 2.7458478879092055e-05, + "loss": 1.6637, + "step": 43275 + }, + { + "epoch": 1.5181263586003788, + "grad_norm": 5.2386040687561035, + "learning_rate": 2.7442247010089733e-05, + "loss": 1.731, + "step": 43300 + }, + { + "epoch": 1.5190028749737046, + "grad_norm": 4.169620037078857, + "learning_rate": 2.7426015141087407e-05, + "loss": 1.6915, + "step": 43325 + }, + { + "epoch": 1.5198793913470303, + "grad_norm": 5.1745991706848145, + "learning_rate": 2.7409783272085082e-05, + "loss": 1.7234, + "step": 43350 + }, + { + "epoch": 1.520755907720356, + "grad_norm": 11.17226505279541, + "learning_rate": 2.7393551403082757e-05, + "loss": 1.9059, + "step": 43375 + }, + { + "epoch": 1.521632424093682, + "grad_norm": 5.5368146896362305, + "learning_rate": 2.737731953408043e-05, + "loss": 1.6187, + "step": 43400 + }, + { + "epoch": 
1.522508940467008, + "grad_norm": 4.033499717712402, + "learning_rate": 2.7361087665078113e-05, + "loss": 1.8078, + "step": 43425 + }, + { + "epoch": 1.5233854568403338, + "grad_norm": 6.552101135253906, + "learning_rate": 2.7344855796075787e-05, + "loss": 1.785, + "step": 43450 + }, + { + "epoch": 1.5242619732136595, + "grad_norm": 3.383706569671631, + "learning_rate": 2.732862392707346e-05, + "loss": 1.6833, + "step": 43475 + }, + { + "epoch": 1.5251384895869855, + "grad_norm": 5.029642105102539, + "learning_rate": 2.7312392058071133e-05, + "loss": 1.6855, + "step": 43500 + }, + { + "epoch": 1.5260150059603115, + "grad_norm": 3.2033305168151855, + "learning_rate": 2.7296160189068808e-05, + "loss": 1.6597, + "step": 43525 + }, + { + "epoch": 1.5268915223336372, + "grad_norm": 3.9210758209228516, + "learning_rate": 2.727992832006649e-05, + "loss": 1.6735, + "step": 43550 + }, + { + "epoch": 1.527768038706963, + "grad_norm": 8.81615924835205, + "learning_rate": 2.7263696451064164e-05, + "loss": 1.761, + "step": 43575 + }, + { + "epoch": 1.5286445550802887, + "grad_norm": 4.172060012817383, + "learning_rate": 2.724746458206184e-05, + "loss": 1.6778, + "step": 43600 + }, + { + "epoch": 1.5295210714536147, + "grad_norm": 11.73141098022461, + "learning_rate": 2.7231232713059513e-05, + "loss": 1.7295, + "step": 43625 + }, + { + "epoch": 1.5303975878269407, + "grad_norm": 8.56633186340332, + "learning_rate": 2.7215000844057188e-05, + "loss": 1.7383, + "step": 43650 + }, + { + "epoch": 1.5312741042002664, + "grad_norm": 7.758413791656494, + "learning_rate": 2.7198768975054866e-05, + "loss": 1.6498, + "step": 43675 + }, + { + "epoch": 1.5321506205735922, + "grad_norm": 4.2075066566467285, + "learning_rate": 2.718253710605254e-05, + "loss": 1.6888, + "step": 43700 + }, + { + "epoch": 1.5330271369469182, + "grad_norm": 5.744470596313477, + "learning_rate": 2.7166305237050215e-05, + "loss": 1.8266, + "step": 43725 + }, + { + "epoch": 1.5339036533202441, + "grad_norm": 8.253798484802246, + "learning_rate": 2.715007336804789e-05, + "loss": 1.7778, + "step": 43750 + }, + { + "epoch": 1.53478016969357, + "grad_norm": 3.9458231925964355, + "learning_rate": 2.7133841499045564e-05, + "loss": 1.6211, + "step": 43775 + }, + { + "epoch": 1.5356566860668956, + "grad_norm": 7.463688850402832, + "learning_rate": 2.7117609630043246e-05, + "loss": 1.5588, + "step": 43800 + }, + { + "epoch": 1.5365332024402216, + "grad_norm": 5.605813503265381, + "learning_rate": 2.710137776104092e-05, + "loss": 1.8281, + "step": 43825 + }, + { + "epoch": 1.5374097188135476, + "grad_norm": 5.214519500732422, + "learning_rate": 2.7085145892038595e-05, + "loss": 1.7308, + "step": 43850 + }, + { + "epoch": 1.5382862351868734, + "grad_norm": 3.852630376815796, + "learning_rate": 2.7068914023036266e-05, + "loss": 1.6398, + "step": 43875 + }, + { + "epoch": 1.539162751560199, + "grad_norm": 7.625736713409424, + "learning_rate": 2.7052682154033948e-05, + "loss": 1.7071, + "step": 43900 + }, + { + "epoch": 1.5400392679335249, + "grad_norm": 4.949635982513428, + "learning_rate": 2.7036450285031623e-05, + "loss": 1.7475, + "step": 43925 + }, + { + "epoch": 1.5409157843068508, + "grad_norm": 5.469110488891602, + "learning_rate": 2.7020218416029297e-05, + "loss": 1.7622, + "step": 43950 + }, + { + "epoch": 1.5417923006801768, + "grad_norm": 7.908682823181152, + "learning_rate": 2.7003986547026972e-05, + "loss": 1.8515, + "step": 43975 + }, + { + "epoch": 1.5426688170535026, + "grad_norm": 7.061572074890137, + "learning_rate": 
2.6987754678024646e-05, + "loss": 1.831, + "step": 44000 + }, + { + "epoch": 1.5435453334268283, + "grad_norm": 3.650700569152832, + "learning_rate": 2.6971522809022324e-05, + "loss": 1.6382, + "step": 44025 + }, + { + "epoch": 1.5444218498001543, + "grad_norm": 3.6606802940368652, + "learning_rate": 2.695529094002e-05, + "loss": 1.7134, + "step": 44050 + }, + { + "epoch": 1.5452983661734803, + "grad_norm": 3.9104857444763184, + "learning_rate": 2.6939059071017674e-05, + "loss": 1.754, + "step": 44075 + }, + { + "epoch": 1.546174882546806, + "grad_norm": 5.324155330657959, + "learning_rate": 2.692282720201535e-05, + "loss": 1.7029, + "step": 44100 + }, + { + "epoch": 1.5470513989201318, + "grad_norm": 3.6383779048919678, + "learning_rate": 2.6906595333013023e-05, + "loss": 1.6603, + "step": 44125 + }, + { + "epoch": 1.5479279152934575, + "grad_norm": 3.3966617584228516, + "learning_rate": 2.6890363464010704e-05, + "loss": 1.7789, + "step": 44150 + }, + { + "epoch": 1.5488044316667835, + "grad_norm": 4.788395881652832, + "learning_rate": 2.687413159500838e-05, + "loss": 1.7408, + "step": 44175 + }, + { + "epoch": 1.5496809480401095, + "grad_norm": 5.642242908477783, + "learning_rate": 2.6857899726006054e-05, + "loss": 1.7851, + "step": 44200 + }, + { + "epoch": 1.5505574644134352, + "grad_norm": 3.7619571685791016, + "learning_rate": 2.684166785700373e-05, + "loss": 1.7589, + "step": 44225 + }, + { + "epoch": 1.551433980786761, + "grad_norm": 3.898613929748535, + "learning_rate": 2.6825435988001403e-05, + "loss": 1.9112, + "step": 44250 + }, + { + "epoch": 1.552310497160087, + "grad_norm": 4.305700302124023, + "learning_rate": 2.680920411899908e-05, + "loss": 1.6754, + "step": 44275 + }, + { + "epoch": 1.553187013533413, + "grad_norm": 4.16615629196167, + "learning_rate": 2.6792972249996756e-05, + "loss": 1.7118, + "step": 44300 + }, + { + "epoch": 1.5540635299067387, + "grad_norm": 3.374859094619751, + "learning_rate": 2.677674038099443e-05, + "loss": 1.8269, + "step": 44325 + }, + { + "epoch": 1.5549400462800644, + "grad_norm": 5.1734442710876465, + "learning_rate": 2.6760508511992105e-05, + "loss": 1.8278, + "step": 44350 + }, + { + "epoch": 1.5558165626533904, + "grad_norm": 4.2271647453308105, + "learning_rate": 2.674427664298978e-05, + "loss": 1.7921, + "step": 44375 + }, + { + "epoch": 1.5566930790267162, + "grad_norm": 5.084333896636963, + "learning_rate": 2.672804477398746e-05, + "loss": 1.6909, + "step": 44400 + }, + { + "epoch": 1.5575695954000421, + "grad_norm": 3.8225436210632324, + "learning_rate": 2.6711812904985132e-05, + "loss": 1.6712, + "step": 44425 + }, + { + "epoch": 1.558446111773368, + "grad_norm": 5.419105529785156, + "learning_rate": 2.6695581035982807e-05, + "loss": 1.6028, + "step": 44450 + }, + { + "epoch": 1.5593226281466936, + "grad_norm": 3.7679717540740967, + "learning_rate": 2.667934916698048e-05, + "loss": 1.6575, + "step": 44475 + }, + { + "epoch": 1.5601991445200196, + "grad_norm": 6.5742411613464355, + "learning_rate": 2.6663117297978156e-05, + "loss": 1.623, + "step": 44500 + }, + { + "epoch": 1.5610756608933456, + "grad_norm": 7.135974407196045, + "learning_rate": 2.6646885428975838e-05, + "loss": 1.6441, + "step": 44525 + }, + { + "epoch": 1.5619521772666713, + "grad_norm": 7.5504255294799805, + "learning_rate": 2.6630653559973512e-05, + "loss": 1.7097, + "step": 44550 + }, + { + "epoch": 1.562828693639997, + "grad_norm": 7.51389741897583, + "learning_rate": 2.6614421690971187e-05, + "loss": 1.6711, + "step": 44575 + }, + { + "epoch": 
1.563705210013323, + "grad_norm": 5.313292503356934, + "learning_rate": 2.659818982196886e-05, + "loss": 1.6814, + "step": 44600 + }, + { + "epoch": 1.564581726386649, + "grad_norm": 7.748077392578125, + "learning_rate": 2.6581957952966536e-05, + "loss": 1.7697, + "step": 44625 + }, + { + "epoch": 1.5654582427599748, + "grad_norm": 5.231040000915527, + "learning_rate": 2.6565726083964214e-05, + "loss": 1.6163, + "step": 44650 + }, + { + "epoch": 1.5663347591333006, + "grad_norm": 4.059249401092529, + "learning_rate": 2.654949421496189e-05, + "loss": 1.6534, + "step": 44675 + }, + { + "epoch": 1.5672112755066263, + "grad_norm": 8.183411598205566, + "learning_rate": 2.6533262345959564e-05, + "loss": 1.8827, + "step": 44700 + }, + { + "epoch": 1.5680877918799523, + "grad_norm": 4.997443675994873, + "learning_rate": 2.6517030476957238e-05, + "loss": 1.7258, + "step": 44725 + }, + { + "epoch": 1.5689643082532783, + "grad_norm": 5.019183158874512, + "learning_rate": 2.6500798607954913e-05, + "loss": 1.5905, + "step": 44750 + }, + { + "epoch": 1.569840824626604, + "grad_norm": 7.976799964904785, + "learning_rate": 2.6484566738952594e-05, + "loss": 1.6497, + "step": 44775 + }, + { + "epoch": 1.5707173409999298, + "grad_norm": 3.2926299571990967, + "learning_rate": 2.646833486995027e-05, + "loss": 1.7619, + "step": 44800 + }, + { + "epoch": 1.5715938573732557, + "grad_norm": 5.085639953613281, + "learning_rate": 2.6452103000947944e-05, + "loss": 1.6913, + "step": 44825 + }, + { + "epoch": 1.5724703737465817, + "grad_norm": 5.212141990661621, + "learning_rate": 2.6435871131945615e-05, + "loss": 1.6641, + "step": 44850 + }, + { + "epoch": 1.5733468901199075, + "grad_norm": 5.663016319274902, + "learning_rate": 2.641963926294329e-05, + "loss": 1.7825, + "step": 44875 + }, + { + "epoch": 1.5742234064932332, + "grad_norm": 5.066298961639404, + "learning_rate": 2.640340739394097e-05, + "loss": 1.5448, + "step": 44900 + }, + { + "epoch": 1.5750999228665592, + "grad_norm": 3.931999921798706, + "learning_rate": 2.6387175524938646e-05, + "loss": 1.9483, + "step": 44925 + }, + { + "epoch": 1.575976439239885, + "grad_norm": 7.803314208984375, + "learning_rate": 2.637094365593632e-05, + "loss": 1.7568, + "step": 44950 + }, + { + "epoch": 1.576852955613211, + "grad_norm": 6.254623889923096, + "learning_rate": 2.6354711786933995e-05, + "loss": 1.681, + "step": 44975 + }, + { + "epoch": 1.5777294719865367, + "grad_norm": 4.104866027832031, + "learning_rate": 2.6338479917931673e-05, + "loss": 1.8166, + "step": 45000 + }, + { + "epoch": 1.5786059883598624, + "grad_norm": 5.768352031707764, + "learning_rate": 2.6322248048929348e-05, + "loss": 1.5814, + "step": 45025 + }, + { + "epoch": 1.5794825047331884, + "grad_norm": 5.7847771644592285, + "learning_rate": 2.6306016179927022e-05, + "loss": 1.7544, + "step": 45050 + }, + { + "epoch": 1.5803590211065144, + "grad_norm": 3.270430564880371, + "learning_rate": 2.6289784310924697e-05, + "loss": 1.638, + "step": 45075 + }, + { + "epoch": 1.5812355374798401, + "grad_norm": 3.5221643447875977, + "learning_rate": 2.627355244192237e-05, + "loss": 1.7406, + "step": 45100 + }, + { + "epoch": 1.582112053853166, + "grad_norm": 6.36970853805542, + "learning_rate": 2.6257320572920053e-05, + "loss": 1.8179, + "step": 45125 + }, + { + "epoch": 1.5829885702264919, + "grad_norm": 5.196274280548096, + "learning_rate": 2.6241088703917728e-05, + "loss": 1.6219, + "step": 45150 + }, + { + "epoch": 1.5838650865998178, + "grad_norm": 5.7737250328063965, + "learning_rate": 
2.6224856834915402e-05, + "loss": 1.7624, + "step": 45175 + }, + { + "epoch": 1.5847416029731436, + "grad_norm": 5.617661476135254, + "learning_rate": 2.6208624965913077e-05, + "loss": 1.7125, + "step": 45200 + }, + { + "epoch": 1.5856181193464693, + "grad_norm": 5.276029586791992, + "learning_rate": 2.619239309691075e-05, + "loss": 1.7653, + "step": 45225 + }, + { + "epoch": 1.586494635719795, + "grad_norm": 4.512963771820068, + "learning_rate": 2.617616122790843e-05, + "loss": 1.6917, + "step": 45250 + }, + { + "epoch": 1.587371152093121, + "grad_norm": 3.8339197635650635, + "learning_rate": 2.6159929358906104e-05, + "loss": 1.6247, + "step": 45275 + }, + { + "epoch": 1.588247668466447, + "grad_norm": 5.8529133796691895, + "learning_rate": 2.614369748990378e-05, + "loss": 1.646, + "step": 45300 + }, + { + "epoch": 1.5891241848397728, + "grad_norm": 5.452278137207031, + "learning_rate": 2.6127465620901453e-05, + "loss": 1.6965, + "step": 45325 + }, + { + "epoch": 1.5900007012130986, + "grad_norm": 3.170767307281494, + "learning_rate": 2.6111233751899128e-05, + "loss": 1.7423, + "step": 45350 + }, + { + "epoch": 1.5908772175864245, + "grad_norm": 7.901683807373047, + "learning_rate": 2.609500188289681e-05, + "loss": 1.6301, + "step": 45375 + }, + { + "epoch": 1.5917537339597505, + "grad_norm": 3.4312217235565186, + "learning_rate": 2.607877001389448e-05, + "loss": 1.7023, + "step": 45400 + }, + { + "epoch": 1.5926302503330763, + "grad_norm": 4.831717491149902, + "learning_rate": 2.6062538144892155e-05, + "loss": 1.7593, + "step": 45425 + }, + { + "epoch": 1.593506766706402, + "grad_norm": 5.930304050445557, + "learning_rate": 2.604630627588983e-05, + "loss": 1.8082, + "step": 45450 + }, + { + "epoch": 1.5943832830797278, + "grad_norm": 4.276729583740234, + "learning_rate": 2.6030074406887505e-05, + "loss": 1.823, + "step": 45475 + }, + { + "epoch": 1.5952597994530537, + "grad_norm": 4.215411186218262, + "learning_rate": 2.6013842537885186e-05, + "loss": 1.7052, + "step": 45500 + }, + { + "epoch": 1.5961363158263797, + "grad_norm": 5.094689846038818, + "learning_rate": 2.599761066888286e-05, + "loss": 1.6954, + "step": 45525 + }, + { + "epoch": 1.5970128321997055, + "grad_norm": 5.492352485656738, + "learning_rate": 2.5981378799880535e-05, + "loss": 1.7768, + "step": 45550 + }, + { + "epoch": 1.5978893485730312, + "grad_norm": 5.252201080322266, + "learning_rate": 2.596514693087821e-05, + "loss": 1.6429, + "step": 45575 + }, + { + "epoch": 1.5987658649463572, + "grad_norm": 9.471896171569824, + "learning_rate": 2.5948915061875885e-05, + "loss": 1.6547, + "step": 45600 + }, + { + "epoch": 1.5996423813196832, + "grad_norm": 4.121469020843506, + "learning_rate": 2.5932683192873563e-05, + "loss": 1.7179, + "step": 45625 + }, + { + "epoch": 1.600518897693009, + "grad_norm": 5.290020942687988, + "learning_rate": 2.5916451323871237e-05, + "loss": 1.4837, + "step": 45650 + }, + { + "epoch": 1.6013954140663347, + "grad_norm": 4.250192642211914, + "learning_rate": 2.5900219454868912e-05, + "loss": 1.7283, + "step": 45675 + }, + { + "epoch": 1.6022719304396607, + "grad_norm": 3.507981777191162, + "learning_rate": 2.5883987585866587e-05, + "loss": 1.5329, + "step": 45700 + }, + { + "epoch": 1.6031484468129866, + "grad_norm": 4.7535810470581055, + "learning_rate": 2.586775571686426e-05, + "loss": 1.6455, + "step": 45725 + }, + { + "epoch": 1.6040249631863124, + "grad_norm": 5.767619609832764, + "learning_rate": 2.5851523847861943e-05, + "loss": 1.6822, + "step": 45750 + }, + { + "epoch": 
1.6049014795596381, + "grad_norm": 5.024048805236816, + "learning_rate": 2.5835291978859617e-05, + "loss": 1.7594, + "step": 45775 + }, + { + "epoch": 1.6057779959329639, + "grad_norm": 5.923440933227539, + "learning_rate": 2.5819060109857292e-05, + "loss": 1.6676, + "step": 45800 + }, + { + "epoch": 1.6066545123062899, + "grad_norm": 4.087776184082031, + "learning_rate": 2.5802828240854963e-05, + "loss": 1.7111, + "step": 45825 + }, + { + "epoch": 1.6075310286796158, + "grad_norm": 7.226887226104736, + "learning_rate": 2.5786596371852638e-05, + "loss": 1.6693, + "step": 45850 + }, + { + "epoch": 1.6084075450529416, + "grad_norm": 7.586813449859619, + "learning_rate": 2.577036450285032e-05, + "loss": 1.6996, + "step": 45875 + }, + { + "epoch": 1.6092840614262673, + "grad_norm": 5.4278244972229, + "learning_rate": 2.5754132633847994e-05, + "loss": 1.671, + "step": 45900 + }, + { + "epoch": 1.6101605777995933, + "grad_norm": 4.652691841125488, + "learning_rate": 2.573790076484567e-05, + "loss": 1.8332, + "step": 45925 + }, + { + "epoch": 1.6110370941729193, + "grad_norm": 5.290700435638428, + "learning_rate": 2.5721668895843343e-05, + "loss": 1.5048, + "step": 45950 + }, + { + "epoch": 1.611913610546245, + "grad_norm": 3.3494927883148193, + "learning_rate": 2.5705437026841018e-05, + "loss": 1.7868, + "step": 45975 + }, + { + "epoch": 1.6127901269195708, + "grad_norm": 5.033234596252441, + "learning_rate": 2.5689205157838696e-05, + "loss": 1.7298, + "step": 46000 + }, + { + "epoch": 1.6136666432928966, + "grad_norm": 3.2991814613342285, + "learning_rate": 2.567297328883637e-05, + "loss": 1.7059, + "step": 46025 + }, + { + "epoch": 1.6145431596662225, + "grad_norm": 3.196774959564209, + "learning_rate": 2.5656741419834045e-05, + "loss": 1.7482, + "step": 46050 + }, + { + "epoch": 1.6154196760395485, + "grad_norm": 5.274489402770996, + "learning_rate": 2.564050955083172e-05, + "loss": 1.6599, + "step": 46075 + }, + { + "epoch": 1.6162961924128743, + "grad_norm": 6.3980326652526855, + "learning_rate": 2.5624277681829395e-05, + "loss": 1.7216, + "step": 46100 + }, + { + "epoch": 1.6171727087862, + "grad_norm": 7.111309051513672, + "learning_rate": 2.5608045812827076e-05, + "loss": 1.9299, + "step": 46125 + }, + { + "epoch": 1.618049225159526, + "grad_norm": 4.152295112609863, + "learning_rate": 2.559181394382475e-05, + "loss": 1.7551, + "step": 46150 + }, + { + "epoch": 1.618925741532852, + "grad_norm": 4.3329267501831055, + "learning_rate": 2.5575582074822425e-05, + "loss": 1.6082, + "step": 46175 + }, + { + "epoch": 1.6198022579061777, + "grad_norm": 4.791024684906006, + "learning_rate": 2.55593502058201e-05, + "loss": 1.7769, + "step": 46200 + }, + { + "epoch": 1.6206787742795035, + "grad_norm": 8.804193496704102, + "learning_rate": 2.5543118336817778e-05, + "loss": 1.7196, + "step": 46225 + }, + { + "epoch": 1.6215552906528294, + "grad_norm": 8.565489768981934, + "learning_rate": 2.5526886467815453e-05, + "loss": 1.6625, + "step": 46250 + }, + { + "epoch": 1.6224318070261554, + "grad_norm": 4.071286678314209, + "learning_rate": 2.5510654598813127e-05, + "loss": 1.5889, + "step": 46275 + }, + { + "epoch": 1.6233083233994812, + "grad_norm": 4.140472412109375, + "learning_rate": 2.5494422729810802e-05, + "loss": 1.6133, + "step": 46300 + }, + { + "epoch": 1.624184839772807, + "grad_norm": 6.452969074249268, + "learning_rate": 2.5478190860808477e-05, + "loss": 1.6509, + "step": 46325 + }, + { + "epoch": 1.6250613561461327, + "grad_norm": 5.349558353424072, + "learning_rate": 
2.5461958991806155e-05, + "loss": 1.7431, + "step": 46350 + }, + { + "epoch": 1.6259378725194586, + "grad_norm": 5.624844551086426, + "learning_rate": 2.544572712280383e-05, + "loss": 1.6966, + "step": 46375 + }, + { + "epoch": 1.6268143888927846, + "grad_norm": 4.070344924926758, + "learning_rate": 2.5429495253801504e-05, + "loss": 1.7764, + "step": 46400 + }, + { + "epoch": 1.6276909052661104, + "grad_norm": 8.119019508361816, + "learning_rate": 2.541326338479918e-05, + "loss": 1.766, + "step": 46425 + }, + { + "epoch": 1.6285674216394361, + "grad_norm": 6.889199733734131, + "learning_rate": 2.5397031515796853e-05, + "loss": 1.6703, + "step": 46450 + }, + { + "epoch": 1.629443938012762, + "grad_norm": 7.113272190093994, + "learning_rate": 2.5380799646794535e-05, + "loss": 1.7996, + "step": 46475 + }, + { + "epoch": 1.630320454386088, + "grad_norm": 5.308492660522461, + "learning_rate": 2.536456777779221e-05, + "loss": 1.5732, + "step": 46500 + }, + { + "epoch": 1.6311969707594138, + "grad_norm": 6.758711338043213, + "learning_rate": 2.5348335908789884e-05, + "loss": 1.7745, + "step": 46525 + }, + { + "epoch": 1.6320734871327396, + "grad_norm": 5.365380764007568, + "learning_rate": 2.533210403978756e-05, + "loss": 1.8064, + "step": 46550 + }, + { + "epoch": 1.6329500035060653, + "grad_norm": 4.188457012176514, + "learning_rate": 2.5315872170785233e-05, + "loss": 1.7032, + "step": 46575 + }, + { + "epoch": 1.6338265198793913, + "grad_norm": 3.7492713928222656, + "learning_rate": 2.529964030178291e-05, + "loss": 1.6535, + "step": 46600 + }, + { + "epoch": 1.6347030362527173, + "grad_norm": 4.869802951812744, + "learning_rate": 2.5283408432780586e-05, + "loss": 1.6724, + "step": 46625 + }, + { + "epoch": 1.635579552626043, + "grad_norm": 4.506399631500244, + "learning_rate": 2.526717656377826e-05, + "loss": 1.7047, + "step": 46650 + }, + { + "epoch": 1.6364560689993688, + "grad_norm": 8.555376052856445, + "learning_rate": 2.5250944694775935e-05, + "loss": 1.5554, + "step": 46675 + }, + { + "epoch": 1.6373325853726948, + "grad_norm": 8.136832237243652, + "learning_rate": 2.523471282577361e-05, + "loss": 1.7825, + "step": 46700 + }, + { + "epoch": 1.6382091017460207, + "grad_norm": 6.392385482788086, + "learning_rate": 2.521848095677129e-05, + "loss": 1.7826, + "step": 46725 + }, + { + "epoch": 1.6390856181193465, + "grad_norm": 5.159294605255127, + "learning_rate": 2.5202249087768966e-05, + "loss": 1.7411, + "step": 46750 + }, + { + "epoch": 1.6399621344926723, + "grad_norm": 3.868372678756714, + "learning_rate": 2.5186017218766637e-05, + "loss": 1.676, + "step": 46775 + }, + { + "epoch": 1.6408386508659982, + "grad_norm": 3.948577880859375, + "learning_rate": 2.5169785349764312e-05, + "loss": 1.7508, + "step": 46800 + }, + { + "epoch": 1.641715167239324, + "grad_norm": 6.344902515411377, + "learning_rate": 2.5153553480761986e-05, + "loss": 1.4854, + "step": 46825 + }, + { + "epoch": 1.64259168361265, + "grad_norm": 3.124419689178467, + "learning_rate": 2.5137321611759668e-05, + "loss": 1.6111, + "step": 46850 + }, + { + "epoch": 1.6434681999859757, + "grad_norm": 5.306356430053711, + "learning_rate": 2.5121089742757343e-05, + "loss": 1.6432, + "step": 46875 + }, + { + "epoch": 1.6443447163593015, + "grad_norm": 5.927875995635986, + "learning_rate": 2.5104857873755017e-05, + "loss": 1.4396, + "step": 46900 + }, + { + "epoch": 1.6452212327326274, + "grad_norm": 5.023409366607666, + "learning_rate": 2.5088626004752692e-05, + "loss": 1.8786, + "step": 46925 + }, + { + "epoch": 
1.6460977491059534, + "grad_norm": 5.565727233886719, + "learning_rate": 2.5072394135750366e-05, + "loss": 1.8907, + "step": 46950 + }, + { + "epoch": 1.6469742654792792, + "grad_norm": 7.064675807952881, + "learning_rate": 2.5056162266748044e-05, + "loss": 1.5696, + "step": 46975 + }, + { + "epoch": 1.647850781852605, + "grad_norm": 6.869761943817139, + "learning_rate": 2.503993039774572e-05, + "loss": 1.6715, + "step": 47000 + }, + { + "epoch": 1.648727298225931, + "grad_norm": 5.138162136077881, + "learning_rate": 2.5023698528743394e-05, + "loss": 1.745, + "step": 47025 + }, + { + "epoch": 1.6496038145992569, + "grad_norm": 5.711406230926514, + "learning_rate": 2.500746665974107e-05, + "loss": 1.7872, + "step": 47050 + }, + { + "epoch": 1.6504803309725826, + "grad_norm": 5.903377532958984, + "learning_rate": 2.4991234790738746e-05, + "loss": 1.6315, + "step": 47075 + }, + { + "epoch": 1.6513568473459084, + "grad_norm": 5.116617202758789, + "learning_rate": 2.497500292173642e-05, + "loss": 1.6695, + "step": 47100 + }, + { + "epoch": 1.6522333637192341, + "grad_norm": 4.229494571685791, + "learning_rate": 2.49587710527341e-05, + "loss": 1.7042, + "step": 47125 + }, + { + "epoch": 1.65310988009256, + "grad_norm": 9.045339584350586, + "learning_rate": 2.4942539183731774e-05, + "loss": 1.6885, + "step": 47150 + }, + { + "epoch": 1.653986396465886, + "grad_norm": 7.164298057556152, + "learning_rate": 2.492630731472945e-05, + "loss": 1.6745, + "step": 47175 + }, + { + "epoch": 1.6548629128392118, + "grad_norm": 5.415690898895264, + "learning_rate": 2.4910075445727123e-05, + "loss": 1.7302, + "step": 47200 + }, + { + "epoch": 1.6557394292125376, + "grad_norm": 4.04472017288208, + "learning_rate": 2.4893843576724798e-05, + "loss": 1.6833, + "step": 47225 + }, + { + "epoch": 1.6566159455858636, + "grad_norm": 3.6857705116271973, + "learning_rate": 2.4877611707722476e-05, + "loss": 1.693, + "step": 47250 + }, + { + "epoch": 1.6574924619591895, + "grad_norm": 6.415241241455078, + "learning_rate": 2.486137983872015e-05, + "loss": 1.7529, + "step": 47275 + }, + { + "epoch": 1.6583689783325153, + "grad_norm": 6.862452030181885, + "learning_rate": 2.484514796971783e-05, + "loss": 1.6919, + "step": 47300 + }, + { + "epoch": 1.659245494705841, + "grad_norm": 6.960758209228516, + "learning_rate": 2.4828916100715503e-05, + "loss": 1.7433, + "step": 47325 + }, + { + "epoch": 1.6601220110791668, + "grad_norm": 7.103149890899658, + "learning_rate": 2.4812684231713178e-05, + "loss": 1.7291, + "step": 47350 + }, + { + "epoch": 1.6609985274524928, + "grad_norm": 7.586047649383545, + "learning_rate": 2.4796452362710852e-05, + "loss": 1.6836, + "step": 47375 + }, + { + "epoch": 1.6618750438258187, + "grad_norm": 8.701732635498047, + "learning_rate": 2.4780220493708527e-05, + "loss": 1.5502, + "step": 47400 + }, + { + "epoch": 1.6627515601991445, + "grad_norm": 6.425958633422852, + "learning_rate": 2.4763988624706205e-05, + "loss": 1.6689, + "step": 47425 + }, + { + "epoch": 1.6636280765724702, + "grad_norm": 3.2147409915924072, + "learning_rate": 2.474775675570388e-05, + "loss": 1.6946, + "step": 47450 + }, + { + "epoch": 1.6645045929457962, + "grad_norm": 5.6546549797058105, + "learning_rate": 2.4731524886701554e-05, + "loss": 1.7262, + "step": 47475 + }, + { + "epoch": 1.6653811093191222, + "grad_norm": 4.279230117797852, + "learning_rate": 2.4715293017699232e-05, + "loss": 1.8842, + "step": 47500 + }, + { + "epoch": 1.666257625692448, + "grad_norm": 5.654289245605469, + "learning_rate": 2.4699061148696907e-05, 
+ "loss": 1.6489, + "step": 47525 + }, + { + "epoch": 1.6671341420657737, + "grad_norm": 9.096597671508789, + "learning_rate": 2.468282927969458e-05, + "loss": 1.5849, + "step": 47550 + }, + { + "epoch": 1.6680106584390997, + "grad_norm": 8.85276985168457, + "learning_rate": 2.4666597410692256e-05, + "loss": 1.6959, + "step": 47575 + }, + { + "epoch": 1.6688871748124257, + "grad_norm": 5.5670084953308105, + "learning_rate": 2.4650365541689934e-05, + "loss": 1.5809, + "step": 47600 + }, + { + "epoch": 1.6697636911857514, + "grad_norm": 5.48366641998291, + "learning_rate": 2.463413367268761e-05, + "loss": 1.8261, + "step": 47625 + }, + { + "epoch": 1.6706402075590772, + "grad_norm": 3.685605764389038, + "learning_rate": 2.4617901803685284e-05, + "loss": 1.822, + "step": 47650 + }, + { + "epoch": 1.671516723932403, + "grad_norm": 5.668590068817139, + "learning_rate": 2.460166993468296e-05, + "loss": 1.5991, + "step": 47675 + }, + { + "epoch": 1.672393240305729, + "grad_norm": 5.12325382232666, + "learning_rate": 2.4585438065680636e-05, + "loss": 1.6171, + "step": 47700 + }, + { + "epoch": 1.6732697566790549, + "grad_norm": 3.366123914718628, + "learning_rate": 2.4569206196678314e-05, + "loss": 1.7124, + "step": 47725 + }, + { + "epoch": 1.6741462730523806, + "grad_norm": 3.6215405464172363, + "learning_rate": 2.4552974327675986e-05, + "loss": 1.5393, + "step": 47750 + }, + { + "epoch": 1.6750227894257064, + "grad_norm": 5.997084617614746, + "learning_rate": 2.453674245867366e-05, + "loss": 1.7962, + "step": 47775 + }, + { + "epoch": 1.6758993057990323, + "grad_norm": 4.404360771179199, + "learning_rate": 2.4520510589671338e-05, + "loss": 1.7233, + "step": 47800 + }, + { + "epoch": 1.6767758221723583, + "grad_norm": 6.291036128997803, + "learning_rate": 2.4504278720669013e-05, + "loss": 1.6123, + "step": 47825 + }, + { + "epoch": 1.677652338545684, + "grad_norm": 10.182136535644531, + "learning_rate": 2.448804685166669e-05, + "loss": 1.4832, + "step": 47850 + }, + { + "epoch": 1.6785288549190098, + "grad_norm": 4.585189342498779, + "learning_rate": 2.4471814982664366e-05, + "loss": 1.7608, + "step": 47875 + }, + { + "epoch": 1.6794053712923356, + "grad_norm": 3.5971858501434326, + "learning_rate": 2.445558311366204e-05, + "loss": 1.5924, + "step": 47900 + }, + { + "epoch": 1.6802818876656616, + "grad_norm": 3.5670604705810547, + "learning_rate": 2.4439351244659718e-05, + "loss": 1.6858, + "step": 47925 + }, + { + "epoch": 1.6811584040389875, + "grad_norm": 4.908016681671143, + "learning_rate": 2.442311937565739e-05, + "loss": 1.7304, + "step": 47950 + }, + { + "epoch": 1.6820349204123133, + "grad_norm": 7.072869300842285, + "learning_rate": 2.4406887506655068e-05, + "loss": 1.7029, + "step": 47975 + }, + { + "epoch": 1.682911436785639, + "grad_norm": 3.2202091217041016, + "learning_rate": 2.4390655637652742e-05, + "loss": 1.8698, + "step": 48000 + }, + { + "epoch": 1.683787953158965, + "grad_norm": 3.763542890548706, + "learning_rate": 2.4374423768650417e-05, + "loss": 1.6199, + "step": 48025 + }, + { + "epoch": 1.684664469532291, + "grad_norm": 3.847662925720215, + "learning_rate": 2.4358191899648095e-05, + "loss": 1.6571, + "step": 48050 + }, + { + "epoch": 1.6855409859056167, + "grad_norm": 3.779877185821533, + "learning_rate": 2.434196003064577e-05, + "loss": 1.6582, + "step": 48075 + }, + { + "epoch": 1.6864175022789425, + "grad_norm": 6.7976837158203125, + "learning_rate": 2.4325728161643448e-05, + "loss": 1.5958, + "step": 48100 + }, + { + "epoch": 1.6872940186522685, + "grad_norm": 
6.776675224304199, + "learning_rate": 2.4309496292641122e-05, + "loss": 1.7309, + "step": 48125 + }, + { + "epoch": 1.6881705350255944, + "grad_norm": 6.207943916320801, + "learning_rate": 2.4293264423638797e-05, + "loss": 1.6104, + "step": 48150 + }, + { + "epoch": 1.6890470513989202, + "grad_norm": 8.19918441772461, + "learning_rate": 2.427703255463647e-05, + "loss": 1.6205, + "step": 48175 + }, + { + "epoch": 1.689923567772246, + "grad_norm": 5.207898139953613, + "learning_rate": 2.4260800685634146e-05, + "loss": 1.6118, + "step": 48200 + }, + { + "epoch": 1.6908000841455717, + "grad_norm": 3.7016918659210205, + "learning_rate": 2.4244568816631824e-05, + "loss": 1.5646, + "step": 48225 + }, + { + "epoch": 1.6916766005188977, + "grad_norm": 3.850053548812866, + "learning_rate": 2.42283369476295e-05, + "loss": 1.8766, + "step": 48250 + }, + { + "epoch": 1.6925531168922237, + "grad_norm": 10.537520408630371, + "learning_rate": 2.4212105078627177e-05, + "loss": 1.7776, + "step": 48275 + }, + { + "epoch": 1.6934296332655494, + "grad_norm": 11.131444931030273, + "learning_rate": 2.419587320962485e-05, + "loss": 1.7504, + "step": 48300 + }, + { + "epoch": 1.6943061496388752, + "grad_norm": 7.897207736968994, + "learning_rate": 2.4179641340622526e-05, + "loss": 1.7498, + "step": 48325 + }, + { + "epoch": 1.6951826660122011, + "grad_norm": 4.713499069213867, + "learning_rate": 2.41634094716202e-05, + "loss": 1.514, + "step": 48350 + }, + { + "epoch": 1.696059182385527, + "grad_norm": 5.719786643981934, + "learning_rate": 2.4147177602617875e-05, + "loss": 1.7797, + "step": 48375 + }, + { + "epoch": 1.6969356987588529, + "grad_norm": 4.762840747833252, + "learning_rate": 2.4130945733615554e-05, + "loss": 1.7304, + "step": 48400 + }, + { + "epoch": 1.6978122151321786, + "grad_norm": 3.146397829055786, + "learning_rate": 2.4114713864613228e-05, + "loss": 1.6587, + "step": 48425 + }, + { + "epoch": 1.6986887315055044, + "grad_norm": 9.825989723205566, + "learning_rate": 2.4098481995610903e-05, + "loss": 1.7412, + "step": 48450 + }, + { + "epoch": 1.6995652478788303, + "grad_norm": 5.019702434539795, + "learning_rate": 2.408225012660858e-05, + "loss": 1.7625, + "step": 48475 + }, + { + "epoch": 1.7004417642521563, + "grad_norm": 3.841109037399292, + "learning_rate": 2.4066018257606255e-05, + "loss": 1.6396, + "step": 48500 + }, + { + "epoch": 1.701318280625482, + "grad_norm": 5.201150417327881, + "learning_rate": 2.404978638860393e-05, + "loss": 1.7112, + "step": 48525 + }, + { + "epoch": 1.7021947969988078, + "grad_norm": 7.076929092407227, + "learning_rate": 2.4033554519601605e-05, + "loss": 1.7612, + "step": 48550 + }, + { + "epoch": 1.7030713133721338, + "grad_norm": 7.732307434082031, + "learning_rate": 2.401732265059928e-05, + "loss": 1.6503, + "step": 48575 + }, + { + "epoch": 1.7039478297454598, + "grad_norm": 3.840726852416992, + "learning_rate": 2.4001090781596957e-05, + "loss": 1.6712, + "step": 48600 + }, + { + "epoch": 1.7048243461187855, + "grad_norm": 4.419421672821045, + "learning_rate": 2.3984858912594632e-05, + "loss": 1.5402, + "step": 48625 + }, + { + "epoch": 1.7057008624921113, + "grad_norm": 4.809535980224609, + "learning_rate": 2.396862704359231e-05, + "loss": 1.7631, + "step": 48650 + }, + { + "epoch": 1.7065773788654373, + "grad_norm": 4.897665977478027, + "learning_rate": 2.3952395174589985e-05, + "loss": 1.8718, + "step": 48675 + }, + { + "epoch": 1.707453895238763, + "grad_norm": 5.015279769897461, + "learning_rate": 2.393616330558766e-05, + "loss": 1.5017, + "step": 48700 
+ }, + { + "epoch": 1.708330411612089, + "grad_norm": 7.304866790771484, + "learning_rate": 2.3919931436585334e-05, + "loss": 1.7232, + "step": 48725 + }, + { + "epoch": 1.7092069279854147, + "grad_norm": 3.1692922115325928, + "learning_rate": 2.390369956758301e-05, + "loss": 1.6871, + "step": 48750 + }, + { + "epoch": 1.7100834443587405, + "grad_norm": 5.372016906738281, + "learning_rate": 2.3887467698580687e-05, + "loss": 1.957, + "step": 48775 + }, + { + "epoch": 1.7109599607320665, + "grad_norm": 7.6023478507995605, + "learning_rate": 2.387123582957836e-05, + "loss": 1.5149, + "step": 48800 + }, + { + "epoch": 1.7118364771053924, + "grad_norm": 4.750809192657471, + "learning_rate": 2.385500396057604e-05, + "loss": 1.8245, + "step": 48825 + }, + { + "epoch": 1.7127129934787182, + "grad_norm": 3.924492597579956, + "learning_rate": 2.3838772091573714e-05, + "loss": 1.7728, + "step": 48850 + }, + { + "epoch": 1.713589509852044, + "grad_norm": 5.274599075317383, + "learning_rate": 2.382254022257139e-05, + "loss": 1.744, + "step": 48875 + }, + { + "epoch": 1.71446602622537, + "grad_norm": 5.490636825561523, + "learning_rate": 2.3806308353569067e-05, + "loss": 1.7635, + "step": 48900 + }, + { + "epoch": 1.715342542598696, + "grad_norm": 3.1233789920806885, + "learning_rate": 2.3790076484566738e-05, + "loss": 1.8605, + "step": 48925 + }, + { + "epoch": 1.7162190589720216, + "grad_norm": 4.250492095947266, + "learning_rate": 2.3773844615564416e-05, + "loss": 1.8286, + "step": 48950 + }, + { + "epoch": 1.7170955753453474, + "grad_norm": 4.722350120544434, + "learning_rate": 2.375761274656209e-05, + "loss": 1.6775, + "step": 48975 + }, + { + "epoch": 1.7179720917186732, + "grad_norm": 8.671077728271484, + "learning_rate": 2.3741380877559765e-05, + "loss": 1.7486, + "step": 49000 + }, + { + "epoch": 1.7188486080919991, + "grad_norm": 4.707937240600586, + "learning_rate": 2.3725149008557443e-05, + "loss": 1.7109, + "step": 49025 + }, + { + "epoch": 1.719725124465325, + "grad_norm": 7.386013984680176, + "learning_rate": 2.3708917139555118e-05, + "loss": 1.7366, + "step": 49050 + }, + { + "epoch": 1.7206016408386509, + "grad_norm": 10.429779052734375, + "learning_rate": 2.3692685270552796e-05, + "loss": 1.7925, + "step": 49075 + }, + { + "epoch": 1.7214781572119766, + "grad_norm": 4.86276912689209, + "learning_rate": 2.367645340155047e-05, + "loss": 1.6981, + "step": 49100 + }, + { + "epoch": 1.7223546735853026, + "grad_norm": 5.931055068969727, + "learning_rate": 2.3660221532548142e-05, + "loss": 1.6892, + "step": 49125 + }, + { + "epoch": 1.7232311899586286, + "grad_norm": 3.3761956691741943, + "learning_rate": 2.364398966354582e-05, + "loss": 1.7285, + "step": 49150 + }, + { + "epoch": 1.7241077063319543, + "grad_norm": 7.616459369659424, + "learning_rate": 2.3627757794543495e-05, + "loss": 1.694, + "step": 49175 + }, + { + "epoch": 1.72498422270528, + "grad_norm": 4.2220611572265625, + "learning_rate": 2.3611525925541173e-05, + "loss": 1.6808, + "step": 49200 + }, + { + "epoch": 1.725860739078606, + "grad_norm": 5.743863582611084, + "learning_rate": 2.3595294056538847e-05, + "loss": 1.6281, + "step": 49225 + }, + { + "epoch": 1.7267372554519318, + "grad_norm": 6.657064437866211, + "learning_rate": 2.3579062187536525e-05, + "loss": 1.5649, + "step": 49250 + }, + { + "epoch": 1.7276137718252578, + "grad_norm": 4.410376071929932, + "learning_rate": 2.35628303185342e-05, + "loss": 1.6828, + "step": 49275 + }, + { + "epoch": 1.7284902881985835, + "grad_norm": 3.3516650199890137, + "learning_rate": 
2.3546598449531875e-05, + "loss": 1.687, + "step": 49300 + }, + { + "epoch": 1.7293668045719093, + "grad_norm": 4.493885517120361, + "learning_rate": 2.353036658052955e-05, + "loss": 1.635, + "step": 49325 + }, + { + "epoch": 1.7302433209452353, + "grad_norm": 5.071359157562256, + "learning_rate": 2.3514134711527224e-05, + "loss": 1.8039, + "step": 49350 + }, + { + "epoch": 1.7311198373185612, + "grad_norm": 6.660120010375977, + "learning_rate": 2.3497902842524902e-05, + "loss": 1.8021, + "step": 49375 + }, + { + "epoch": 1.731996353691887, + "grad_norm": 8.243348121643066, + "learning_rate": 2.3481670973522577e-05, + "loss": 1.7613, + "step": 49400 + }, + { + "epoch": 1.7328728700652127, + "grad_norm": 3.6419100761413574, + "learning_rate": 2.346543910452025e-05, + "loss": 1.6739, + "step": 49425 + }, + { + "epoch": 1.7337493864385387, + "grad_norm": 4.629736423492432, + "learning_rate": 2.344920723551793e-05, + "loss": 1.5754, + "step": 49450 + }, + { + "epoch": 1.7346259028118647, + "grad_norm": 5.168485641479492, + "learning_rate": 2.3432975366515604e-05, + "loss": 1.7315, + "step": 49475 + }, + { + "epoch": 1.7355024191851904, + "grad_norm": 4.340418815612793, + "learning_rate": 2.341674349751328e-05, + "loss": 1.6725, + "step": 49500 + }, + { + "epoch": 1.7363789355585162, + "grad_norm": 8.31737232208252, + "learning_rate": 2.3400511628510953e-05, + "loss": 1.8076, + "step": 49525 + }, + { + "epoch": 1.737255451931842, + "grad_norm": 3.59226393699646, + "learning_rate": 2.3384279759508628e-05, + "loss": 1.6133, + "step": 49550 + }, + { + "epoch": 1.738131968305168, + "grad_norm": 5.214158535003662, + "learning_rate": 2.3368047890506306e-05, + "loss": 1.6172, + "step": 49575 + }, + { + "epoch": 1.739008484678494, + "grad_norm": 4.873337268829346, + "learning_rate": 2.335181602150398e-05, + "loss": 1.5751, + "step": 49600 + }, + { + "epoch": 1.7398850010518196, + "grad_norm": 5.293067455291748, + "learning_rate": 2.333558415250166e-05, + "loss": 1.5468, + "step": 49625 + }, + { + "epoch": 1.7407615174251454, + "grad_norm": 4.936447620391846, + "learning_rate": 2.3319352283499333e-05, + "loss": 1.603, + "step": 49650 + }, + { + "epoch": 1.7416380337984714, + "grad_norm": 4.487536907196045, + "learning_rate": 2.3303120414497008e-05, + "loss": 1.8241, + "step": 49675 + }, + { + "epoch": 1.7425145501717973, + "grad_norm": 5.666827201843262, + "learning_rate": 2.3286888545494683e-05, + "loss": 1.5918, + "step": 49700 + }, + { + "epoch": 1.743391066545123, + "grad_norm": 7.205603122711182, + "learning_rate": 2.3270656676492357e-05, + "loss": 1.6665, + "step": 49725 + }, + { + "epoch": 1.7442675829184489, + "grad_norm": 3.8806238174438477, + "learning_rate": 2.3254424807490035e-05, + "loss": 1.6298, + "step": 49750 + }, + { + "epoch": 1.7451440992917746, + "grad_norm": 8.41119384765625, + "learning_rate": 2.323819293848771e-05, + "loss": 1.8721, + "step": 49775 + }, + { + "epoch": 1.7460206156651006, + "grad_norm": 5.00311803817749, + "learning_rate": 2.3221961069485388e-05, + "loss": 1.7476, + "step": 49800 + }, + { + "epoch": 1.7468971320384266, + "grad_norm": 7.38602876663208, + "learning_rate": 2.3205729200483063e-05, + "loss": 1.8296, + "step": 49825 + }, + { + "epoch": 1.7477736484117523, + "grad_norm": 5.122575283050537, + "learning_rate": 2.3189497331480737e-05, + "loss": 1.7337, + "step": 49850 + }, + { + "epoch": 1.748650164785078, + "grad_norm": 4.407209873199463, + "learning_rate": 2.3173265462478412e-05, + "loss": 1.6448, + "step": 49875 + }, + { + "epoch": 1.749526681158404, + 
"grad_norm": 3.4296798706054688, + "learning_rate": 2.3157033593476086e-05, + "loss": 1.7684, + "step": 49900 + }, + { + "epoch": 1.75040319753173, + "grad_norm": 3.3186120986938477, + "learning_rate": 2.3140801724473764e-05, + "loss": 1.6198, + "step": 49925 + }, + { + "epoch": 1.7512797139050558, + "grad_norm": 4.217392921447754, + "learning_rate": 2.312456985547144e-05, + "loss": 1.9095, + "step": 49950 + }, + { + "epoch": 1.7521562302783815, + "grad_norm": 3.5715413093566895, + "learning_rate": 2.3108337986469114e-05, + "loss": 1.7799, + "step": 49975 + }, + { + "epoch": 1.7530327466517075, + "grad_norm": 5.871794700622559, + "learning_rate": 2.3092106117466792e-05, + "loss": 1.7486, + "step": 50000 + }, + { + "epoch": 1.7539092630250335, + "grad_norm": 3.719320774078369, + "learning_rate": 2.3075874248464466e-05, + "loss": 1.5752, + "step": 50025 + }, + { + "epoch": 1.7547857793983592, + "grad_norm": 10.796296119689941, + "learning_rate": 2.3059642379462145e-05, + "loss": 1.6607, + "step": 50050 + }, + { + "epoch": 1.755662295771685, + "grad_norm": 5.059436798095703, + "learning_rate": 2.304341051045982e-05, + "loss": 1.7726, + "step": 50075 + }, + { + "epoch": 1.7565388121450107, + "grad_norm": 3.2864840030670166, + "learning_rate": 2.302717864145749e-05, + "loss": 1.7749, + "step": 50100 + }, + { + "epoch": 1.7574153285183367, + "grad_norm": 6.040686130523682, + "learning_rate": 2.301094677245517e-05, + "loss": 1.6933, + "step": 50125 + }, + { + "epoch": 1.7582918448916627, + "grad_norm": 4.589251518249512, + "learning_rate": 2.2994714903452843e-05, + "loss": 1.8707, + "step": 50150 + }, + { + "epoch": 1.7591683612649884, + "grad_norm": 3.6298768520355225, + "learning_rate": 2.297848303445052e-05, + "loss": 1.6691, + "step": 50175 + }, + { + "epoch": 1.7600448776383142, + "grad_norm": 3.69006085395813, + "learning_rate": 2.2962251165448196e-05, + "loss": 1.7499, + "step": 50200 + }, + { + "epoch": 1.7609213940116402, + "grad_norm": 3.8339924812316895, + "learning_rate": 2.294601929644587e-05, + "loss": 1.7132, + "step": 50225 + }, + { + "epoch": 1.7617979103849661, + "grad_norm": 4.647484302520752, + "learning_rate": 2.292978742744355e-05, + "loss": 1.6144, + "step": 50250 + }, + { + "epoch": 1.762674426758292, + "grad_norm": 3.7352330684661865, + "learning_rate": 2.2913555558441223e-05, + "loss": 1.6688, + "step": 50275 + }, + { + "epoch": 1.7635509431316176, + "grad_norm": 7.0547661781311035, + "learning_rate": 2.2897323689438898e-05, + "loss": 1.676, + "step": 50300 + }, + { + "epoch": 1.7644274595049434, + "grad_norm": 3.678844928741455, + "learning_rate": 2.2881091820436572e-05, + "loss": 1.6953, + "step": 50325 + }, + { + "epoch": 1.7653039758782694, + "grad_norm": 3.66015625, + "learning_rate": 2.286485995143425e-05, + "loss": 1.6347, + "step": 50350 + }, + { + "epoch": 1.7661804922515953, + "grad_norm": 5.1582489013671875, + "learning_rate": 2.2848628082431925e-05, + "loss": 1.7666, + "step": 50375 + }, + { + "epoch": 1.767057008624921, + "grad_norm": 9.089418411254883, + "learning_rate": 2.28323962134296e-05, + "loss": 1.6869, + "step": 50400 + }, + { + "epoch": 1.7679335249982469, + "grad_norm": 4.746682643890381, + "learning_rate": 2.2816164344427278e-05, + "loss": 1.4943, + "step": 50425 + }, + { + "epoch": 1.7688100413715728, + "grad_norm": 4.758810520172119, + "learning_rate": 2.2799932475424952e-05, + "loss": 1.8392, + "step": 50450 + }, + { + "epoch": 1.7696865577448988, + "grad_norm": 6.446689605712891, + "learning_rate": 2.2783700606422627e-05, + "loss": 1.7167, + 
"step": 50475 + }, + { + "epoch": 1.7705630741182246, + "grad_norm": 8.65810775756836, + "learning_rate": 2.27674687374203e-05, + "loss": 1.7014, + "step": 50500 + }, + { + "epoch": 1.7714395904915503, + "grad_norm": 3.867642641067505, + "learning_rate": 2.2751236868417976e-05, + "loss": 1.8218, + "step": 50525 + }, + { + "epoch": 1.7723161068648763, + "grad_norm": 3.6456284523010254, + "learning_rate": 2.2735004999415654e-05, + "loss": 1.6596, + "step": 50550 + }, + { + "epoch": 1.773192623238202, + "grad_norm": 2.875333786010742, + "learning_rate": 2.271877313041333e-05, + "loss": 1.7138, + "step": 50575 + }, + { + "epoch": 1.774069139611528, + "grad_norm": 3.9218833446502686, + "learning_rate": 2.2702541261411007e-05, + "loss": 1.6585, + "step": 50600 + }, + { + "epoch": 1.7749456559848538, + "grad_norm": 7.197262763977051, + "learning_rate": 2.268630939240868e-05, + "loss": 1.6742, + "step": 50625 + }, + { + "epoch": 1.7758221723581795, + "grad_norm": 4.481894016265869, + "learning_rate": 2.2670077523406356e-05, + "loss": 1.7379, + "step": 50650 + }, + { + "epoch": 1.7766986887315055, + "grad_norm": 3.9013352394104004, + "learning_rate": 2.265384565440403e-05, + "loss": 1.7787, + "step": 50675 + }, + { + "epoch": 1.7775752051048315, + "grad_norm": 5.1078643798828125, + "learning_rate": 2.2637613785401706e-05, + "loss": 1.6218, + "step": 50700 + }, + { + "epoch": 1.7784517214781572, + "grad_norm": 5.096223831176758, + "learning_rate": 2.2621381916399384e-05, + "loss": 1.6614, + "step": 50725 + }, + { + "epoch": 1.779328237851483, + "grad_norm": 4.944703578948975, + "learning_rate": 2.2605150047397058e-05, + "loss": 1.7414, + "step": 50750 + }, + { + "epoch": 1.780204754224809, + "grad_norm": 4.354489326477051, + "learning_rate": 2.2588918178394733e-05, + "loss": 1.9612, + "step": 50775 + }, + { + "epoch": 1.781081270598135, + "grad_norm": 8.340835571289062, + "learning_rate": 2.257268630939241e-05, + "loss": 1.6186, + "step": 50800 + }, + { + "epoch": 1.7819577869714607, + "grad_norm": 4.762833595275879, + "learning_rate": 2.2556454440390086e-05, + "loss": 1.6097, + "step": 50825 + }, + { + "epoch": 1.7828343033447864, + "grad_norm": 6.9798078536987305, + "learning_rate": 2.254022257138776e-05, + "loss": 1.6312, + "step": 50850 + }, + { + "epoch": 1.7837108197181122, + "grad_norm": 3.2704761028289795, + "learning_rate": 2.2523990702385435e-05, + "loss": 1.6607, + "step": 50875 + }, + { + "epoch": 1.7845873360914382, + "grad_norm": 3.8599393367767334, + "learning_rate": 2.2507758833383113e-05, + "loss": 1.7408, + "step": 50900 + }, + { + "epoch": 1.7854638524647641, + "grad_norm": 3.987889289855957, + "learning_rate": 2.2491526964380788e-05, + "loss": 1.7903, + "step": 50925 + }, + { + "epoch": 1.7863403688380899, + "grad_norm": 3.668196201324463, + "learning_rate": 2.2475295095378462e-05, + "loss": 1.703, + "step": 50950 + }, + { + "epoch": 1.7872168852114156, + "grad_norm": 3.9072439670562744, + "learning_rate": 2.245906322637614e-05, + "loss": 1.7031, + "step": 50975 + }, + { + "epoch": 1.7880934015847416, + "grad_norm": 3.216949701309204, + "learning_rate": 2.2442831357373815e-05, + "loss": 1.6297, + "step": 51000 + }, + { + "epoch": 1.7889699179580676, + "grad_norm": 7.813713550567627, + "learning_rate": 2.2426599488371493e-05, + "loss": 1.8584, + "step": 51025 + }, + { + "epoch": 1.7898464343313933, + "grad_norm": 4.424006462097168, + "learning_rate": 2.2410367619369164e-05, + "loss": 1.781, + "step": 51050 + }, + { + "epoch": 1.790722950704719, + "grad_norm": 3.644646406173706, + 
"learning_rate": 2.239413575036684e-05, + "loss": 1.7361, + "step": 51075 + }, + { + "epoch": 1.791599467078045, + "grad_norm": 3.3452320098876953, + "learning_rate": 2.2377903881364517e-05, + "loss": 1.6238, + "step": 51100 + }, + { + "epoch": 1.7924759834513708, + "grad_norm": 4.19920539855957, + "learning_rate": 2.236167201236219e-05, + "loss": 1.5234, + "step": 51125 + }, + { + "epoch": 1.7933524998246968, + "grad_norm": 6.98471736907959, + "learning_rate": 2.234544014335987e-05, + "loss": 1.7191, + "step": 51150 + }, + { + "epoch": 1.7942290161980226, + "grad_norm": 5.152838706970215, + "learning_rate": 2.2329208274357544e-05, + "loss": 1.7035, + "step": 51175 + }, + { + "epoch": 1.7951055325713483, + "grad_norm": 3.241415023803711, + "learning_rate": 2.231297640535522e-05, + "loss": 1.665, + "step": 51200 + }, + { + "epoch": 1.7959820489446743, + "grad_norm": 4.455251216888428, + "learning_rate": 2.2296744536352897e-05, + "loss": 1.7246, + "step": 51225 + }, + { + "epoch": 1.7968585653180003, + "grad_norm": 5.201000213623047, + "learning_rate": 2.228051266735057e-05, + "loss": 1.7181, + "step": 51250 + }, + { + "epoch": 1.797735081691326, + "grad_norm": 4.755773544311523, + "learning_rate": 2.2264280798348246e-05, + "loss": 1.6905, + "step": 51275 + }, + { + "epoch": 1.7986115980646518, + "grad_norm": 4.806262969970703, + "learning_rate": 2.224804892934592e-05, + "loss": 1.7362, + "step": 51300 + }, + { + "epoch": 1.7994881144379777, + "grad_norm": 5.39224100112915, + "learning_rate": 2.2231817060343595e-05, + "loss": 1.701, + "step": 51325 + }, + { + "epoch": 1.8003646308113037, + "grad_norm": 3.8960375785827637, + "learning_rate": 2.2215585191341274e-05, + "loss": 1.643, + "step": 51350 + }, + { + "epoch": 1.8012411471846295, + "grad_norm": 4.27885627746582, + "learning_rate": 2.2199353322338948e-05, + "loss": 1.6222, + "step": 51375 + }, + { + "epoch": 1.8021176635579552, + "grad_norm": 5.233681678771973, + "learning_rate": 2.2183121453336626e-05, + "loss": 1.4347, + "step": 51400 + }, + { + "epoch": 1.802994179931281, + "grad_norm": 9.178216934204102, + "learning_rate": 2.21668895843343e-05, + "loss": 1.8822, + "step": 51425 + }, + { + "epoch": 1.803870696304607, + "grad_norm": 5.004912853240967, + "learning_rate": 2.2150657715331975e-05, + "loss": 1.5583, + "step": 51450 + }, + { + "epoch": 1.804747212677933, + "grad_norm": 3.121065139770508, + "learning_rate": 2.213442584632965e-05, + "loss": 1.528, + "step": 51475 + }, + { + "epoch": 1.8056237290512587, + "grad_norm": 4.439921855926514, + "learning_rate": 2.2118193977327325e-05, + "loss": 1.5966, + "step": 51500 + }, + { + "epoch": 1.8065002454245844, + "grad_norm": 6.093084812164307, + "learning_rate": 2.2101962108325003e-05, + "loss": 1.803, + "step": 51525 + }, + { + "epoch": 1.8073767617979104, + "grad_norm": 8.051422119140625, + "learning_rate": 2.2085730239322677e-05, + "loss": 1.9081, + "step": 51550 + }, + { + "epoch": 1.8082532781712364, + "grad_norm": 4.8623366355896, + "learning_rate": 2.2069498370320355e-05, + "loss": 1.8681, + "step": 51575 + }, + { + "epoch": 1.8091297945445621, + "grad_norm": 4.5736002922058105, + "learning_rate": 2.205326650131803e-05, + "loss": 1.6519, + "step": 51600 + }, + { + "epoch": 1.8100063109178879, + "grad_norm": 4.227759838104248, + "learning_rate": 2.2037034632315705e-05, + "loss": 1.6575, + "step": 51625 + }, + { + "epoch": 1.8108828272912136, + "grad_norm": 4.523649215698242, + "learning_rate": 2.202080276331338e-05, + "loss": 1.8191, + "step": 51650 + }, + { + "epoch": 
1.8117593436645396, + "grad_norm": 3.700531244277954, + "learning_rate": 2.2004570894311054e-05, + "loss": 1.7416, + "step": 51675 + }, + { + "epoch": 1.8126358600378656, + "grad_norm": 7.723850727081299, + "learning_rate": 2.1988339025308732e-05, + "loss": 1.6793, + "step": 51700 + }, + { + "epoch": 1.8135123764111913, + "grad_norm": 6.578819751739502, + "learning_rate": 2.1972107156306407e-05, + "loss": 1.7076, + "step": 51725 + }, + { + "epoch": 1.814388892784517, + "grad_norm": 4.681560039520264, + "learning_rate": 2.195587528730408e-05, + "loss": 1.7416, + "step": 51750 + }, + { + "epoch": 1.815265409157843, + "grad_norm": 4.767965793609619, + "learning_rate": 2.193964341830176e-05, + "loss": 1.7706, + "step": 51775 + }, + { + "epoch": 1.816141925531169, + "grad_norm": 6.624705791473389, + "learning_rate": 2.1923411549299434e-05, + "loss": 1.675, + "step": 51800 + }, + { + "epoch": 1.8170184419044948, + "grad_norm": 5.511445045471191, + "learning_rate": 2.190717968029711e-05, + "loss": 1.7311, + "step": 51825 + }, + { + "epoch": 1.8178949582778205, + "grad_norm": 4.00278377532959, + "learning_rate": 2.1890947811294783e-05, + "loss": 1.5915, + "step": 51850 + }, + { + "epoch": 1.8187714746511465, + "grad_norm": 2.9405245780944824, + "learning_rate": 2.1874715942292458e-05, + "loss": 1.7462, + "step": 51875 + }, + { + "epoch": 1.8196479910244725, + "grad_norm": 5.30010986328125, + "learning_rate": 2.1858484073290136e-05, + "loss": 1.8259, + "step": 51900 + }, + { + "epoch": 1.8205245073977983, + "grad_norm": 3.948014736175537, + "learning_rate": 2.184225220428781e-05, + "loss": 1.6615, + "step": 51925 + }, + { + "epoch": 1.821401023771124, + "grad_norm": 4.708994388580322, + "learning_rate": 2.182602033528549e-05, + "loss": 1.6723, + "step": 51950 + }, + { + "epoch": 1.8222775401444498, + "grad_norm": 4.881672382354736, + "learning_rate": 2.1809788466283163e-05, + "loss": 1.8014, + "step": 51975 + }, + { + "epoch": 1.8231540565177757, + "grad_norm": 4.543352127075195, + "learning_rate": 2.1793556597280838e-05, + "loss": 1.5789, + "step": 52000 + }, + { + "epoch": 1.8240305728911017, + "grad_norm": 5.042555809020996, + "learning_rate": 2.1777324728278513e-05, + "loss": 1.7539, + "step": 52025 + }, + { + "epoch": 1.8249070892644275, + "grad_norm": 4.807198524475098, + "learning_rate": 2.1761092859276187e-05, + "loss": 1.6909, + "step": 52050 + }, + { + "epoch": 1.8257836056377532, + "grad_norm": 4.517866134643555, + "learning_rate": 2.1744860990273865e-05, + "loss": 1.952, + "step": 52075 + }, + { + "epoch": 1.8266601220110792, + "grad_norm": 4.282480716705322, + "learning_rate": 2.172862912127154e-05, + "loss": 1.6019, + "step": 52100 + }, + { + "epoch": 1.8275366383844052, + "grad_norm": 8.610906600952148, + "learning_rate": 2.1712397252269218e-05, + "loss": 1.8305, + "step": 52125 + }, + { + "epoch": 1.828413154757731, + "grad_norm": 3.913853406906128, + "learning_rate": 2.1696165383266893e-05, + "loss": 1.99, + "step": 52150 + }, + { + "epoch": 1.8292896711310567, + "grad_norm": 2.88033390045166, + "learning_rate": 2.1679933514264567e-05, + "loss": 1.7729, + "step": 52175 + }, + { + "epoch": 1.8301661875043824, + "grad_norm": 3.3930115699768066, + "learning_rate": 2.1663701645262245e-05, + "loss": 1.6265, + "step": 52200 + }, + { + "epoch": 1.8310427038777084, + "grad_norm": 4.675701141357422, + "learning_rate": 2.1647469776259917e-05, + "loss": 1.5425, + "step": 52225 + }, + { + "epoch": 1.8319192202510344, + "grad_norm": 6.2647294998168945, + "learning_rate": 2.1631237907257595e-05, 
+ "loss": 1.6818, + "step": 52250 + }, + { + "epoch": 1.8327957366243601, + "grad_norm": 8.024128913879395, + "learning_rate": 2.161500603825527e-05, + "loss": 1.6886, + "step": 52275 + }, + { + "epoch": 1.8336722529976859, + "grad_norm": 3.645638942718506, + "learning_rate": 2.1598774169252944e-05, + "loss": 1.605, + "step": 52300 + }, + { + "epoch": 1.8345487693710119, + "grad_norm": 4.4904561042785645, + "learning_rate": 2.1582542300250622e-05, + "loss": 1.644, + "step": 52325 + }, + { + "epoch": 1.8354252857443378, + "grad_norm": 2.8983476161956787, + "learning_rate": 2.1566310431248297e-05, + "loss": 1.878, + "step": 52350 + }, + { + "epoch": 1.8363018021176636, + "grad_norm": 5.780378818511963, + "learning_rate": 2.1550078562245975e-05, + "loss": 1.7263, + "step": 52375 + }, + { + "epoch": 1.8371783184909893, + "grad_norm": 3.5068273544311523, + "learning_rate": 2.153384669324365e-05, + "loss": 1.7497, + "step": 52400 + }, + { + "epoch": 1.8380548348643153, + "grad_norm": 5.828103542327881, + "learning_rate": 2.151761482424132e-05, + "loss": 1.7983, + "step": 52425 + }, + { + "epoch": 1.838931351237641, + "grad_norm": 2.925856113433838, + "learning_rate": 2.1501382955239e-05, + "loss": 1.7618, + "step": 52450 + }, + { + "epoch": 1.839807867610967, + "grad_norm": 2.861786127090454, + "learning_rate": 2.1485151086236673e-05, + "loss": 1.6016, + "step": 52475 + }, + { + "epoch": 1.8406843839842928, + "grad_norm": 4.46367073059082, + "learning_rate": 2.146891921723435e-05, + "loss": 1.7496, + "step": 52500 + }, + { + "epoch": 1.8415609003576185, + "grad_norm": 3.3921329975128174, + "learning_rate": 2.1452687348232026e-05, + "loss": 1.749, + "step": 52525 + }, + { + "epoch": 1.8424374167309445, + "grad_norm": 3.634174346923828, + "learning_rate": 2.14364554792297e-05, + "loss": 1.6757, + "step": 52550 + }, + { + "epoch": 1.8433139331042705, + "grad_norm": 4.625886917114258, + "learning_rate": 2.142022361022738e-05, + "loss": 1.5391, + "step": 52575 + }, + { + "epoch": 1.8441904494775962, + "grad_norm": 10.018671035766602, + "learning_rate": 2.1403991741225053e-05, + "loss": 1.4575, + "step": 52600 + }, + { + "epoch": 1.845066965850922, + "grad_norm": 3.6055595874786377, + "learning_rate": 2.1387759872222728e-05, + "loss": 1.8907, + "step": 52625 + }, + { + "epoch": 1.845943482224248, + "grad_norm": 4.001281261444092, + "learning_rate": 2.1371528003220403e-05, + "loss": 1.7531, + "step": 52650 + }, + { + "epoch": 1.846819998597574, + "grad_norm": 3.191168785095215, + "learning_rate": 2.135529613421808e-05, + "loss": 1.61, + "step": 52675 + }, + { + "epoch": 1.8476965149708997, + "grad_norm": 3.5133745670318604, + "learning_rate": 2.1339064265215755e-05, + "loss": 1.9661, + "step": 52700 + }, + { + "epoch": 1.8485730313442255, + "grad_norm": 4.573451995849609, + "learning_rate": 2.132283239621343e-05, + "loss": 1.5508, + "step": 52725 + }, + { + "epoch": 1.8494495477175512, + "grad_norm": 5.318416118621826, + "learning_rate": 2.1306600527211108e-05, + "loss": 1.9038, + "step": 52750 + }, + { + "epoch": 1.8503260640908772, + "grad_norm": 8.955162048339844, + "learning_rate": 2.1290368658208783e-05, + "loss": 1.7626, + "step": 52775 + }, + { + "epoch": 1.8512025804642032, + "grad_norm": 3.07539963722229, + "learning_rate": 2.1274136789206457e-05, + "loss": 1.859, + "step": 52800 + }, + { + "epoch": 1.852079096837529, + "grad_norm": 6.337424278259277, + "learning_rate": 2.1257904920204132e-05, + "loss": 1.8543, + "step": 52825 + }, + { + "epoch": 1.8529556132108547, + "grad_norm": 
4.328663349151611, + "learning_rate": 2.1241673051201806e-05, + "loss": 1.6373, + "step": 52850 + }, + { + "epoch": 1.8538321295841806, + "grad_norm": 4.544500827789307, + "learning_rate": 2.1225441182199485e-05, + "loss": 1.7755, + "step": 52875 + }, + { + "epoch": 1.8547086459575066, + "grad_norm": 6.548807621002197, + "learning_rate": 2.120920931319716e-05, + "loss": 1.7065, + "step": 52900 + }, + { + "epoch": 1.8555851623308324, + "grad_norm": 5.549559593200684, + "learning_rate": 2.1192977444194837e-05, + "loss": 1.7928, + "step": 52925 + }, + { + "epoch": 1.8564616787041581, + "grad_norm": 3.4190902709960938, + "learning_rate": 2.1176745575192512e-05, + "loss": 1.638, + "step": 52950 + }, + { + "epoch": 1.857338195077484, + "grad_norm": 4.00801420211792, + "learning_rate": 2.1160513706190186e-05, + "loss": 1.6958, + "step": 52975 + }, + { + "epoch": 1.8582147114508099, + "grad_norm": 5.596281051635742, + "learning_rate": 2.114428183718786e-05, + "loss": 1.6379, + "step": 53000 + }, + { + "epoch": 1.8590912278241358, + "grad_norm": 8.1680269241333, + "learning_rate": 2.1128049968185536e-05, + "loss": 1.6709, + "step": 53025 + }, + { + "epoch": 1.8599677441974616, + "grad_norm": 3.8498432636260986, + "learning_rate": 2.1111818099183214e-05, + "loss": 1.6094, + "step": 53050 + }, + { + "epoch": 1.8608442605707873, + "grad_norm": 5.173568248748779, + "learning_rate": 2.109558623018089e-05, + "loss": 1.6956, + "step": 53075 + }, + { + "epoch": 1.8617207769441133, + "grad_norm": 4.46190881729126, + "learning_rate": 2.1079354361178563e-05, + "loss": 1.8917, + "step": 53100 + }, + { + "epoch": 1.8625972933174393, + "grad_norm": 5.988603115081787, + "learning_rate": 2.106312249217624e-05, + "loss": 1.6132, + "step": 53125 + }, + { + "epoch": 1.863473809690765, + "grad_norm": 3.0163025856018066, + "learning_rate": 2.1046890623173916e-05, + "loss": 1.7164, + "step": 53150 + }, + { + "epoch": 1.8643503260640908, + "grad_norm": 3.8393704891204834, + "learning_rate": 2.1030658754171594e-05, + "loss": 1.8942, + "step": 53175 + }, + { + "epoch": 1.8652268424374168, + "grad_norm": 5.056851863861084, + "learning_rate": 2.1014426885169265e-05, + "loss": 1.6591, + "step": 53200 + }, + { + "epoch": 1.8661033588107427, + "grad_norm": 8.386987686157227, + "learning_rate": 2.0998195016166943e-05, + "loss": 1.6855, + "step": 53225 + }, + { + "epoch": 1.8669798751840685, + "grad_norm": 11.277887344360352, + "learning_rate": 2.0981963147164618e-05, + "loss": 1.7064, + "step": 53250 + }, + { + "epoch": 1.8678563915573942, + "grad_norm": 4.33960485458374, + "learning_rate": 2.0965731278162292e-05, + "loss": 1.7019, + "step": 53275 + }, + { + "epoch": 1.86873290793072, + "grad_norm": 4.2368950843811035, + "learning_rate": 2.094949940915997e-05, + "loss": 1.6651, + "step": 53300 + }, + { + "epoch": 1.869609424304046, + "grad_norm": 4.668137550354004, + "learning_rate": 2.0933267540157645e-05, + "loss": 1.7426, + "step": 53325 + }, + { + "epoch": 1.870485940677372, + "grad_norm": 7.276766300201416, + "learning_rate": 2.0917035671155323e-05, + "loss": 1.6419, + "step": 53350 + }, + { + "epoch": 1.8713624570506977, + "grad_norm": 9.706258773803711, + "learning_rate": 2.0900803802152998e-05, + "loss": 1.8126, + "step": 53375 + }, + { + "epoch": 1.8722389734240235, + "grad_norm": 4.889819622039795, + "learning_rate": 2.088457193315067e-05, + "loss": 1.7477, + "step": 53400 + }, + { + "epoch": 1.8731154897973494, + "grad_norm": 4.86405086517334, + "learning_rate": 2.0868340064148347e-05, + "loss": 1.7556, + "step": 
53425 + }, + { + "epoch": 1.8739920061706754, + "grad_norm": 3.6495752334594727, + "learning_rate": 2.085210819514602e-05, + "loss": 1.7459, + "step": 53450 + }, + { + "epoch": 1.8748685225440012, + "grad_norm": 3.1089813709259033, + "learning_rate": 2.08358763261437e-05, + "loss": 1.766, + "step": 53475 + }, + { + "epoch": 1.875745038917327, + "grad_norm": 4.518210411071777, + "learning_rate": 2.0819644457141374e-05, + "loss": 1.7107, + "step": 53500 + }, + { + "epoch": 1.8766215552906527, + "grad_norm": 5.612878322601318, + "learning_rate": 2.080341258813905e-05, + "loss": 1.7427, + "step": 53525 + }, + { + "epoch": 1.8774980716639786, + "grad_norm": 4.465632915496826, + "learning_rate": 2.0787180719136727e-05, + "loss": 1.7441, + "step": 53550 + }, + { + "epoch": 1.8783745880373046, + "grad_norm": 5.902110576629639, + "learning_rate": 2.07709488501344e-05, + "loss": 1.6832, + "step": 53575 + }, + { + "epoch": 1.8792511044106304, + "grad_norm": 7.544435024261475, + "learning_rate": 2.0754716981132076e-05, + "loss": 1.7862, + "step": 53600 + }, + { + "epoch": 1.8801276207839561, + "grad_norm": 5.261139392852783, + "learning_rate": 2.073848511212975e-05, + "loss": 1.739, + "step": 53625 + }, + { + "epoch": 1.881004137157282, + "grad_norm": 7.61118221282959, + "learning_rate": 2.0722253243127426e-05, + "loss": 1.8247, + "step": 53650 + }, + { + "epoch": 1.881880653530608, + "grad_norm": 4.948622226715088, + "learning_rate": 2.0706021374125104e-05, + "loss": 1.6752, + "step": 53675 + }, + { + "epoch": 1.8827571699039338, + "grad_norm": 6.419146537780762, + "learning_rate": 2.068978950512278e-05, + "loss": 1.7929, + "step": 53700 + }, + { + "epoch": 1.8836336862772596, + "grad_norm": 5.996078968048096, + "learning_rate": 2.0673557636120456e-05, + "loss": 1.5974, + "step": 53725 + }, + { + "epoch": 1.8845102026505856, + "grad_norm": 3.221792697906494, + "learning_rate": 2.065732576711813e-05, + "loss": 1.584, + "step": 53750 + }, + { + "epoch": 1.8853867190239115, + "grad_norm": 4.358556270599365, + "learning_rate": 2.0641093898115806e-05, + "loss": 1.8487, + "step": 53775 + }, + { + "epoch": 1.8862632353972373, + "grad_norm": 4.725550651550293, + "learning_rate": 2.062486202911348e-05, + "loss": 1.6828, + "step": 53800 + }, + { + "epoch": 1.887139751770563, + "grad_norm": 6.89113712310791, + "learning_rate": 2.0608630160111155e-05, + "loss": 1.7135, + "step": 53825 + }, + { + "epoch": 1.8880162681438888, + "grad_norm": 4.7607927322387695, + "learning_rate": 2.0592398291108833e-05, + "loss": 1.6382, + "step": 53850 + }, + { + "epoch": 1.8888927845172148, + "grad_norm": 4.279661655426025, + "learning_rate": 2.0576166422106508e-05, + "loss": 1.7417, + "step": 53875 + }, + { + "epoch": 1.8897693008905407, + "grad_norm": 5.419626235961914, + "learning_rate": 2.0559934553104186e-05, + "loss": 1.7162, + "step": 53900 + }, + { + "epoch": 1.8906458172638665, + "grad_norm": 3.89320969581604, + "learning_rate": 2.054370268410186e-05, + "loss": 1.7178, + "step": 53925 + }, + { + "epoch": 1.8915223336371922, + "grad_norm": 3.099106788635254, + "learning_rate": 2.0527470815099535e-05, + "loss": 1.7912, + "step": 53950 + }, + { + "epoch": 1.8923988500105182, + "grad_norm": 7.661317348480225, + "learning_rate": 2.051123894609721e-05, + "loss": 1.716, + "step": 53975 + }, + { + "epoch": 1.8932753663838442, + "grad_norm": 4.48473596572876, + "learning_rate": 2.0495007077094884e-05, + "loss": 1.7601, + "step": 54000 + }, + { + "epoch": 1.89415188275717, + "grad_norm": 5.13318395614624, + "learning_rate": 
2.0478775208092562e-05, + "loss": 1.6185, + "step": 54025 + }, + { + "epoch": 1.8950283991304957, + "grad_norm": 4.01314115524292, + "learning_rate": 2.0462543339090237e-05, + "loss": 1.5362, + "step": 54050 + }, + { + "epoch": 1.8959049155038215, + "grad_norm": 4.515827178955078, + "learning_rate": 2.044631147008791e-05, + "loss": 1.8, + "step": 54075 + }, + { + "epoch": 1.8967814318771474, + "grad_norm": 6.994966506958008, + "learning_rate": 2.043007960108559e-05, + "loss": 1.6095, + "step": 54100 + }, + { + "epoch": 1.8976579482504734, + "grad_norm": 5.409581661224365, + "learning_rate": 2.0413847732083264e-05, + "loss": 1.6865, + "step": 54125 + }, + { + "epoch": 1.8985344646237992, + "grad_norm": 4.429877281188965, + "learning_rate": 2.039761586308094e-05, + "loss": 1.6996, + "step": 54150 + }, + { + "epoch": 1.899410980997125, + "grad_norm": 6.171942234039307, + "learning_rate": 2.0381383994078614e-05, + "loss": 1.7751, + "step": 54175 + }, + { + "epoch": 1.9002874973704509, + "grad_norm": 5.616784572601318, + "learning_rate": 2.0365152125076288e-05, + "loss": 1.7524, + "step": 54200 + }, + { + "epoch": 1.9011640137437769, + "grad_norm": 7.2618088722229, + "learning_rate": 2.0348920256073966e-05, + "loss": 1.8282, + "step": 54225 + }, + { + "epoch": 1.9020405301171026, + "grad_norm": 3.7744929790496826, + "learning_rate": 2.033268838707164e-05, + "loss": 1.6838, + "step": 54250 + }, + { + "epoch": 1.9029170464904284, + "grad_norm": 4.981478691101074, + "learning_rate": 2.031645651806932e-05, + "loss": 1.5563, + "step": 54275 + }, + { + "epoch": 1.9037935628637543, + "grad_norm": 4.463685512542725, + "learning_rate": 2.0300224649066994e-05, + "loss": 1.7584, + "step": 54300 + }, + { + "epoch": 1.90467007923708, + "grad_norm": 4.244435787200928, + "learning_rate": 2.028399278006467e-05, + "loss": 1.7948, + "step": 54325 + }, + { + "epoch": 1.905546595610406, + "grad_norm": 4.782602310180664, + "learning_rate": 2.0267760911062346e-05, + "loss": 1.7909, + "step": 54350 + }, + { + "epoch": 1.9064231119837318, + "grad_norm": 3.0142722129821777, + "learning_rate": 2.0251529042060017e-05, + "loss": 1.6113, + "step": 54375 + }, + { + "epoch": 1.9072996283570576, + "grad_norm": 8.575775146484375, + "learning_rate": 2.0235297173057695e-05, + "loss": 1.6572, + "step": 54400 + }, + { + "epoch": 1.9081761447303835, + "grad_norm": 5.08777379989624, + "learning_rate": 2.021906530405537e-05, + "loss": 1.6362, + "step": 54425 + }, + { + "epoch": 1.9090526611037095, + "grad_norm": 3.023665189743042, + "learning_rate": 2.0202833435053048e-05, + "loss": 1.5771, + "step": 54450 + }, + { + "epoch": 1.9099291774770353, + "grad_norm": 4.836973667144775, + "learning_rate": 2.0186601566050723e-05, + "loss": 1.5759, + "step": 54475 + }, + { + "epoch": 1.910805693850361, + "grad_norm": 5.158304214477539, + "learning_rate": 2.0170369697048397e-05, + "loss": 1.7299, + "step": 54500 + }, + { + "epoch": 1.911682210223687, + "grad_norm": 4.460534572601318, + "learning_rate": 2.0154137828046076e-05, + "loss": 1.6845, + "step": 54525 + }, + { + "epoch": 1.912558726597013, + "grad_norm": 3.7869298458099365, + "learning_rate": 2.013790595904375e-05, + "loss": 1.7239, + "step": 54550 + }, + { + "epoch": 1.9134352429703387, + "grad_norm": 3.6763241291046143, + "learning_rate": 2.0121674090041425e-05, + "loss": 1.5982, + "step": 54575 + }, + { + "epoch": 1.9143117593436645, + "grad_norm": 3.8734912872314453, + "learning_rate": 2.01054422210391e-05, + "loss": 1.6068, + "step": 54600 + }, + { + "epoch": 1.9151882757169902, + 
"grad_norm": 5.129932403564453, + "learning_rate": 2.0089210352036774e-05, + "loss": 1.8422, + "step": 54625 + }, + { + "epoch": 1.9160647920903162, + "grad_norm": 9.548328399658203, + "learning_rate": 2.0072978483034452e-05, + "loss": 1.7377, + "step": 54650 + }, + { + "epoch": 1.9169413084636422, + "grad_norm": 2.9938127994537354, + "learning_rate": 2.0056746614032127e-05, + "loss": 1.9545, + "step": 54675 + }, + { + "epoch": 1.917817824836968, + "grad_norm": 4.49973726272583, + "learning_rate": 2.0040514745029805e-05, + "loss": 1.7275, + "step": 54700 + }, + { + "epoch": 1.9186943412102937, + "grad_norm": 3.677746057510376, + "learning_rate": 2.002428287602748e-05, + "loss": 1.7391, + "step": 54725 + }, + { + "epoch": 1.9195708575836197, + "grad_norm": 7.230477809906006, + "learning_rate": 2.0008051007025154e-05, + "loss": 1.5913, + "step": 54750 + }, + { + "epoch": 1.9204473739569456, + "grad_norm": 9.183040618896484, + "learning_rate": 1.999181913802283e-05, + "loss": 1.8191, + "step": 54775 + }, + { + "epoch": 1.9213238903302714, + "grad_norm": 3.3522186279296875, + "learning_rate": 1.9975587269020503e-05, + "loss": 1.5659, + "step": 54800 + }, + { + "epoch": 1.9222004067035972, + "grad_norm": 4.5263848304748535, + "learning_rate": 1.995935540001818e-05, + "loss": 1.6058, + "step": 54825 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 5.750899791717529, + "learning_rate": 1.9943123531015856e-05, + "loss": 1.6691, + "step": 54850 + }, + { + "epoch": 1.9239534394502489, + "grad_norm": 3.090224504470825, + "learning_rate": 1.9926891662013534e-05, + "loss": 1.7005, + "step": 54875 + }, + { + "epoch": 1.9248299558235749, + "grad_norm": 4.633055210113525, + "learning_rate": 1.991065979301121e-05, + "loss": 1.7803, + "step": 54900 + }, + { + "epoch": 1.9257064721969006, + "grad_norm": 6.900075912475586, + "learning_rate": 1.9894427924008883e-05, + "loss": 1.6295, + "step": 54925 + }, + { + "epoch": 1.9265829885702264, + "grad_norm": 3.525984287261963, + "learning_rate": 1.9878196055006558e-05, + "loss": 1.6168, + "step": 54950 + }, + { + "epoch": 1.9274595049435523, + "grad_norm": 2.9258334636688232, + "learning_rate": 1.9861964186004233e-05, + "loss": 1.6087, + "step": 54975 + }, + { + "epoch": 1.9283360213168783, + "grad_norm": 7.73768949508667, + "learning_rate": 1.984573231700191e-05, + "loss": 1.7529, + "step": 55000 + }, + { + "epoch": 1.929212537690204, + "grad_norm": 5.033933162689209, + "learning_rate": 1.9829500447999585e-05, + "loss": 1.6794, + "step": 55025 + }, + { + "epoch": 1.9300890540635298, + "grad_norm": 2.903231382369995, + "learning_rate": 1.981326857899726e-05, + "loss": 1.6618, + "step": 55050 + }, + { + "epoch": 1.9309655704368558, + "grad_norm": 4.118696689605713, + "learning_rate": 1.9797036709994938e-05, + "loss": 1.736, + "step": 55075 + }, + { + "epoch": 1.9318420868101818, + "grad_norm": 3.7915968894958496, + "learning_rate": 1.9780804840992613e-05, + "loss": 1.6973, + "step": 55100 + }, + { + "epoch": 1.9327186031835075, + "grad_norm": 5.059043884277344, + "learning_rate": 1.9764572971990287e-05, + "loss": 1.6953, + "step": 55125 + }, + { + "epoch": 1.9335951195568333, + "grad_norm": 4.689755916595459, + "learning_rate": 1.9748341102987962e-05, + "loss": 1.7323, + "step": 55150 + }, + { + "epoch": 1.934471635930159, + "grad_norm": 5.350003242492676, + "learning_rate": 1.9732109233985637e-05, + "loss": 1.8228, + "step": 55175 + }, + { + "epoch": 1.935348152303485, + "grad_norm": 5.590778827667236, + "learning_rate": 1.9715877364983315e-05, + "loss": 
1.605, + "step": 55200 + }, + { + "epoch": 1.936224668676811, + "grad_norm": 3.97818922996521, + "learning_rate": 1.969964549598099e-05, + "loss": 1.7021, + "step": 55225 + }, + { + "epoch": 1.9371011850501367, + "grad_norm": 5.318004131317139, + "learning_rate": 1.9683413626978667e-05, + "loss": 1.705, + "step": 55250 + }, + { + "epoch": 1.9379777014234625, + "grad_norm": 3.8011457920074463, + "learning_rate": 1.9667181757976342e-05, + "loss": 1.7353, + "step": 55275 + }, + { + "epoch": 1.9388542177967885, + "grad_norm": 6.410901069641113, + "learning_rate": 1.9650949888974017e-05, + "loss": 1.7282, + "step": 55300 + }, + { + "epoch": 1.9397307341701144, + "grad_norm": 5.2905097007751465, + "learning_rate": 1.963471801997169e-05, + "loss": 1.6688, + "step": 55325 + }, + { + "epoch": 1.9406072505434402, + "grad_norm": 4.395038604736328, + "learning_rate": 1.9618486150969366e-05, + "loss": 1.7073, + "step": 55350 + }, + { + "epoch": 1.941483766916766, + "grad_norm": 5.7047624588012695, + "learning_rate": 1.9602254281967044e-05, + "loss": 1.6772, + "step": 55375 + }, + { + "epoch": 1.9423602832900917, + "grad_norm": 3.7461438179016113, + "learning_rate": 1.958602241296472e-05, + "loss": 1.5964, + "step": 55400 + }, + { + "epoch": 1.9432367996634177, + "grad_norm": 5.13286828994751, + "learning_rate": 1.9569790543962397e-05, + "loss": 1.7704, + "step": 55425 + }, + { + "epoch": 1.9441133160367436, + "grad_norm": 6.831084728240967, + "learning_rate": 1.955355867496007e-05, + "loss": 1.6993, + "step": 55450 + }, + { + "epoch": 1.9449898324100694, + "grad_norm": 4.41579532623291, + "learning_rate": 1.9537326805957746e-05, + "loss": 1.8126, + "step": 55475 + }, + { + "epoch": 1.9458663487833951, + "grad_norm": 3.7293365001678467, + "learning_rate": 1.9521094936955424e-05, + "loss": 1.6403, + "step": 55500 + }, + { + "epoch": 1.9467428651567211, + "grad_norm": 5.2591705322265625, + "learning_rate": 1.95048630679531e-05, + "loss": 1.5229, + "step": 55525 + }, + { + "epoch": 1.947619381530047, + "grad_norm": 4.664018630981445, + "learning_rate": 1.9488631198950773e-05, + "loss": 1.5987, + "step": 55550 + }, + { + "epoch": 1.9484958979033729, + "grad_norm": 8.392513275146484, + "learning_rate": 1.9472399329948448e-05, + "loss": 1.8112, + "step": 55575 + }, + { + "epoch": 1.9493724142766986, + "grad_norm": 4.846610069274902, + "learning_rate": 1.9456167460946123e-05, + "loss": 1.6121, + "step": 55600 + }, + { + "epoch": 1.9502489306500246, + "grad_norm": 4.277972221374512, + "learning_rate": 1.94399355919438e-05, + "loss": 1.5529, + "step": 55625 + }, + { + "epoch": 1.9511254470233506, + "grad_norm": 3.967294931411743, + "learning_rate": 1.9423703722941475e-05, + "loss": 1.7459, + "step": 55650 + }, + { + "epoch": 1.9520019633966763, + "grad_norm": 3.807868480682373, + "learning_rate": 1.9407471853939153e-05, + "loss": 1.667, + "step": 55675 + }, + { + "epoch": 1.952878479770002, + "grad_norm": 6.832434177398682, + "learning_rate": 1.9391239984936828e-05, + "loss": 1.6242, + "step": 55700 + }, + { + "epoch": 1.9537549961433278, + "grad_norm": 4.903378009796143, + "learning_rate": 1.9375008115934503e-05, + "loss": 1.79, + "step": 55725 + }, + { + "epoch": 1.9546315125166538, + "grad_norm": 4.66172981262207, + "learning_rate": 1.9358776246932177e-05, + "loss": 1.6904, + "step": 55750 + }, + { + "epoch": 1.9555080288899798, + "grad_norm": 5.605931282043457, + "learning_rate": 1.9342544377929852e-05, + "loss": 1.7964, + "step": 55775 + }, + { + "epoch": 1.9563845452633055, + "grad_norm": 
3.669130325317383, + "learning_rate": 1.932631250892753e-05, + "loss": 1.8263, + "step": 55800 + }, + { + "epoch": 1.9572610616366313, + "grad_norm": 4.108389854431152, + "learning_rate": 1.9310080639925205e-05, + "loss": 1.6828, + "step": 55825 + }, + { + "epoch": 1.9581375780099572, + "grad_norm": 4.478518962860107, + "learning_rate": 1.929384877092288e-05, + "loss": 1.7994, + "step": 55850 + }, + { + "epoch": 1.9590140943832832, + "grad_norm": 4.396457195281982, + "learning_rate": 1.9277616901920557e-05, + "loss": 1.5791, + "step": 55875 + }, + { + "epoch": 1.959890610756609, + "grad_norm": 3.828763723373413, + "learning_rate": 1.9261385032918232e-05, + "loss": 1.8153, + "step": 55900 + }, + { + "epoch": 1.9607671271299347, + "grad_norm": 10.078981399536133, + "learning_rate": 1.9245153163915906e-05, + "loss": 1.7751, + "step": 55925 + }, + { + "epoch": 1.9616436435032605, + "grad_norm": 4.609557628631592, + "learning_rate": 1.922892129491358e-05, + "loss": 1.7842, + "step": 55950 + }, + { + "epoch": 1.9625201598765865, + "grad_norm": 4.030450820922852, + "learning_rate": 1.921268942591126e-05, + "loss": 1.7134, + "step": 55975 + }, + { + "epoch": 1.9633966762499124, + "grad_norm": 6.944028854370117, + "learning_rate": 1.9196457556908934e-05, + "loss": 1.6535, + "step": 56000 + }, + { + "epoch": 1.9642731926232382, + "grad_norm": 9.504170417785645, + "learning_rate": 1.918022568790661e-05, + "loss": 1.6884, + "step": 56025 + }, + { + "epoch": 1.965149708996564, + "grad_norm": 4.647138595581055, + "learning_rate": 1.9163993818904286e-05, + "loss": 1.8026, + "step": 56050 + }, + { + "epoch": 1.96602622536989, + "grad_norm": 3.5368120670318604, + "learning_rate": 1.914776194990196e-05, + "loss": 1.7471, + "step": 56075 + }, + { + "epoch": 1.9669027417432159, + "grad_norm": 3.200273036956787, + "learning_rate": 1.9131530080899636e-05, + "loss": 1.6685, + "step": 56100 + }, + { + "epoch": 1.9677792581165416, + "grad_norm": 5.260920524597168, + "learning_rate": 1.911529821189731e-05, + "loss": 1.7621, + "step": 56125 + }, + { + "epoch": 1.9686557744898674, + "grad_norm": 4.630614280700684, + "learning_rate": 1.9099066342894985e-05, + "loss": 1.6251, + "step": 56150 + }, + { + "epoch": 1.9695322908631934, + "grad_norm": 4.830227851867676, + "learning_rate": 1.9082834473892663e-05, + "loss": 1.7863, + "step": 56175 + }, + { + "epoch": 1.9704088072365193, + "grad_norm": 3.753767967224121, + "learning_rate": 1.9066602604890338e-05, + "loss": 1.5553, + "step": 56200 + }, + { + "epoch": 1.971285323609845, + "grad_norm": 3.0593109130859375, + "learning_rate": 1.9050370735888016e-05, + "loss": 1.66, + "step": 56225 + }, + { + "epoch": 1.9721618399831709, + "grad_norm": 5.107424736022949, + "learning_rate": 1.903413886688569e-05, + "loss": 1.8056, + "step": 56250 + }, + { + "epoch": 1.9730383563564966, + "grad_norm": 4.697347164154053, + "learning_rate": 1.9017906997883365e-05, + "loss": 1.6408, + "step": 56275 + }, + { + "epoch": 1.9739148727298226, + "grad_norm": 3.1213080883026123, + "learning_rate": 1.900167512888104e-05, + "loss": 1.7266, + "step": 56300 + }, + { + "epoch": 1.9747913891031486, + "grad_norm": 7.092903137207031, + "learning_rate": 1.8985443259878714e-05, + "loss": 1.5278, + "step": 56325 + }, + { + "epoch": 1.9756679054764743, + "grad_norm": 5.596978664398193, + "learning_rate": 1.8969211390876392e-05, + "loss": 1.7139, + "step": 56350 + }, + { + "epoch": 1.9765444218498, + "grad_norm": 3.2355401515960693, + "learning_rate": 1.8952979521874067e-05, + "loss": 1.515, + "step": 56375 
+ }, + { + "epoch": 1.977420938223126, + "grad_norm": 5.783256530761719, + "learning_rate": 1.8936747652871742e-05, + "loss": 1.374, + "step": 56400 + }, + { + "epoch": 1.978297454596452, + "grad_norm": 8.129377365112305, + "learning_rate": 1.892051578386942e-05, + "loss": 1.8887, + "step": 56425 + }, + { + "epoch": 1.9791739709697778, + "grad_norm": 4.8233184814453125, + "learning_rate": 1.8904283914867094e-05, + "loss": 1.6734, + "step": 56450 + }, + { + "epoch": 1.9800504873431035, + "grad_norm": 7.95534610748291, + "learning_rate": 1.8888052045864772e-05, + "loss": 1.5969, + "step": 56475 + }, + { + "epoch": 1.9809270037164293, + "grad_norm": 5.142815113067627, + "learning_rate": 1.8871820176862444e-05, + "loss": 1.9276, + "step": 56500 + }, + { + "epoch": 1.9818035200897552, + "grad_norm": 4.5233540534973145, + "learning_rate": 1.8855588307860122e-05, + "loss": 1.6384, + "step": 56525 + }, + { + "epoch": 1.9826800364630812, + "grad_norm": 5.287750244140625, + "learning_rate": 1.8839356438857796e-05, + "loss": 1.7303, + "step": 56550 + }, + { + "epoch": 1.983556552836407, + "grad_norm": 10.074333190917969, + "learning_rate": 1.882312456985547e-05, + "loss": 1.5764, + "step": 56575 + }, + { + "epoch": 1.9844330692097327, + "grad_norm": 6.043591499328613, + "learning_rate": 1.880689270085315e-05, + "loss": 1.793, + "step": 56600 + }, + { + "epoch": 1.9853095855830587, + "grad_norm": 4.111767768859863, + "learning_rate": 1.8790660831850824e-05, + "loss": 1.6546, + "step": 56625 + }, + { + "epoch": 1.9861861019563847, + "grad_norm": 12.171656608581543, + "learning_rate": 1.8774428962848502e-05, + "loss": 1.8536, + "step": 56650 + }, + { + "epoch": 1.9870626183297104, + "grad_norm": 3.444023847579956, + "learning_rate": 1.8758197093846176e-05, + "loss": 1.6901, + "step": 56675 + }, + { + "epoch": 1.9879391347030362, + "grad_norm": 4.021968841552734, + "learning_rate": 1.874196522484385e-05, + "loss": 1.6858, + "step": 56700 + }, + { + "epoch": 1.9888156510763622, + "grad_norm": 4.813103675842285, + "learning_rate": 1.8725733355841526e-05, + "loss": 1.6255, + "step": 56725 + }, + { + "epoch": 1.989692167449688, + "grad_norm": 5.8629326820373535, + "learning_rate": 1.87095014868392e-05, + "loss": 1.8332, + "step": 56750 + }, + { + "epoch": 1.9905686838230139, + "grad_norm": 9.464312553405762, + "learning_rate": 1.869326961783688e-05, + "loss": 1.5589, + "step": 56775 + }, + { + "epoch": 1.9914452001963396, + "grad_norm": 5.148681640625, + "learning_rate": 1.8677037748834553e-05, + "loss": 1.8013, + "step": 56800 + }, + { + "epoch": 1.9923217165696654, + "grad_norm": 9.511919021606445, + "learning_rate": 1.8660805879832228e-05, + "loss": 1.6575, + "step": 56825 + }, + { + "epoch": 1.9931982329429914, + "grad_norm": 4.221864700317383, + "learning_rate": 1.8644574010829906e-05, + "loss": 1.6932, + "step": 56850 + }, + { + "epoch": 1.9940747493163173, + "grad_norm": 5.2890849113464355, + "learning_rate": 1.862834214182758e-05, + "loss": 1.7892, + "step": 56875 + }, + { + "epoch": 1.994951265689643, + "grad_norm": 3.6551027297973633, + "learning_rate": 1.8612110272825255e-05, + "loss": 1.5899, + "step": 56900 + }, + { + "epoch": 1.9958277820629688, + "grad_norm": 3.6625514030456543, + "learning_rate": 1.859587840382293e-05, + "loss": 1.7028, + "step": 56925 + }, + { + "epoch": 1.9967042984362948, + "grad_norm": 4.675689220428467, + "learning_rate": 1.8579646534820604e-05, + "loss": 1.7088, + "step": 56950 + }, + { + "epoch": 1.9975808148096208, + "grad_norm": 10.478850364685059, + "learning_rate": 
1.8563414665818282e-05, + "loss": 1.7325, + "step": 56975 + }, + { + "epoch": 1.9984573311829466, + "grad_norm": 10.61678409576416, + "learning_rate": 1.8547182796815957e-05, + "loss": 1.7375, + "step": 57000 + }, + { + "epoch": 1.9993338475562723, + "grad_norm": 5.076901912689209, + "learning_rate": 1.8530950927813635e-05, + "loss": 1.6795, + "step": 57025 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.3333567071032887, + "eval_f1_macro": 0.07143232786146277, + "eval_f1_micro": 0.3333567071032887, + "eval_f1_weighted": 0.1666871191763381, + "eval_loss": 1.7063689231872559, + "eval_precision_macro": 0.04762238672904124, + "eval_precision_micro": 0.3333567071032887, + "eval_precision_weighted": 0.1111266941707478, + "eval_recall_macro": 0.14285714285714285, + "eval_recall_micro": 0.3333567071032887, + "eval_recall_weighted": 0.3333567071032887, + "eval_runtime": 3201.6457, + "eval_samples_per_second": 4.454, + "eval_steps_per_second": 1.114, + "step": 57044 + }, + { + "epoch": 2.000210363929598, + "grad_norm": 7.432506561279297, + "learning_rate": 1.851471905881131e-05, + "loss": 1.6546, + "step": 57050 + }, + { + "epoch": 2.0010868803029243, + "grad_norm": 10.967360496520996, + "learning_rate": 1.8498487189808984e-05, + "loss": 1.7926, + "step": 57075 + }, + { + "epoch": 2.00196339667625, + "grad_norm": 5.753106117248535, + "learning_rate": 1.848225532080666e-05, + "loss": 1.6772, + "step": 57100 + }, + { + "epoch": 2.0028399130495758, + "grad_norm": 4.864542007446289, + "learning_rate": 1.8466023451804334e-05, + "loss": 1.8077, + "step": 57125 + }, + { + "epoch": 2.0037164294229015, + "grad_norm": 3.087035655975342, + "learning_rate": 1.844979158280201e-05, + "loss": 1.7172, + "step": 57150 + }, + { + "epoch": 2.0045929457962273, + "grad_norm": 8.269116401672363, + "learning_rate": 1.8433559713799686e-05, + "loss": 1.5707, + "step": 57175 + }, + { + "epoch": 2.0054694621695535, + "grad_norm": 4.868711948394775, + "learning_rate": 1.8417327844797364e-05, + "loss": 1.6263, + "step": 57200 + }, + { + "epoch": 2.006345978542879, + "grad_norm": 6.484753131866455, + "learning_rate": 1.840109597579504e-05, + "loss": 1.7931, + "step": 57225 + }, + { + "epoch": 2.007222494916205, + "grad_norm": 3.085186243057251, + "learning_rate": 1.8384864106792714e-05, + "loss": 1.6496, + "step": 57250 + }, + { + "epoch": 2.0080990112895307, + "grad_norm": 9.798725128173828, + "learning_rate": 1.8368632237790388e-05, + "loss": 1.6796, + "step": 57275 + }, + { + "epoch": 2.008975527662857, + "grad_norm": 3.8998429775238037, + "learning_rate": 1.8352400368788063e-05, + "loss": 1.6334, + "step": 57300 + }, + { + "epoch": 2.0098520440361827, + "grad_norm": 4.05152702331543, + "learning_rate": 1.833616849978574e-05, + "loss": 1.6416, + "step": 57325 + }, + { + "epoch": 2.0107285604095084, + "grad_norm": 8.541157722473145, + "learning_rate": 1.8319936630783416e-05, + "loss": 1.7063, + "step": 57350 + }, + { + "epoch": 2.011605076782834, + "grad_norm": 5.788800239562988, + "learning_rate": 1.830370476178109e-05, + "loss": 1.5167, + "step": 57375 + }, + { + "epoch": 2.0124815931561604, + "grad_norm": 3.804034471511841, + "learning_rate": 1.8287472892778768e-05, + "loss": 1.6035, + "step": 57400 + }, + { + "epoch": 2.013358109529486, + "grad_norm": 7.230899810791016, + "learning_rate": 1.8271241023776443e-05, + "loss": 1.7174, + "step": 57425 + }, + { + "epoch": 2.014234625902812, + "grad_norm": 3.460132598876953, + "learning_rate": 1.825500915477412e-05, + "loss": 1.4353, + "step": 57450 + }, + { + "epoch": 
2.0151111422761376, + "grad_norm": 4.252769947052002, + "learning_rate": 1.8238777285771792e-05, + "loss": 1.8115, + "step": 57475 + }, + { + "epoch": 2.0159876586494634, + "grad_norm": 5.621621131896973, + "learning_rate": 1.8222545416769467e-05, + "loss": 1.7821, + "step": 57500 + }, + { + "epoch": 2.0168641750227896, + "grad_norm": 3.6484017372131348, + "learning_rate": 1.8206313547767145e-05, + "loss": 1.5152, + "step": 57525 + }, + { + "epoch": 2.0177406913961153, + "grad_norm": 9.489485740661621, + "learning_rate": 1.819008167876482e-05, + "loss": 1.5889, + "step": 57550 + }, + { + "epoch": 2.018617207769441, + "grad_norm": 3.5841336250305176, + "learning_rate": 1.8173849809762497e-05, + "loss": 1.7606, + "step": 57575 + }, + { + "epoch": 2.019493724142767, + "grad_norm": 6.106287479400635, + "learning_rate": 1.8157617940760172e-05, + "loss": 1.8519, + "step": 57600 + }, + { + "epoch": 2.020370240516093, + "grad_norm": 9.883373260498047, + "learning_rate": 1.8141386071757847e-05, + "loss": 1.8422, + "step": 57625 + }, + { + "epoch": 2.021246756889419, + "grad_norm": 4.817384719848633, + "learning_rate": 1.8125154202755525e-05, + "loss": 1.5365, + "step": 57650 + }, + { + "epoch": 2.0221232732627445, + "grad_norm": 5.803076267242432, + "learning_rate": 1.8108922333753196e-05, + "loss": 1.864, + "step": 57675 + }, + { + "epoch": 2.0229997896360703, + "grad_norm": 3.99819278717041, + "learning_rate": 1.8092690464750874e-05, + "loss": 1.686, + "step": 57700 + }, + { + "epoch": 2.023876306009396, + "grad_norm": 5.014941215515137, + "learning_rate": 1.807645859574855e-05, + "loss": 1.5791, + "step": 57725 + }, + { + "epoch": 2.0247528223827223, + "grad_norm": 5.027705669403076, + "learning_rate": 1.8060226726746227e-05, + "loss": 1.6395, + "step": 57750 + }, + { + "epoch": 2.025629338756048, + "grad_norm": 3.5443620681762695, + "learning_rate": 1.80439948577439e-05, + "loss": 1.5912, + "step": 57775 + }, + { + "epoch": 2.0265058551293738, + "grad_norm": 3.816401720046997, + "learning_rate": 1.8027762988741576e-05, + "loss": 1.7031, + "step": 57800 + }, + { + "epoch": 2.0273823715026995, + "grad_norm": 5.3107099533081055, + "learning_rate": 1.8011531119739254e-05, + "loss": 1.7344, + "step": 57825 + }, + { + "epoch": 2.0282588878760257, + "grad_norm": 5.741453170776367, + "learning_rate": 1.799529925073693e-05, + "loss": 1.7215, + "step": 57850 + }, + { + "epoch": 2.0291354042493515, + "grad_norm": 8.53344440460205, + "learning_rate": 1.7979067381734603e-05, + "loss": 1.6265, + "step": 57875 + }, + { + "epoch": 2.030011920622677, + "grad_norm": 3.3225338459014893, + "learning_rate": 1.7962835512732278e-05, + "loss": 1.6403, + "step": 57900 + }, + { + "epoch": 2.030888436996003, + "grad_norm": 4.029701232910156, + "learning_rate": 1.7946603643729953e-05, + "loss": 1.7215, + "step": 57925 + }, + { + "epoch": 2.031764953369329, + "grad_norm": 4.863218307495117, + "learning_rate": 1.793037177472763e-05, + "loss": 1.5151, + "step": 57950 + }, + { + "epoch": 2.032641469742655, + "grad_norm": 5.0037150382995605, + "learning_rate": 1.7914139905725305e-05, + "loss": 1.4445, + "step": 57975 + }, + { + "epoch": 2.0335179861159807, + "grad_norm": 4.470105171203613, + "learning_rate": 1.7897908036722983e-05, + "loss": 1.6785, + "step": 58000 + }, + { + "epoch": 2.0343945024893064, + "grad_norm": 7.346751689910889, + "learning_rate": 1.7881676167720658e-05, + "loss": 1.7289, + "step": 58025 + }, + { + "epoch": 2.035271018862632, + "grad_norm": 7.421669960021973, + "learning_rate": 1.7865444298718333e-05, 
+ "loss": 1.6563, + "step": 58050 + }, + { + "epoch": 2.0361475352359584, + "grad_norm": 7.061256408691406, + "learning_rate": 1.7849212429716007e-05, + "loss": 1.5849, + "step": 58075 + }, + { + "epoch": 2.037024051609284, + "grad_norm": 3.7155234813690186, + "learning_rate": 1.7832980560713682e-05, + "loss": 1.5089, + "step": 58100 + }, + { + "epoch": 2.03790056798261, + "grad_norm": 4.816730499267578, + "learning_rate": 1.781674869171136e-05, + "loss": 1.6782, + "step": 58125 + }, + { + "epoch": 2.0387770843559356, + "grad_norm": 3.8499293327331543, + "learning_rate": 1.7800516822709035e-05, + "loss": 1.5393, + "step": 58150 + }, + { + "epoch": 2.039653600729262, + "grad_norm": 5.406007766723633, + "learning_rate": 1.778428495370671e-05, + "loss": 1.7026, + "step": 58175 + }, + { + "epoch": 2.0405301171025876, + "grad_norm": 7.620289325714111, + "learning_rate": 1.7768053084704387e-05, + "loss": 1.7841, + "step": 58200 + }, + { + "epoch": 2.0414066334759133, + "grad_norm": 5.739929676055908, + "learning_rate": 1.7751821215702062e-05, + "loss": 1.6318, + "step": 58225 + }, + { + "epoch": 2.042283149849239, + "grad_norm": 5.0432844161987305, + "learning_rate": 1.7735589346699737e-05, + "loss": 1.7897, + "step": 58250 + }, + { + "epoch": 2.043159666222565, + "grad_norm": 3.3888237476348877, + "learning_rate": 1.771935747769741e-05, + "loss": 1.7691, + "step": 58275 + }, + { + "epoch": 2.044036182595891, + "grad_norm": 5.027014255523682, + "learning_rate": 1.770312560869509e-05, + "loss": 1.6551, + "step": 58300 + }, + { + "epoch": 2.044912698969217, + "grad_norm": 5.7204108238220215, + "learning_rate": 1.7686893739692764e-05, + "loss": 1.7726, + "step": 58325 + }, + { + "epoch": 2.0457892153425425, + "grad_norm": 8.256714820861816, + "learning_rate": 1.767066187069044e-05, + "loss": 1.5923, + "step": 58350 + }, + { + "epoch": 2.0466657317158683, + "grad_norm": 4.614947319030762, + "learning_rate": 1.7654430001688117e-05, + "loss": 1.6899, + "step": 58375 + }, + { + "epoch": 2.0475422480891945, + "grad_norm": 6.58522367477417, + "learning_rate": 1.763819813268579e-05, + "loss": 1.6727, + "step": 58400 + }, + { + "epoch": 2.0484187644625202, + "grad_norm": 4.371238708496094, + "learning_rate": 1.7621966263683466e-05, + "loss": 1.7055, + "step": 58425 + }, + { + "epoch": 2.049295280835846, + "grad_norm": 6.451613903045654, + "learning_rate": 1.760573439468114e-05, + "loss": 1.5889, + "step": 58450 + }, + { + "epoch": 2.0501717972091718, + "grad_norm": 9.235331535339355, + "learning_rate": 1.7589502525678815e-05, + "loss": 1.6223, + "step": 58475 + }, + { + "epoch": 2.0510483135824975, + "grad_norm": 4.4301557540893555, + "learning_rate": 1.7573270656676493e-05, + "loss": 1.5719, + "step": 58500 + }, + { + "epoch": 2.0519248299558237, + "grad_norm": 10.541121482849121, + "learning_rate": 1.7557038787674168e-05, + "loss": 1.739, + "step": 58525 + }, + { + "epoch": 2.0528013463291495, + "grad_norm": 7.134521961212158, + "learning_rate": 1.7540806918671846e-05, + "loss": 1.7188, + "step": 58550 + }, + { + "epoch": 2.053677862702475, + "grad_norm": 3.9003493785858154, + "learning_rate": 1.752457504966952e-05, + "loss": 1.5343, + "step": 58575 + }, + { + "epoch": 2.054554379075801, + "grad_norm": 4.250467777252197, + "learning_rate": 1.7508343180667195e-05, + "loss": 1.7018, + "step": 58600 + }, + { + "epoch": 2.055430895449127, + "grad_norm": 4.874830722808838, + "learning_rate": 1.7492111311664873e-05, + "loss": 1.8252, + "step": 58625 + }, + { + "epoch": 2.056307411822453, + "grad_norm": 
7.257506370544434, + "learning_rate": 1.7475879442662545e-05, + "loss": 1.6927, + "step": 58650 + }, + { + "epoch": 2.0571839281957787, + "grad_norm": 3.7201359272003174, + "learning_rate": 1.7459647573660223e-05, + "loss": 1.6037, + "step": 58675 + }, + { + "epoch": 2.0580604445691044, + "grad_norm": 6.291917324066162, + "learning_rate": 1.7443415704657897e-05, + "loss": 1.8128, + "step": 58700 + }, + { + "epoch": 2.0589369609424306, + "grad_norm": 11.657938957214355, + "learning_rate": 1.7427183835655572e-05, + "loss": 1.7732, + "step": 58725 + }, + { + "epoch": 2.0598134773157564, + "grad_norm": 4.301278114318848, + "learning_rate": 1.741095196665325e-05, + "loss": 1.7512, + "step": 58750 + }, + { + "epoch": 2.060689993689082, + "grad_norm": 5.392992973327637, + "learning_rate": 1.7394720097650925e-05, + "loss": 1.5763, + "step": 58775 + }, + { + "epoch": 2.061566510062408, + "grad_norm": 5.2184739112854, + "learning_rate": 1.7378488228648603e-05, + "loss": 1.8036, + "step": 58800 + }, + { + "epoch": 2.0624430264357336, + "grad_norm": 8.054828643798828, + "learning_rate": 1.7362256359646277e-05, + "loss": 1.6846, + "step": 58825 + }, + { + "epoch": 2.06331954280906, + "grad_norm": 5.296640872955322, + "learning_rate": 1.7346024490643952e-05, + "loss": 1.5423, + "step": 58850 + }, + { + "epoch": 2.0641960591823856, + "grad_norm": 4.9606218338012695, + "learning_rate": 1.7329792621641626e-05, + "loss": 1.6605, + "step": 58875 + }, + { + "epoch": 2.0650725755557113, + "grad_norm": 5.2522664070129395, + "learning_rate": 1.73135607526393e-05, + "loss": 1.8493, + "step": 58900 + }, + { + "epoch": 2.065949091929037, + "grad_norm": 5.308340549468994, + "learning_rate": 1.729732888363698e-05, + "loss": 1.6538, + "step": 58925 + }, + { + "epoch": 2.0668256083023633, + "grad_norm": 5.424072265625, + "learning_rate": 1.7281097014634654e-05, + "loss": 1.6603, + "step": 58950 + }, + { + "epoch": 2.067702124675689, + "grad_norm": 5.351873874664307, + "learning_rate": 1.7264865145632332e-05, + "loss": 1.6807, + "step": 58975 + }, + { + "epoch": 2.068578641049015, + "grad_norm": 4.736924171447754, + "learning_rate": 1.7248633276630007e-05, + "loss": 1.759, + "step": 59000 + }, + { + "epoch": 2.0694551574223405, + "grad_norm": 5.189515113830566, + "learning_rate": 1.723240140762768e-05, + "loss": 1.7507, + "step": 59025 + }, + { + "epoch": 2.0703316737956663, + "grad_norm": 7.811767101287842, + "learning_rate": 1.7216169538625356e-05, + "loss": 1.8046, + "step": 59050 + }, + { + "epoch": 2.0712081901689925, + "grad_norm": 3.700206756591797, + "learning_rate": 1.719993766962303e-05, + "loss": 1.7187, + "step": 59075 + }, + { + "epoch": 2.0720847065423182, + "grad_norm": 3.8614251613616943, + "learning_rate": 1.718370580062071e-05, + "loss": 1.734, + "step": 59100 + }, + { + "epoch": 2.072961222915644, + "grad_norm": 4.857114315032959, + "learning_rate": 1.7167473931618383e-05, + "loss": 1.7361, + "step": 59125 + }, + { + "epoch": 2.0738377392889698, + "grad_norm": 5.390267848968506, + "learning_rate": 1.7151242062616058e-05, + "loss": 1.8102, + "step": 59150 + }, + { + "epoch": 2.074714255662296, + "grad_norm": 3.427516460418701, + "learning_rate": 1.7135010193613736e-05, + "loss": 1.7002, + "step": 59175 + }, + { + "epoch": 2.0755907720356217, + "grad_norm": 3.411686420440674, + "learning_rate": 1.711877832461141e-05, + "loss": 1.7175, + "step": 59200 + }, + { + "epoch": 2.0764672884089475, + "grad_norm": 5.7324981689453125, + "learning_rate": 1.7102546455609085e-05, + "loss": 1.682, + "step": 59225 + }, 
+ { + "epoch": 2.077343804782273, + "grad_norm": 5.107734680175781, + "learning_rate": 1.708631458660676e-05, + "loss": 1.7229, + "step": 59250 + }, + { + "epoch": 2.078220321155599, + "grad_norm": 6.808684349060059, + "learning_rate": 1.7070082717604434e-05, + "loss": 1.5962, + "step": 59275 + }, + { + "epoch": 2.079096837528925, + "grad_norm": 3.8583710193634033, + "learning_rate": 1.7053850848602112e-05, + "loss": 1.7129, + "step": 59300 + }, + { + "epoch": 2.079973353902251, + "grad_norm": 7.351189136505127, + "learning_rate": 1.7037618979599787e-05, + "loss": 1.6763, + "step": 59325 + }, + { + "epoch": 2.0808498702755767, + "grad_norm": 3.8302559852600098, + "learning_rate": 1.7021387110597465e-05, + "loss": 1.8345, + "step": 59350 + }, + { + "epoch": 2.0817263866489024, + "grad_norm": 3.0672607421875, + "learning_rate": 1.700515524159514e-05, + "loss": 1.6442, + "step": 59375 + }, + { + "epoch": 2.0826029030222286, + "grad_norm": 3.8337478637695312, + "learning_rate": 1.6988923372592814e-05, + "loss": 1.7136, + "step": 59400 + }, + { + "epoch": 2.0834794193955544, + "grad_norm": 4.651517868041992, + "learning_rate": 1.697269150359049e-05, + "loss": 1.6616, + "step": 59425 + }, + { + "epoch": 2.08435593576888, + "grad_norm": 10.039414405822754, + "learning_rate": 1.6956459634588164e-05, + "loss": 1.7019, + "step": 59450 + }, + { + "epoch": 2.085232452142206, + "grad_norm": 10.67033576965332, + "learning_rate": 1.6940227765585842e-05, + "loss": 1.4044, + "step": 59475 + }, + { + "epoch": 2.086108968515532, + "grad_norm": 5.995604515075684, + "learning_rate": 1.6923995896583516e-05, + "loss": 1.4052, + "step": 59500 + }, + { + "epoch": 2.086985484888858, + "grad_norm": 3.4435312747955322, + "learning_rate": 1.6907764027581194e-05, + "loss": 1.2537, + "step": 59525 + }, + { + "epoch": 2.0878620012621836, + "grad_norm": 4.10497522354126, + "learning_rate": 1.689153215857887e-05, + "loss": 1.0041, + "step": 59550 + }, + { + "epoch": 2.0887385176355093, + "grad_norm": 5.6024556159973145, + "learning_rate": 1.6875300289576544e-05, + "loss": 1.208, + "step": 59575 + }, + { + "epoch": 2.089615034008835, + "grad_norm": 4.466493129730225, + "learning_rate": 1.685906842057422e-05, + "loss": 1.2982, + "step": 59600 + }, + { + "epoch": 2.0904915503821613, + "grad_norm": 4.77345609664917, + "learning_rate": 1.6842836551571893e-05, + "loss": 1.304, + "step": 59625 + }, + { + "epoch": 2.091368066755487, + "grad_norm": 4.033942222595215, + "learning_rate": 1.682660468256957e-05, + "loss": 1.2167, + "step": 59650 + }, + { + "epoch": 2.092244583128813, + "grad_norm": 9.618032455444336, + "learning_rate": 1.6810372813567246e-05, + "loss": 1.3884, + "step": 59675 + }, + { + "epoch": 2.0931210995021385, + "grad_norm": 7.514223575592041, + "learning_rate": 1.679414094456492e-05, + "loss": 1.1409, + "step": 59700 + }, + { + "epoch": 2.0939976158754647, + "grad_norm": 9.025437355041504, + "learning_rate": 1.67779090755626e-05, + "loss": 1.2512, + "step": 59725 + }, + { + "epoch": 2.0948741322487905, + "grad_norm": 7.592188835144043, + "learning_rate": 1.6761677206560273e-05, + "loss": 1.2114, + "step": 59750 + }, + { + "epoch": 2.0957506486221162, + "grad_norm": 7.753513813018799, + "learning_rate": 1.674544533755795e-05, + "loss": 1.2264, + "step": 59775 + }, + { + "epoch": 2.096627164995442, + "grad_norm": 8.737753868103027, + "learning_rate": 1.6729213468555626e-05, + "loss": 1.1652, + "step": 59800 + }, + { + "epoch": 2.097503681368768, + "grad_norm": 5.105305194854736, + "learning_rate": 
1.6712981599553297e-05, + "loss": 1.7003, + "step": 59825 + }, + { + "epoch": 2.098380197742094, + "grad_norm": 4.2381792068481445, + "learning_rate": 1.6696749730550975e-05, + "loss": 1.3654, + "step": 59850 + }, + { + "epoch": 2.0992567141154197, + "grad_norm": 7.61479377746582, + "learning_rate": 1.668051786154865e-05, + "loss": 1.339, + "step": 59875 + }, + { + "epoch": 2.1001332304887455, + "grad_norm": 8.126875877380371, + "learning_rate": 1.6664285992546328e-05, + "loss": 1.6103, + "step": 59900 + }, + { + "epoch": 2.101009746862071, + "grad_norm": 6.034974098205566, + "learning_rate": 1.6648054123544002e-05, + "loss": 1.8082, + "step": 59925 + }, + { + "epoch": 2.1018862632353974, + "grad_norm": 4.626751899719238, + "learning_rate": 1.663182225454168e-05, + "loss": 1.6374, + "step": 59950 + }, + { + "epoch": 2.102762779608723, + "grad_norm": 8.887164115905762, + "learning_rate": 1.6615590385539355e-05, + "loss": 1.4562, + "step": 59975 + }, + { + "epoch": 2.103639295982049, + "grad_norm": 5.6790771484375, + "learning_rate": 1.659935851653703e-05, + "loss": 1.4569, + "step": 60000 + }, + { + "epoch": 2.1045158123553747, + "grad_norm": 3.6733789443969727, + "learning_rate": 1.6583126647534704e-05, + "loss": 1.4483, + "step": 60025 + }, + { + "epoch": 2.105392328728701, + "grad_norm": 5.965392589569092, + "learning_rate": 1.656689477853238e-05, + "loss": 1.6071, + "step": 60050 + }, + { + "epoch": 2.1062688451020266, + "grad_norm": 5.106157302856445, + "learning_rate": 1.6550662909530057e-05, + "loss": 1.4597, + "step": 60075 + }, + { + "epoch": 2.1071453614753524, + "grad_norm": 11.229660034179688, + "learning_rate": 1.653443104052773e-05, + "loss": 1.4195, + "step": 60100 + }, + { + "epoch": 2.108021877848678, + "grad_norm": 6.3902177810668945, + "learning_rate": 1.6518199171525406e-05, + "loss": 1.3964, + "step": 60125 + }, + { + "epoch": 2.108898394222004, + "grad_norm": 7.153317928314209, + "learning_rate": 1.6501967302523084e-05, + "loss": 1.2465, + "step": 60150 + }, + { + "epoch": 2.10977491059533, + "grad_norm": 7.027910232543945, + "learning_rate": 1.648573543352076e-05, + "loss": 1.4681, + "step": 60175 + }, + { + "epoch": 2.110651426968656, + "grad_norm": 6.757160186767578, + "learning_rate": 1.6469503564518434e-05, + "loss": 1.2417, + "step": 60200 + }, + { + "epoch": 2.1115279433419816, + "grad_norm": 5.105618953704834, + "learning_rate": 1.6453271695516108e-05, + "loss": 1.4845, + "step": 60225 + }, + { + "epoch": 2.1124044597153073, + "grad_norm": 4.939294815063477, + "learning_rate": 1.6437039826513783e-05, + "loss": 1.178, + "step": 60250 + }, + { + "epoch": 2.1132809760886335, + "grad_norm": 5.382005214691162, + "learning_rate": 1.642080795751146e-05, + "loss": 1.1554, + "step": 60275 + }, + { + "epoch": 2.1141574924619593, + "grad_norm": 9.851510047912598, + "learning_rate": 1.6404576088509136e-05, + "loss": 1.6571, + "step": 60300 + }, + { + "epoch": 2.115034008835285, + "grad_norm": 5.952871799468994, + "learning_rate": 1.6388344219506814e-05, + "loss": 1.3153, + "step": 60325 + }, + { + "epoch": 2.115910525208611, + "grad_norm": 6.180793762207031, + "learning_rate": 1.6372112350504488e-05, + "loss": 1.3026, + "step": 60350 + }, + { + "epoch": 2.1167870415819365, + "grad_norm": 5.639267921447754, + "learning_rate": 1.6355880481502163e-05, + "loss": 1.6399, + "step": 60375 + }, + { + "epoch": 2.1176635579552627, + "grad_norm": 8.813505172729492, + "learning_rate": 1.6339648612499837e-05, + "loss": 1.4286, + "step": 60400 + }, + { + "epoch": 2.1185400743285885, + 
"grad_norm": 10.103287696838379, + "learning_rate": 1.6323416743497512e-05, + "loss": 1.3808, + "step": 60425 + }, + { + "epoch": 2.1194165907019142, + "grad_norm": 6.443492889404297, + "learning_rate": 1.630718487449519e-05, + "loss": 1.2739, + "step": 60450 + }, + { + "epoch": 2.12029310707524, + "grad_norm": 8.28731918334961, + "learning_rate": 1.6290953005492865e-05, + "loss": 1.3298, + "step": 60475 + }, + { + "epoch": 2.121169623448566, + "grad_norm": 7.13043737411499, + "learning_rate": 1.6274721136490543e-05, + "loss": 1.2798, + "step": 60500 + }, + { + "epoch": 2.122046139821892, + "grad_norm": 11.668686866760254, + "learning_rate": 1.6258489267488217e-05, + "loss": 1.2074, + "step": 60525 + }, + { + "epoch": 2.1229226561952177, + "grad_norm": 3.5660481452941895, + "learning_rate": 1.6242257398485892e-05, + "loss": 1.2858, + "step": 60550 + }, + { + "epoch": 2.1237991725685434, + "grad_norm": 8.111200332641602, + "learning_rate": 1.6226025529483567e-05, + "loss": 1.1299, + "step": 60575 + }, + { + "epoch": 2.1246756889418696, + "grad_norm": 4.568535804748535, + "learning_rate": 1.620979366048124e-05, + "loss": 1.344, + "step": 60600 + }, + { + "epoch": 2.1255522053151954, + "grad_norm": 7.337769031524658, + "learning_rate": 1.619356179147892e-05, + "loss": 1.2956, + "step": 60625 + }, + { + "epoch": 2.126428721688521, + "grad_norm": 6.132130146026611, + "learning_rate": 1.6177329922476594e-05, + "loss": 1.1869, + "step": 60650 + }, + { + "epoch": 2.127305238061847, + "grad_norm": 7.997233867645264, + "learning_rate": 1.616109805347427e-05, + "loss": 1.2055, + "step": 60675 + }, + { + "epoch": 2.1281817544351727, + "grad_norm": 10.488957405090332, + "learning_rate": 1.6144866184471947e-05, + "loss": 1.1909, + "step": 60700 + }, + { + "epoch": 2.129058270808499, + "grad_norm": 6.912450313568115, + "learning_rate": 1.612863431546962e-05, + "loss": 1.1443, + "step": 60725 + }, + { + "epoch": 2.1299347871818246, + "grad_norm": 3.465562582015991, + "learning_rate": 1.61124024464673e-05, + "loss": 1.0926, + "step": 60750 + }, + { + "epoch": 2.1308113035551504, + "grad_norm": 8.77373218536377, + "learning_rate": 1.609617057746497e-05, + "loss": 1.3224, + "step": 60775 + }, + { + "epoch": 2.131687819928476, + "grad_norm": 7.831766128540039, + "learning_rate": 1.6079938708462645e-05, + "loss": 1.1091, + "step": 60800 + }, + { + "epoch": 2.1325643363018023, + "grad_norm": 5.911093235015869, + "learning_rate": 1.6063706839460323e-05, + "loss": 1.1232, + "step": 60825 + }, + { + "epoch": 2.133440852675128, + "grad_norm": 7.513334274291992, + "learning_rate": 1.6047474970457998e-05, + "loss": 1.423, + "step": 60850 + }, + { + "epoch": 2.134317369048454, + "grad_norm": 5.474644184112549, + "learning_rate": 1.6031243101455676e-05, + "loss": 1.1404, + "step": 60875 + }, + { + "epoch": 2.1351938854217796, + "grad_norm": 20.941823959350586, + "learning_rate": 1.601501123245335e-05, + "loss": 1.1727, + "step": 60900 + }, + { + "epoch": 2.1360704017951058, + "grad_norm": 3.337054491043091, + "learning_rate": 1.5998779363451025e-05, + "loss": 1.1772, + "step": 60925 + }, + { + "epoch": 2.1369469181684315, + "grad_norm": 6.570552349090576, + "learning_rate": 1.5982547494448703e-05, + "loss": 1.7387, + "step": 60950 + }, + { + "epoch": 2.1378234345417573, + "grad_norm": 15.178180694580078, + "learning_rate": 1.5966315625446378e-05, + "loss": 1.48, + "step": 60975 + }, + { + "epoch": 2.138699950915083, + "grad_norm": 9.649445533752441, + "learning_rate": 1.5950083756444053e-05, + "loss": 1.0396, + "step": 
61000 + }, + { + "epoch": 2.139576467288409, + "grad_norm": 5.6376118659973145, + "learning_rate": 1.5933851887441727e-05, + "loss": 1.3296, + "step": 61025 + }, + { + "epoch": 2.140452983661735, + "grad_norm": 11.736661911010742, + "learning_rate": 1.5917620018439405e-05, + "loss": 1.4538, + "step": 61050 + }, + { + "epoch": 2.1413295000350607, + "grad_norm": 10.321537017822266, + "learning_rate": 1.590138814943708e-05, + "loss": 1.1794, + "step": 61075 + }, + { + "epoch": 2.1422060164083865, + "grad_norm": 5.693462371826172, + "learning_rate": 1.5885156280434755e-05, + "loss": 1.2688, + "step": 61100 + }, + { + "epoch": 2.1430825327817122, + "grad_norm": 11.95571517944336, + "learning_rate": 1.5868924411432433e-05, + "loss": 1.0795, + "step": 61125 + }, + { + "epoch": 2.143959049155038, + "grad_norm": 4.360915660858154, + "learning_rate": 1.5852692542430107e-05, + "loss": 1.0324, + "step": 61150 + }, + { + "epoch": 2.144835565528364, + "grad_norm": 12.591435432434082, + "learning_rate": 1.5836460673427782e-05, + "loss": 1.3808, + "step": 61175 + }, + { + "epoch": 2.14571208190169, + "grad_norm": 4.115639686584473, + "learning_rate": 1.5820228804425457e-05, + "loss": 1.2975, + "step": 61200 + }, + { + "epoch": 2.1465885982750157, + "grad_norm": 11.415609359741211, + "learning_rate": 1.580399693542313e-05, + "loss": 1.1037, + "step": 61225 + }, + { + "epoch": 2.1474651146483414, + "grad_norm": 5.071483135223389, + "learning_rate": 1.578776506642081e-05, + "loss": 1.2992, + "step": 61250 + }, + { + "epoch": 2.1483416310216676, + "grad_norm": 11.591309547424316, + "learning_rate": 1.5771533197418484e-05, + "loss": 1.2275, + "step": 61275 + }, + { + "epoch": 2.1492181473949934, + "grad_norm": 7.068187713623047, + "learning_rate": 1.5755301328416162e-05, + "loss": 1.6482, + "step": 61300 + }, + { + "epoch": 2.150094663768319, + "grad_norm": 12.756220817565918, + "learning_rate": 1.5739069459413837e-05, + "loss": 1.2699, + "step": 61325 + }, + { + "epoch": 2.150971180141645, + "grad_norm": 9.182514190673828, + "learning_rate": 1.572283759041151e-05, + "loss": 1.2582, + "step": 61350 + }, + { + "epoch": 2.151847696514971, + "grad_norm": 8.481664657592773, + "learning_rate": 1.5706605721409186e-05, + "loss": 1.2476, + "step": 61375 + }, + { + "epoch": 2.152724212888297, + "grad_norm": 8.210360527038574, + "learning_rate": 1.569037385240686e-05, + "loss": 1.2822, + "step": 61400 + }, + { + "epoch": 2.1536007292616226, + "grad_norm": 6.498918533325195, + "learning_rate": 1.567414198340454e-05, + "loss": 1.1902, + "step": 61425 + }, + { + "epoch": 2.1544772456349484, + "grad_norm": 15.439064979553223, + "learning_rate": 1.5657910114402213e-05, + "loss": 1.2777, + "step": 61450 + }, + { + "epoch": 2.155353762008274, + "grad_norm": 5.281651020050049, + "learning_rate": 1.5641678245399888e-05, + "loss": 1.3163, + "step": 61475 + }, + { + "epoch": 2.1562302783816003, + "grad_norm": 11.64437484741211, + "learning_rate": 1.5625446376397566e-05, + "loss": 1.2628, + "step": 61500 + }, + { + "epoch": 2.157106794754926, + "grad_norm": 6.896574974060059, + "learning_rate": 1.560921450739524e-05, + "loss": 0.8704, + "step": 61525 + }, + { + "epoch": 2.157983311128252, + "grad_norm": 8.92774772644043, + "learning_rate": 1.5592982638392915e-05, + "loss": 1.055, + "step": 61550 + }, + { + "epoch": 2.1588598275015776, + "grad_norm": 11.023283958435059, + "learning_rate": 1.557675076939059e-05, + "loss": 1.2083, + "step": 61575 + }, + { + "epoch": 2.1597363438749038, + "grad_norm": 7.64619255065918, + 
"learning_rate": 1.5560518900388268e-05, + "loss": 1.224, + "step": 61600 + }, + { + "epoch": 2.1606128602482295, + "grad_norm": 8.311941146850586, + "learning_rate": 1.5544287031385943e-05, + "loss": 1.0876, + "step": 61625 + }, + { + "epoch": 2.1614893766215553, + "grad_norm": 16.156932830810547, + "learning_rate": 1.5528055162383617e-05, + "loss": 1.2417, + "step": 61650 + }, + { + "epoch": 2.162365892994881, + "grad_norm": 11.313617706298828, + "learning_rate": 1.5511823293381295e-05, + "loss": 1.4548, + "step": 61675 + }, + { + "epoch": 2.163242409368207, + "grad_norm": 4.596912384033203, + "learning_rate": 1.549559142437897e-05, + "loss": 0.9107, + "step": 61700 + }, + { + "epoch": 2.164118925741533, + "grad_norm": 5.13827657699585, + "learning_rate": 1.5479359555376648e-05, + "loss": 1.2166, + "step": 61725 + }, + { + "epoch": 2.1649954421148587, + "grad_norm": 11.189650535583496, + "learning_rate": 1.546312768637432e-05, + "loss": 1.622, + "step": 61750 + }, + { + "epoch": 2.1658719584881845, + "grad_norm": 7.132916450500488, + "learning_rate": 1.5446895817371994e-05, + "loss": 1.2637, + "step": 61775 + }, + { + "epoch": 2.1667484748615102, + "grad_norm": 7.302834987640381, + "learning_rate": 1.5430663948369672e-05, + "loss": 1.42, + "step": 61800 + }, + { + "epoch": 2.1676249912348364, + "grad_norm": 11.534622192382812, + "learning_rate": 1.5414432079367347e-05, + "loss": 1.6326, + "step": 61825 + }, + { + "epoch": 2.168501507608162, + "grad_norm": 10.724017143249512, + "learning_rate": 1.5398200210365025e-05, + "loss": 1.1857, + "step": 61850 + }, + { + "epoch": 2.169378023981488, + "grad_norm": 9.165660858154297, + "learning_rate": 1.53819683413627e-05, + "loss": 1.1867, + "step": 61875 + }, + { + "epoch": 2.1702545403548137, + "grad_norm": 4.067252159118652, + "learning_rate": 1.5365736472360374e-05, + "loss": 1.1166, + "step": 61900 + }, + { + "epoch": 2.17113105672814, + "grad_norm": 8.982287406921387, + "learning_rate": 1.5349504603358052e-05, + "loss": 1.011, + "step": 61925 + }, + { + "epoch": 2.1720075731014656, + "grad_norm": 9.483511924743652, + "learning_rate": 1.5333272734355723e-05, + "loss": 1.0147, + "step": 61950 + }, + { + "epoch": 2.1728840894747914, + "grad_norm": 7.0537238121032715, + "learning_rate": 1.53170408653534e-05, + "loss": 1.1044, + "step": 61975 + }, + { + "epoch": 2.173760605848117, + "grad_norm": 16.747358322143555, + "learning_rate": 1.5300808996351076e-05, + "loss": 1.1767, + "step": 62000 + }, + { + "epoch": 2.174637122221443, + "grad_norm": 6.221466064453125, + "learning_rate": 1.528457712734875e-05, + "loss": 1.1167, + "step": 62025 + }, + { + "epoch": 2.175513638594769, + "grad_norm": 5.9452643394470215, + "learning_rate": 1.526834525834643e-05, + "loss": 1.0865, + "step": 62050 + }, + { + "epoch": 2.176390154968095, + "grad_norm": 11.402565956115723, + "learning_rate": 1.5252113389344103e-05, + "loss": 1.262, + "step": 62075 + }, + { + "epoch": 2.1772666713414206, + "grad_norm": 7.334470272064209, + "learning_rate": 1.523588152034178e-05, + "loss": 1.4264, + "step": 62100 + }, + { + "epoch": 2.1781431877147464, + "grad_norm": 10.389479637145996, + "learning_rate": 1.5219649651339454e-05, + "loss": 0.9541, + "step": 62125 + }, + { + "epoch": 2.1790197040880726, + "grad_norm": 7.104258060455322, + "learning_rate": 1.5203417782337129e-05, + "loss": 1.1572, + "step": 62150 + }, + { + "epoch": 2.1798962204613983, + "grad_norm": 6.933248996734619, + "learning_rate": 1.5187185913334807e-05, + "loss": 1.0572, + "step": 62175 + }, + { + "epoch": 
2.180772736834724, + "grad_norm": 9.419309616088867, + "learning_rate": 1.517095404433248e-05, + "loss": 1.0456, + "step": 62200 + }, + { + "epoch": 2.18164925320805, + "grad_norm": 8.014698028564453, + "learning_rate": 1.5154722175330158e-05, + "loss": 1.3509, + "step": 62225 + }, + { + "epoch": 2.1825257695813756, + "grad_norm": 6.70542049407959, + "learning_rate": 1.5138490306327832e-05, + "loss": 1.4004, + "step": 62250 + }, + { + "epoch": 2.1834022859547018, + "grad_norm": 7.753915786743164, + "learning_rate": 1.5122258437325509e-05, + "loss": 1.1195, + "step": 62275 + }, + { + "epoch": 2.1842788023280275, + "grad_norm": 6.60853910446167, + "learning_rate": 1.5106026568323183e-05, + "loss": 1.1001, + "step": 62300 + }, + { + "epoch": 2.1851553187013533, + "grad_norm": 10.639272689819336, + "learning_rate": 1.5089794699320858e-05, + "loss": 1.0716, + "step": 62325 + }, + { + "epoch": 2.186031835074679, + "grad_norm": 9.584762573242188, + "learning_rate": 1.5073562830318536e-05, + "loss": 0.8633, + "step": 62350 + }, + { + "epoch": 2.186908351448005, + "grad_norm": 11.992773056030273, + "learning_rate": 1.505733096131621e-05, + "loss": 1.1148, + "step": 62375 + }, + { + "epoch": 2.187784867821331, + "grad_norm": 8.48607063293457, + "learning_rate": 1.5041099092313887e-05, + "loss": 1.2328, + "step": 62400 + }, + { + "epoch": 2.1886613841946567, + "grad_norm": 10.821176528930664, + "learning_rate": 1.5024867223311562e-05, + "loss": 1.4255, + "step": 62425 + }, + { + "epoch": 2.1895379005679825, + "grad_norm": 12.91912841796875, + "learning_rate": 1.5008635354309236e-05, + "loss": 1.2937, + "step": 62450 + }, + { + "epoch": 2.1904144169413087, + "grad_norm": 14.881612777709961, + "learning_rate": 1.4992403485306913e-05, + "loss": 1.1314, + "step": 62475 + }, + { + "epoch": 2.1912909333146344, + "grad_norm": 12.38964557647705, + "learning_rate": 1.4976171616304587e-05, + "loss": 1.1281, + "step": 62500 + }, + { + "epoch": 2.19216744968796, + "grad_norm": 15.818326950073242, + "learning_rate": 1.4959939747302265e-05, + "loss": 1.3672, + "step": 62525 + }, + { + "epoch": 2.193043966061286, + "grad_norm": 11.195785522460938, + "learning_rate": 1.494370787829994e-05, + "loss": 1.1913, + "step": 62550 + }, + { + "epoch": 2.1939204824346117, + "grad_norm": 12.351496696472168, + "learning_rate": 1.4927476009297615e-05, + "loss": 1.046, + "step": 62575 + }, + { + "epoch": 2.194796998807938, + "grad_norm": 17.33974838256836, + "learning_rate": 1.4911244140295291e-05, + "loss": 1.3043, + "step": 62600 + }, + { + "epoch": 2.1956735151812636, + "grad_norm": 9.725691795349121, + "learning_rate": 1.4895012271292966e-05, + "loss": 1.1749, + "step": 62625 + }, + { + "epoch": 2.1965500315545894, + "grad_norm": 8.873906135559082, + "learning_rate": 1.4878780402290644e-05, + "loss": 1.1888, + "step": 62650 + }, + { + "epoch": 2.197426547927915, + "grad_norm": 6.923158645629883, + "learning_rate": 1.4862548533288317e-05, + "loss": 1.3612, + "step": 62675 + }, + { + "epoch": 2.1983030643012413, + "grad_norm": 15.75735092163086, + "learning_rate": 1.4846316664285991e-05, + "loss": 1.2355, + "step": 62700 + }, + { + "epoch": 2.199179580674567, + "grad_norm": 8.291440963745117, + "learning_rate": 1.483008479528367e-05, + "loss": 1.2309, + "step": 62725 + }, + { + "epoch": 2.200056097047893, + "grad_norm": 5.6305365562438965, + "learning_rate": 1.4813852926281344e-05, + "loss": 1.3653, + "step": 62750 + }, + { + "epoch": 2.2009326134212186, + "grad_norm": 6.8870720863342285, + "learning_rate": 
1.479762105727902e-05, + "loss": 1.1144, + "step": 62775 + }, + { + "epoch": 2.201809129794545, + "grad_norm": 9.557476997375488, + "learning_rate": 1.4781389188276695e-05, + "loss": 1.2263, + "step": 62800 + }, + { + "epoch": 2.2026856461678705, + "grad_norm": 6.789280414581299, + "learning_rate": 1.4765157319274373e-05, + "loss": 1.0878, + "step": 62825 + }, + { + "epoch": 2.2035621625411963, + "grad_norm": 7.609008312225342, + "learning_rate": 1.4748925450272048e-05, + "loss": 1.1756, + "step": 62850 + }, + { + "epoch": 2.204438678914522, + "grad_norm": 8.568477630615234, + "learning_rate": 1.473269358126972e-05, + "loss": 0.8791, + "step": 62875 + }, + { + "epoch": 2.205315195287848, + "grad_norm": 8.70605754852295, + "learning_rate": 1.4716461712267399e-05, + "loss": 0.9333, + "step": 62900 + }, + { + "epoch": 2.206191711661174, + "grad_norm": 5.8045125007629395, + "learning_rate": 1.4700229843265073e-05, + "loss": 1.0379, + "step": 62925 + }, + { + "epoch": 2.2070682280344998, + "grad_norm": 5.341181755065918, + "learning_rate": 1.468399797426275e-05, + "loss": 1.222, + "step": 62950 + }, + { + "epoch": 2.2079447444078255, + "grad_norm": 10.990700721740723, + "learning_rate": 1.4667766105260424e-05, + "loss": 1.2193, + "step": 62975 + }, + { + "epoch": 2.2088212607811513, + "grad_norm": 6.929116725921631, + "learning_rate": 1.4651534236258099e-05, + "loss": 1.0662, + "step": 63000 + }, + { + "epoch": 2.209697777154477, + "grad_norm": 8.430336952209473, + "learning_rate": 1.4635302367255777e-05, + "loss": 1.279, + "step": 63025 + }, + { + "epoch": 2.210574293527803, + "grad_norm": 9.159374237060547, + "learning_rate": 1.4619070498253452e-05, + "loss": 1.2463, + "step": 63050 + }, + { + "epoch": 2.211450809901129, + "grad_norm": 11.71422290802002, + "learning_rate": 1.4602838629251128e-05, + "loss": 1.1213, + "step": 63075 + }, + { + "epoch": 2.2123273262744547, + "grad_norm": 5.386176586151123, + "learning_rate": 1.4586606760248803e-05, + "loss": 1.1826, + "step": 63100 + }, + { + "epoch": 2.2132038426477805, + "grad_norm": 12.135611534118652, + "learning_rate": 1.4570374891246477e-05, + "loss": 1.2138, + "step": 63125 + }, + { + "epoch": 2.2140803590211067, + "grad_norm": 13.777759552001953, + "learning_rate": 1.4554143022244154e-05, + "loss": 1.2459, + "step": 63150 + }, + { + "epoch": 2.2149568753944324, + "grad_norm": 13.447957992553711, + "learning_rate": 1.4537911153241828e-05, + "loss": 1.1244, + "step": 63175 + }, + { + "epoch": 2.215833391767758, + "grad_norm": 9.967288970947266, + "learning_rate": 1.4521679284239506e-05, + "loss": 1.1687, + "step": 63200 + }, + { + "epoch": 2.216709908141084, + "grad_norm": 9.691473007202148, + "learning_rate": 1.4505447415237181e-05, + "loss": 1.2126, + "step": 63225 + }, + { + "epoch": 2.21758642451441, + "grad_norm": 5.531303405761719, + "learning_rate": 1.4489215546234856e-05, + "loss": 0.9968, + "step": 63250 + }, + { + "epoch": 2.218462940887736, + "grad_norm": 11.491808891296387, + "learning_rate": 1.4472983677232532e-05, + "loss": 1.3788, + "step": 63275 + }, + { + "epoch": 2.2193394572610616, + "grad_norm": 7.630377292633057, + "learning_rate": 1.4456751808230207e-05, + "loss": 0.9965, + "step": 63300 + }, + { + "epoch": 2.2202159736343874, + "grad_norm": 8.51891803741455, + "learning_rate": 1.4440519939227885e-05, + "loss": 1.2286, + "step": 63325 + }, + { + "epoch": 2.221092490007713, + "grad_norm": 11.05774211883545, + "learning_rate": 1.442428807022556e-05, + "loss": 1.103, + "step": 63350 + }, + { + "epoch": 2.2219690063810393, 
+ "grad_norm": 6.963611125946045, + "learning_rate": 1.4408056201223236e-05, + "loss": 1.0311, + "step": 63375 + }, + { + "epoch": 2.222845522754365, + "grad_norm": 11.58055591583252, + "learning_rate": 1.439182433222091e-05, + "loss": 1.1147, + "step": 63400 + }, + { + "epoch": 2.223722039127691, + "grad_norm": 11.923283576965332, + "learning_rate": 1.4375592463218585e-05, + "loss": 1.1129, + "step": 63425 + }, + { + "epoch": 2.2245985555010166, + "grad_norm": 8.796027183532715, + "learning_rate": 1.4359360594216261e-05, + "loss": 1.26, + "step": 63450 + }, + { + "epoch": 2.225475071874343, + "grad_norm": 18.604293823242188, + "learning_rate": 1.4343128725213936e-05, + "loss": 1.1104, + "step": 63475 + }, + { + "epoch": 2.2263515882476685, + "grad_norm": 13.390384674072266, + "learning_rate": 1.4326896856211614e-05, + "loss": 1.0971, + "step": 63500 + }, + { + "epoch": 2.2272281046209943, + "grad_norm": 7.11267614364624, + "learning_rate": 1.4310664987209288e-05, + "loss": 1.1117, + "step": 63525 + }, + { + "epoch": 2.22810462099432, + "grad_norm": 8.017780303955078, + "learning_rate": 1.4294433118206963e-05, + "loss": 1.2771, + "step": 63550 + }, + { + "epoch": 2.2289811373676462, + "grad_norm": 20.030256271362305, + "learning_rate": 1.427820124920464e-05, + "loss": 1.3119, + "step": 63575 + }, + { + "epoch": 2.229857653740972, + "grad_norm": 12.353796005249023, + "learning_rate": 1.4261969380202314e-05, + "loss": 1.105, + "step": 63600 + }, + { + "epoch": 2.2307341701142978, + "grad_norm": 8.753910064697266, + "learning_rate": 1.4245737511199992e-05, + "loss": 1.2876, + "step": 63625 + }, + { + "epoch": 2.2316106864876235, + "grad_norm": 4.85818338394165, + "learning_rate": 1.4229505642197665e-05, + "loss": 0.9897, + "step": 63650 + }, + { + "epoch": 2.2324872028609493, + "grad_norm": 18.149354934692383, + "learning_rate": 1.421327377319534e-05, + "loss": 1.0471, + "step": 63675 + }, + { + "epoch": 2.2333637192342755, + "grad_norm": 6.053328514099121, + "learning_rate": 1.4197041904193018e-05, + "loss": 1.2905, + "step": 63700 + }, + { + "epoch": 2.234240235607601, + "grad_norm": 7.7093329429626465, + "learning_rate": 1.4180810035190692e-05, + "loss": 1.363, + "step": 63725 + }, + { + "epoch": 2.235116751980927, + "grad_norm": 9.11639404296875, + "learning_rate": 1.4164578166188369e-05, + "loss": 0.9319, + "step": 63750 + }, + { + "epoch": 2.2359932683542527, + "grad_norm": 8.412013053894043, + "learning_rate": 1.4148346297186043e-05, + "loss": 1.0993, + "step": 63775 + }, + { + "epoch": 2.236869784727579, + "grad_norm": 9.40140438079834, + "learning_rate": 1.4132114428183718e-05, + "loss": 1.3123, + "step": 63800 + }, + { + "epoch": 2.2377463011009047, + "grad_norm": 9.621429443359375, + "learning_rate": 1.4115882559181396e-05, + "loss": 1.2648, + "step": 63825 + }, + { + "epoch": 2.2386228174742304, + "grad_norm": 7.497134208679199, + "learning_rate": 1.4099650690179069e-05, + "loss": 0.9452, + "step": 63850 + }, + { + "epoch": 2.239499333847556, + "grad_norm": 15.31733512878418, + "learning_rate": 1.4083418821176747e-05, + "loss": 1.0469, + "step": 63875 + }, + { + "epoch": 2.240375850220882, + "grad_norm": 10.040287971496582, + "learning_rate": 1.4067186952174422e-05, + "loss": 1.1029, + "step": 63900 + }, + { + "epoch": 2.241252366594208, + "grad_norm": 13.715876579284668, + "learning_rate": 1.4050955083172098e-05, + "loss": 1.1265, + "step": 63925 + }, + { + "epoch": 2.242128882967534, + "grad_norm": 8.147451400756836, + "learning_rate": 1.4034723214169773e-05, + "loss": 1.1401, + 
"step": 63950 + }, + { + "epoch": 2.2430053993408596, + "grad_norm": 8.11771011352539, + "learning_rate": 1.4018491345167447e-05, + "loss": 1.0209, + "step": 63975 + }, + { + "epoch": 2.2438819157141854, + "grad_norm": 7.193262100219727, + "learning_rate": 1.4002259476165125e-05, + "loss": 1.4929, + "step": 64000 + }, + { + "epoch": 2.2447584320875116, + "grad_norm": 18.863521575927734, + "learning_rate": 1.39860276071628e-05, + "loss": 0.9684, + "step": 64025 + }, + { + "epoch": 2.2456349484608373, + "grad_norm": 14.127700805664062, + "learning_rate": 1.3969795738160476e-05, + "loss": 1.361, + "step": 64050 + }, + { + "epoch": 2.246511464834163, + "grad_norm": 12.539345741271973, + "learning_rate": 1.3953563869158151e-05, + "loss": 1.2985, + "step": 64075 + }, + { + "epoch": 2.247387981207489, + "grad_norm": 11.785778999328613, + "learning_rate": 1.3937332000155826e-05, + "loss": 1.1288, + "step": 64100 + }, + { + "epoch": 2.2482644975808146, + "grad_norm": 8.454031944274902, + "learning_rate": 1.3921100131153502e-05, + "loss": 1.3195, + "step": 64125 + }, + { + "epoch": 2.249141013954141, + "grad_norm": 9.731660842895508, + "learning_rate": 1.3904868262151177e-05, + "loss": 0.9719, + "step": 64150 + }, + { + "epoch": 2.2500175303274665, + "grad_norm": 10.866177558898926, + "learning_rate": 1.3888636393148855e-05, + "loss": 1.3056, + "step": 64175 + }, + { + "epoch": 2.2508940467007923, + "grad_norm": 8.91787338256836, + "learning_rate": 1.387240452414653e-05, + "loss": 0.9775, + "step": 64200 + }, + { + "epoch": 2.251770563074118, + "grad_norm": 6.956072807312012, + "learning_rate": 1.3856172655144204e-05, + "loss": 1.242, + "step": 64225 + }, + { + "epoch": 2.2526470794474442, + "grad_norm": 10.768413543701172, + "learning_rate": 1.383994078614188e-05, + "loss": 1.0621, + "step": 64250 + }, + { + "epoch": 2.25352359582077, + "grad_norm": 10.155693054199219, + "learning_rate": 1.3823708917139555e-05, + "loss": 1.2745, + "step": 64275 + }, + { + "epoch": 2.2544001121940958, + "grad_norm": 7.30828857421875, + "learning_rate": 1.3807477048137233e-05, + "loss": 1.0845, + "step": 64300 + }, + { + "epoch": 2.2552766285674215, + "grad_norm": 6.549935340881348, + "learning_rate": 1.3791245179134906e-05, + "loss": 1.2149, + "step": 64325 + }, + { + "epoch": 2.2561531449407477, + "grad_norm": 9.448326110839844, + "learning_rate": 1.377501331013258e-05, + "loss": 1.129, + "step": 64350 + }, + { + "epoch": 2.2570296613140735, + "grad_norm": 8.09896183013916, + "learning_rate": 1.3758781441130259e-05, + "loss": 1.3812, + "step": 64375 + }, + { + "epoch": 2.257906177687399, + "grad_norm": 8.403566360473633, + "learning_rate": 1.3742549572127933e-05, + "loss": 1.1513, + "step": 64400 + }, + { + "epoch": 2.258782694060725, + "grad_norm": 6.6899871826171875, + "learning_rate": 1.372631770312561e-05, + "loss": 1.0878, + "step": 64425 + }, + { + "epoch": 2.2596592104340507, + "grad_norm": 7.417532920837402, + "learning_rate": 1.3710085834123284e-05, + "loss": 1.2159, + "step": 64450 + }, + { + "epoch": 2.260535726807377, + "grad_norm": 15.373078346252441, + "learning_rate": 1.3693853965120962e-05, + "loss": 1.1967, + "step": 64475 + }, + { + "epoch": 2.2614122431807027, + "grad_norm": 7.596785545349121, + "learning_rate": 1.3677622096118637e-05, + "loss": 1.158, + "step": 64500 + }, + { + "epoch": 2.2622887595540284, + "grad_norm": 6.736906051635742, + "learning_rate": 1.3661390227116312e-05, + "loss": 1.0744, + "step": 64525 + }, + { + "epoch": 2.263165275927354, + "grad_norm": 12.176616668701172, + 
"learning_rate": 1.3645158358113988e-05, + "loss": 1.1423, + "step": 64550 + }, + { + "epoch": 2.2640417923006804, + "grad_norm": 14.4893798828125, + "learning_rate": 1.3628926489111663e-05, + "loss": 1.1938, + "step": 64575 + }, + { + "epoch": 2.264918308674006, + "grad_norm": 16.336078643798828, + "learning_rate": 1.3612694620109339e-05, + "loss": 1.1487, + "step": 64600 + }, + { + "epoch": 2.265794825047332, + "grad_norm": 13.916101455688477, + "learning_rate": 1.3596462751107014e-05, + "loss": 1.5846, + "step": 64625 + }, + { + "epoch": 2.2666713414206576, + "grad_norm": 7.394772052764893, + "learning_rate": 1.3580230882104688e-05, + "loss": 1.2296, + "step": 64650 + }, + { + "epoch": 2.267547857793984, + "grad_norm": 11.338119506835938, + "learning_rate": 1.3563999013102366e-05, + "loss": 1.3305, + "step": 64675 + }, + { + "epoch": 2.2684243741673096, + "grad_norm": 8.167732238769531, + "learning_rate": 1.3547767144100041e-05, + "loss": 0.9005, + "step": 64700 + }, + { + "epoch": 2.2693008905406353, + "grad_norm": 8.0704345703125, + "learning_rate": 1.3531535275097717e-05, + "loss": 1.1773, + "step": 64725 + }, + { + "epoch": 2.270177406913961, + "grad_norm": 9.083067893981934, + "learning_rate": 1.3515303406095392e-05, + "loss": 1.2738, + "step": 64750 + }, + { + "epoch": 2.271053923287287, + "grad_norm": 12.996097564697266, + "learning_rate": 1.3499071537093067e-05, + "loss": 1.0907, + "step": 64775 + }, + { + "epoch": 2.271930439660613, + "grad_norm": 11.757902145385742, + "learning_rate": 1.3482839668090745e-05, + "loss": 0.9622, + "step": 64800 + }, + { + "epoch": 2.272806956033939, + "grad_norm": 12.404463768005371, + "learning_rate": 1.3466607799088418e-05, + "loss": 1.3914, + "step": 64825 + }, + { + "epoch": 2.2736834724072645, + "grad_norm": 8.176536560058594, + "learning_rate": 1.3450375930086096e-05, + "loss": 1.0768, + "step": 64850 + }, + { + "epoch": 2.2745599887805903, + "grad_norm": 14.900890350341797, + "learning_rate": 1.343414406108377e-05, + "loss": 1.1238, + "step": 64875 + }, + { + "epoch": 2.275436505153916, + "grad_norm": 20.78076171875, + "learning_rate": 1.3417912192081445e-05, + "loss": 1.2414, + "step": 64900 + }, + { + "epoch": 2.2763130215272422, + "grad_norm": 7.360983848571777, + "learning_rate": 1.3401680323079121e-05, + "loss": 1.0232, + "step": 64925 + }, + { + "epoch": 2.277189537900568, + "grad_norm": 6.995347023010254, + "learning_rate": 1.3385448454076796e-05, + "loss": 1.3225, + "step": 64950 + }, + { + "epoch": 2.2780660542738937, + "grad_norm": 9.152095794677734, + "learning_rate": 1.3369216585074474e-05, + "loss": 1.0567, + "step": 64975 + }, + { + "epoch": 2.2789425706472195, + "grad_norm": 5.39893913269043, + "learning_rate": 1.3352984716072148e-05, + "loss": 1.0878, + "step": 65000 + }, + { + "epoch": 2.2798190870205457, + "grad_norm": 11.251779556274414, + "learning_rate": 1.3336752847069825e-05, + "loss": 0.9965, + "step": 65025 + }, + { + "epoch": 2.2806956033938715, + "grad_norm": 11.079448699951172, + "learning_rate": 1.33205209780675e-05, + "loss": 1.1683, + "step": 65050 + }, + { + "epoch": 2.281572119767197, + "grad_norm": 8.452425956726074, + "learning_rate": 1.3304289109065174e-05, + "loss": 0.996, + "step": 65075 + }, + { + "epoch": 2.282448636140523, + "grad_norm": 9.498518943786621, + "learning_rate": 1.328805724006285e-05, + "loss": 1.1423, + "step": 65100 + }, + { + "epoch": 2.283325152513849, + "grad_norm": 6.586195945739746, + "learning_rate": 1.3271825371060525e-05, + "loss": 1.1997, + "step": 65125 + }, + { + "epoch": 
2.284201668887175, + "grad_norm": 7.409008979797363, + "learning_rate": 1.3255593502058203e-05, + "loss": 0.9656, + "step": 65150 + }, + { + "epoch": 2.2850781852605007, + "grad_norm": 15.804064750671387, + "learning_rate": 1.3239361633055878e-05, + "loss": 1.1133, + "step": 65175 + }, + { + "epoch": 2.2859547016338264, + "grad_norm": 9.292071342468262, + "learning_rate": 1.3223129764053552e-05, + "loss": 1.3483, + "step": 65200 + }, + { + "epoch": 2.286831218007152, + "grad_norm": 8.236437797546387, + "learning_rate": 1.3206897895051229e-05, + "loss": 1.0843, + "step": 65225 + }, + { + "epoch": 2.2877077343804784, + "grad_norm": 6.235188961029053, + "learning_rate": 1.3190666026048903e-05, + "loss": 1.1264, + "step": 65250 + }, + { + "epoch": 2.288584250753804, + "grad_norm": 14.295860290527344, + "learning_rate": 1.3174434157046581e-05, + "loss": 1.2697, + "step": 65275 + }, + { + "epoch": 2.28946076712713, + "grad_norm": 12.850435256958008, + "learning_rate": 1.3158202288044254e-05, + "loss": 0.9444, + "step": 65300 + }, + { + "epoch": 2.2903372835004556, + "grad_norm": 14.328096389770508, + "learning_rate": 1.3141970419041929e-05, + "loss": 1.3435, + "step": 65325 + }, + { + "epoch": 2.291213799873782, + "grad_norm": 4.334450721740723, + "learning_rate": 1.3125738550039607e-05, + "loss": 1.0277, + "step": 65350 + }, + { + "epoch": 2.2920903162471076, + "grad_norm": 10.577120780944824, + "learning_rate": 1.3109506681037282e-05, + "loss": 0.9517, + "step": 65375 + }, + { + "epoch": 2.2929668326204333, + "grad_norm": 6.530296802520752, + "learning_rate": 1.3093274812034958e-05, + "loss": 1.2533, + "step": 65400 + }, + { + "epoch": 2.293843348993759, + "grad_norm": 13.176822662353516, + "learning_rate": 1.3077042943032633e-05, + "loss": 1.474, + "step": 65425 + }, + { + "epoch": 2.2947198653670853, + "grad_norm": 17.602054595947266, + "learning_rate": 1.3060811074030307e-05, + "loss": 1.3468, + "step": 65450 + }, + { + "epoch": 2.295596381740411, + "grad_norm": 8.113541603088379, + "learning_rate": 1.3044579205027985e-05, + "loss": 0.9533, + "step": 65475 + }, + { + "epoch": 2.296472898113737, + "grad_norm": 9.917032241821289, + "learning_rate": 1.3028347336025658e-05, + "loss": 1.3302, + "step": 65500 + }, + { + "epoch": 2.2973494144870625, + "grad_norm": 6.521631717681885, + "learning_rate": 1.3012115467023336e-05, + "loss": 1.3015, + "step": 65525 + }, + { + "epoch": 2.2982259308603883, + "grad_norm": 6.809088230133057, + "learning_rate": 1.2995883598021011e-05, + "loss": 1.3289, + "step": 65550 + }, + { + "epoch": 2.2991024472337145, + "grad_norm": 7.34030294418335, + "learning_rate": 1.2979651729018687e-05, + "loss": 1.2915, + "step": 65575 + }, + { + "epoch": 2.2999789636070402, + "grad_norm": 11.594886779785156, + "learning_rate": 1.2963419860016362e-05, + "loss": 1.3886, + "step": 65600 + }, + { + "epoch": 2.300855479980366, + "grad_norm": 15.413215637207031, + "learning_rate": 1.2947187991014037e-05, + "loss": 1.3848, + "step": 65625 + }, + { + "epoch": 2.3017319963536917, + "grad_norm": 6.280229091644287, + "learning_rate": 1.2930956122011715e-05, + "loss": 1.3388, + "step": 65650 + }, + { + "epoch": 2.3026085127270175, + "grad_norm": 5.499985694885254, + "learning_rate": 1.291472425300939e-05, + "loss": 1.0535, + "step": 65675 + }, + { + "epoch": 2.3034850291003437, + "grad_norm": 6.137710094451904, + "learning_rate": 1.2898492384007066e-05, + "loss": 1.4381, + "step": 65700 + }, + { + "epoch": 2.3043615454736694, + "grad_norm": 15.223346710205078, + "learning_rate": 
1.288226051500474e-05, + "loss": 1.0938, + "step": 65725 + }, + { + "epoch": 2.305238061846995, + "grad_norm": 6.983646869659424, + "learning_rate": 1.2866028646002415e-05, + "loss": 1.1013, + "step": 65750 + }, + { + "epoch": 2.3061145782203214, + "grad_norm": 7.031818866729736, + "learning_rate": 1.2849796777000091e-05, + "loss": 1.0016, + "step": 65775 + }, + { + "epoch": 2.306991094593647, + "grad_norm": 10.256093978881836, + "learning_rate": 1.2833564907997766e-05, + "loss": 1.1998, + "step": 65800 + }, + { + "epoch": 2.307867610966973, + "grad_norm": 8.508296966552734, + "learning_rate": 1.2817333038995444e-05, + "loss": 0.9351, + "step": 65825 + }, + { + "epoch": 2.3087441273402987, + "grad_norm": 8.450778007507324, + "learning_rate": 1.2801101169993119e-05, + "loss": 1.8145, + "step": 65850 + }, + { + "epoch": 2.3096206437136244, + "grad_norm": 9.001132011413574, + "learning_rate": 1.2784869300990793e-05, + "loss": 1.6872, + "step": 65875 + }, + { + "epoch": 2.3104971600869506, + "grad_norm": 10.638721466064453, + "learning_rate": 1.276863743198847e-05, + "loss": 0.997, + "step": 65900 + }, + { + "epoch": 2.3113736764602764, + "grad_norm": 5.741235256195068, + "learning_rate": 1.2752405562986144e-05, + "loss": 1.1964, + "step": 65925 + }, + { + "epoch": 2.312250192833602, + "grad_norm": 4.804197788238525, + "learning_rate": 1.2736173693983822e-05, + "loss": 1.0686, + "step": 65950 + }, + { + "epoch": 2.313126709206928, + "grad_norm": 7.462785720825195, + "learning_rate": 1.2719941824981497e-05, + "loss": 1.3415, + "step": 65975 + }, + { + "epoch": 2.3140032255802536, + "grad_norm": 5.075049877166748, + "learning_rate": 1.270370995597917e-05, + "loss": 1.0885, + "step": 66000 + }, + { + "epoch": 2.31487974195358, + "grad_norm": 6.830626964569092, + "learning_rate": 1.2687478086976848e-05, + "loss": 1.1337, + "step": 66025 + }, + { + "epoch": 2.3157562583269056, + "grad_norm": 6.266919136047363, + "learning_rate": 1.2671246217974523e-05, + "loss": 1.317, + "step": 66050 + }, + { + "epoch": 2.3166327747002313, + "grad_norm": 6.521954536437988, + "learning_rate": 1.2655014348972199e-05, + "loss": 1.0031, + "step": 66075 + }, + { + "epoch": 2.317509291073557, + "grad_norm": 9.182901382446289, + "learning_rate": 1.2638782479969874e-05, + "loss": 1.2213, + "step": 66100 + }, + { + "epoch": 2.3183858074468833, + "grad_norm": 10.247505187988281, + "learning_rate": 1.2622550610967552e-05, + "loss": 1.0759, + "step": 66125 + }, + { + "epoch": 2.319262323820209, + "grad_norm": 6.451937198638916, + "learning_rate": 1.2606318741965226e-05, + "loss": 1.1272, + "step": 66150 + }, + { + "epoch": 2.320138840193535, + "grad_norm": 10.546768188476562, + "learning_rate": 1.2590086872962901e-05, + "loss": 1.322, + "step": 66175 + }, + { + "epoch": 2.3210153565668605, + "grad_norm": 16.464622497558594, + "learning_rate": 1.2573855003960577e-05, + "loss": 1.1806, + "step": 66200 + }, + { + "epoch": 2.3218918729401867, + "grad_norm": 9.041464805603027, + "learning_rate": 1.2557623134958252e-05, + "loss": 1.2188, + "step": 66225 + }, + { + "epoch": 2.3227683893135125, + "grad_norm": 8.639899253845215, + "learning_rate": 1.2541391265955928e-05, + "loss": 1.0531, + "step": 66250 + }, + { + "epoch": 2.3236449056868382, + "grad_norm": 6.391017913818359, + "learning_rate": 1.2525159396953603e-05, + "loss": 1.125, + "step": 66275 + }, + { + "epoch": 2.324521422060164, + "grad_norm": 6.832110404968262, + "learning_rate": 1.2508927527951278e-05, + "loss": 1.425, + "step": 66300 + }, + { + "epoch": 
2.3253979384334897, + "grad_norm": 6.540305137634277, + "learning_rate": 1.2492695658948956e-05, + "loss": 1.6026, + "step": 66325 + }, + { + "epoch": 2.326274454806816, + "grad_norm": 4.358699798583984, + "learning_rate": 1.247646378994663e-05, + "loss": 1.5728, + "step": 66350 + }, + { + "epoch": 2.3271509711801417, + "grad_norm": 7.278059959411621, + "learning_rate": 1.2460231920944305e-05, + "loss": 1.5081, + "step": 66375 + }, + { + "epoch": 2.3280274875534674, + "grad_norm": 4.623327255249023, + "learning_rate": 1.2444000051941981e-05, + "loss": 1.6399, + "step": 66400 + }, + { + "epoch": 2.328904003926793, + "grad_norm": 12.707106590270996, + "learning_rate": 1.2427768182939658e-05, + "loss": 1.6587, + "step": 66425 + }, + { + "epoch": 2.3297805203001194, + "grad_norm": 10.422568321228027, + "learning_rate": 1.2411536313937334e-05, + "loss": 1.8639, + "step": 66450 + }, + { + "epoch": 2.330657036673445, + "grad_norm": 4.712951183319092, + "learning_rate": 1.2395304444935007e-05, + "loss": 1.2808, + "step": 66475 + }, + { + "epoch": 2.331533553046771, + "grad_norm": 5.33419942855835, + "learning_rate": 1.2379072575932683e-05, + "loss": 1.4289, + "step": 66500 + }, + { + "epoch": 2.3324100694200967, + "grad_norm": 3.6061112880706787, + "learning_rate": 1.236284070693036e-05, + "loss": 1.1902, + "step": 66525 + }, + { + "epoch": 2.333286585793423, + "grad_norm": 2.9957661628723145, + "learning_rate": 1.2346608837928036e-05, + "loss": 1.3346, + "step": 66550 + }, + { + "epoch": 2.3341631021667486, + "grad_norm": 6.258781909942627, + "learning_rate": 1.233037696892571e-05, + "loss": 1.1473, + "step": 66575 + }, + { + "epoch": 2.3350396185400744, + "grad_norm": 12.420341491699219, + "learning_rate": 1.2314145099923387e-05, + "loss": 1.2126, + "step": 66600 + }, + { + "epoch": 2.3359161349134, + "grad_norm": 3.5512471199035645, + "learning_rate": 1.2297913230921061e-05, + "loss": 1.4138, + "step": 66625 + }, + { + "epoch": 2.336792651286726, + "grad_norm": 3.0088722705841064, + "learning_rate": 1.2281681361918738e-05, + "loss": 1.4367, + "step": 66650 + }, + { + "epoch": 2.337669167660052, + "grad_norm": 5.246761798858643, + "learning_rate": 1.2265449492916412e-05, + "loss": 1.3053, + "step": 66675 + }, + { + "epoch": 2.338545684033378, + "grad_norm": 4.059486389160156, + "learning_rate": 1.2249217623914089e-05, + "loss": 1.2769, + "step": 66700 + }, + { + "epoch": 2.3394222004067036, + "grad_norm": 5.219972610473633, + "learning_rate": 1.2232985754911765e-05, + "loss": 1.5008, + "step": 66725 + }, + { + "epoch": 2.3402987167800293, + "grad_norm": 5.141010284423828, + "learning_rate": 1.221675388590944e-05, + "loss": 1.169, + "step": 66750 + }, + { + "epoch": 2.341175233153355, + "grad_norm": 5.214272975921631, + "learning_rate": 1.2200522016907114e-05, + "loss": 1.4271, + "step": 66775 + }, + { + "epoch": 2.3420517495266813, + "grad_norm": 7.851667404174805, + "learning_rate": 1.218429014790479e-05, + "loss": 1.3818, + "step": 66800 + }, + { + "epoch": 2.342928265900007, + "grad_norm": 4.76398229598999, + "learning_rate": 1.2168058278902467e-05, + "loss": 1.2085, + "step": 66825 + }, + { + "epoch": 2.3438047822733328, + "grad_norm": 6.248112201690674, + "learning_rate": 1.2151826409900142e-05, + "loss": 1.265, + "step": 66850 + }, + { + "epoch": 2.3446812986466585, + "grad_norm": 6.2776899337768555, + "learning_rate": 1.2135594540897818e-05, + "loss": 1.507, + "step": 66875 + }, + { + "epoch": 2.3455578150199847, + "grad_norm": 7.714486598968506, + "learning_rate": 1.2119362671895493e-05, 
+ "loss": 1.4698, + "step": 66900 + }, + { + "epoch": 2.3464343313933105, + "grad_norm": 6.639073371887207, + "learning_rate": 1.2103130802893169e-05, + "loss": 1.5053, + "step": 66925 + }, + { + "epoch": 2.3473108477666362, + "grad_norm": 5.089391708374023, + "learning_rate": 1.2086898933890844e-05, + "loss": 1.3339, + "step": 66950 + }, + { + "epoch": 2.348187364139962, + "grad_norm": 6.181281566619873, + "learning_rate": 1.207066706488852e-05, + "loss": 1.4314, + "step": 66975 + }, + { + "epoch": 2.349063880513288, + "grad_norm": 4.252866744995117, + "learning_rate": 1.2054435195886196e-05, + "loss": 1.2223, + "step": 67000 + }, + { + "epoch": 2.349940396886614, + "grad_norm": 3.8136186599731445, + "learning_rate": 1.2038203326883871e-05, + "loss": 1.5342, + "step": 67025 + }, + { + "epoch": 2.3508169132599397, + "grad_norm": 0.3348371088504791, + "learning_rate": 1.2021971457881546e-05, + "loss": 1.3498, + "step": 67050 + }, + { + "epoch": 2.3516934296332654, + "grad_norm": 6.191456317901611, + "learning_rate": 1.2005739588879222e-05, + "loss": 1.3251, + "step": 67075 + }, + { + "epoch": 2.352569946006591, + "grad_norm": 5.127946376800537, + "learning_rate": 1.1989507719876898e-05, + "loss": 1.369, + "step": 67100 + }, + { + "epoch": 2.3534464623799174, + "grad_norm": 7.636602878570557, + "learning_rate": 1.1973275850874575e-05, + "loss": 1.2727, + "step": 67125 + }, + { + "epoch": 2.354322978753243, + "grad_norm": 3.425740957260132, + "learning_rate": 1.195704398187225e-05, + "loss": 1.264, + "step": 67150 + }, + { + "epoch": 2.355199495126569, + "grad_norm": 8.71462345123291, + "learning_rate": 1.1940812112869924e-05, + "loss": 1.27, + "step": 67175 + }, + { + "epoch": 2.3560760114998947, + "grad_norm": 5.049646854400635, + "learning_rate": 1.19245802438676e-05, + "loss": 1.316, + "step": 67200 + }, + { + "epoch": 2.356952527873221, + "grad_norm": 3.5590717792510986, + "learning_rate": 1.1908348374865277e-05, + "loss": 1.396, + "step": 67225 + }, + { + "epoch": 2.3578290442465466, + "grad_norm": 6.117215633392334, + "learning_rate": 1.1892116505862951e-05, + "loss": 1.2454, + "step": 67250 + }, + { + "epoch": 2.3587055606198724, + "grad_norm": 3.2125706672668457, + "learning_rate": 1.1875884636860628e-05, + "loss": 1.3358, + "step": 67275 + }, + { + "epoch": 2.359582076993198, + "grad_norm": 6.520144939422607, + "learning_rate": 1.1859652767858302e-05, + "loss": 1.4485, + "step": 67300 + }, + { + "epoch": 2.3604585933665243, + "grad_norm": 5.095522880554199, + "learning_rate": 1.1843420898855979e-05, + "loss": 1.5557, + "step": 67325 + }, + { + "epoch": 2.36133510973985, + "grad_norm": 5.485278606414795, + "learning_rate": 1.1827189029853653e-05, + "loss": 1.4017, + "step": 67350 + }, + { + "epoch": 2.362211626113176, + "grad_norm": 3.431875228881836, + "learning_rate": 1.181095716085133e-05, + "loss": 1.2363, + "step": 67375 + }, + { + "epoch": 2.3630881424865016, + "grad_norm": 5.415625095367432, + "learning_rate": 1.1794725291849006e-05, + "loss": 1.4984, + "step": 67400 + }, + { + "epoch": 2.3639646588598273, + "grad_norm": 8.294529914855957, + "learning_rate": 1.177849342284668e-05, + "loss": 1.1948, + "step": 67425 + }, + { + "epoch": 2.3648411752331535, + "grad_norm": 6.95084810256958, + "learning_rate": 1.1762261553844355e-05, + "loss": 1.2955, + "step": 67450 + }, + { + "epoch": 2.3657176916064793, + "grad_norm": 12.713452339172363, + "learning_rate": 1.1746029684842032e-05, + "loss": 1.4774, + "step": 67475 + }, + { + "epoch": 2.366594207979805, + "grad_norm": 
14.597761154174805, + "learning_rate": 1.1729797815839708e-05, + "loss": 1.6036, + "step": 67500 + }, + { + "epoch": 2.3674707243531308, + "grad_norm": 7.804803848266602, + "learning_rate": 1.1713565946837384e-05, + "loss": 1.2269, + "step": 67525 + }, + { + "epoch": 2.3683472407264565, + "grad_norm": 6.2915754318237305, + "learning_rate": 1.1697334077835059e-05, + "loss": 1.2376, + "step": 67550 + }, + { + "epoch": 2.3692237570997827, + "grad_norm": 4.893512725830078, + "learning_rate": 1.1681102208832734e-05, + "loss": 1.6003, + "step": 67575 + }, + { + "epoch": 2.3701002734731085, + "grad_norm": 9.638872146606445, + "learning_rate": 1.166487033983041e-05, + "loss": 1.2465, + "step": 67600 + }, + { + "epoch": 2.3709767898464342, + "grad_norm": 6.564438819885254, + "learning_rate": 1.1648638470828086e-05, + "loss": 1.2292, + "step": 67625 + }, + { + "epoch": 2.3718533062197604, + "grad_norm": 6.55390739440918, + "learning_rate": 1.1632406601825761e-05, + "loss": 1.3807, + "step": 67650 + }, + { + "epoch": 2.372729822593086, + "grad_norm": 19.753761291503906, + "learning_rate": 1.1616174732823437e-05, + "loss": 1.5243, + "step": 67675 + }, + { + "epoch": 2.373606338966412, + "grad_norm": 5.744351863861084, + "learning_rate": 1.1599942863821114e-05, + "loss": 1.349, + "step": 67700 + }, + { + "epoch": 2.3744828553397377, + "grad_norm": 10.720484733581543, + "learning_rate": 1.1583710994818788e-05, + "loss": 1.3652, + "step": 67725 + }, + { + "epoch": 2.3753593717130634, + "grad_norm": 7.839745998382568, + "learning_rate": 1.1567479125816463e-05, + "loss": 1.2857, + "step": 67750 + }, + { + "epoch": 2.3762358880863896, + "grad_norm": 7.6176276206970215, + "learning_rate": 1.155124725681414e-05, + "loss": 1.289, + "step": 67775 + }, + { + "epoch": 2.3771124044597154, + "grad_norm": 5.830434799194336, + "learning_rate": 1.1535015387811816e-05, + "loss": 1.1162, + "step": 67800 + }, + { + "epoch": 2.377988920833041, + "grad_norm": 5.353158473968506, + "learning_rate": 1.151878351880949e-05, + "loss": 1.4974, + "step": 67825 + }, + { + "epoch": 2.378865437206367, + "grad_norm": 6.223961353302002, + "learning_rate": 1.1502551649807165e-05, + "loss": 1.4721, + "step": 67850 + }, + { + "epoch": 2.3797419535796926, + "grad_norm": 3.869769811630249, + "learning_rate": 1.1486319780804841e-05, + "loss": 1.2941, + "step": 67875 + }, + { + "epoch": 2.380618469953019, + "grad_norm": 9.043240547180176, + "learning_rate": 1.1470087911802518e-05, + "loss": 1.3491, + "step": 67900 + }, + { + "epoch": 2.3814949863263446, + "grad_norm": 15.45620346069336, + "learning_rate": 1.1453856042800192e-05, + "loss": 1.4145, + "step": 67925 + }, + { + "epoch": 2.3823715026996704, + "grad_norm": 4.688398361206055, + "learning_rate": 1.1437624173797869e-05, + "loss": 1.0981, + "step": 67950 + }, + { + "epoch": 2.383248019072996, + "grad_norm": 17.60409927368164, + "learning_rate": 1.1421392304795545e-05, + "loss": 1.4341, + "step": 67975 + }, + { + "epoch": 2.3841245354463223, + "grad_norm": 3.730234384536743, + "learning_rate": 1.140516043579322e-05, + "loss": 1.3659, + "step": 68000 + }, + { + "epoch": 2.385001051819648, + "grad_norm": 5.758042812347412, + "learning_rate": 1.1388928566790894e-05, + "loss": 1.0819, + "step": 68025 + }, + { + "epoch": 2.385877568192974, + "grad_norm": 5.108676910400391, + "learning_rate": 1.137269669778857e-05, + "loss": 1.2622, + "step": 68050 + }, + { + "epoch": 2.3867540845662996, + "grad_norm": 6.270639896392822, + "learning_rate": 1.1356464828786247e-05, + "loss": 1.5273, + "step": 
68075 + }, + { + "epoch": 2.3876306009396258, + "grad_norm": 3.8803985118865967, + "learning_rate": 1.1340232959783923e-05, + "loss": 1.4824, + "step": 68100 + }, + { + "epoch": 2.3885071173129515, + "grad_norm": 5.450827121734619, + "learning_rate": 1.1324001090781596e-05, + "loss": 1.4588, + "step": 68125 + }, + { + "epoch": 2.3893836336862773, + "grad_norm": 9.719725608825684, + "learning_rate": 1.1307769221779272e-05, + "loss": 1.2743, + "step": 68150 + }, + { + "epoch": 2.390260150059603, + "grad_norm": 3.902130126953125, + "learning_rate": 1.1291537352776949e-05, + "loss": 1.3505, + "step": 68175 + }, + { + "epoch": 2.3911366664329288, + "grad_norm": 5.322880268096924, + "learning_rate": 1.1275305483774625e-05, + "loss": 1.4635, + "step": 68200 + }, + { + "epoch": 2.392013182806255, + "grad_norm": 11.213282585144043, + "learning_rate": 1.12590736147723e-05, + "loss": 1.1503, + "step": 68225 + }, + { + "epoch": 2.3928896991795807, + "grad_norm": 7.674829006195068, + "learning_rate": 1.1242841745769976e-05, + "loss": 1.452, + "step": 68250 + }, + { + "epoch": 2.3937662155529065, + "grad_norm": 5.447293281555176, + "learning_rate": 1.122660987676765e-05, + "loss": 1.2225, + "step": 68275 + }, + { + "epoch": 2.3946427319262322, + "grad_norm": 6.576241970062256, + "learning_rate": 1.1210378007765327e-05, + "loss": 1.3598, + "step": 68300 + }, + { + "epoch": 2.3955192482995584, + "grad_norm": 10.302413940429688, + "learning_rate": 1.1194146138763002e-05, + "loss": 1.3756, + "step": 68325 + }, + { + "epoch": 2.396395764672884, + "grad_norm": 5.237799644470215, + "learning_rate": 1.1177914269760678e-05, + "loss": 1.4599, + "step": 68350 + }, + { + "epoch": 2.39727228104621, + "grad_norm": 5.441614151000977, + "learning_rate": 1.1161682400758354e-05, + "loss": 1.3786, + "step": 68375 + }, + { + "epoch": 2.3981487974195357, + "grad_norm": 4.767749786376953, + "learning_rate": 1.1145450531756029e-05, + "loss": 1.2437, + "step": 68400 + }, + { + "epoch": 2.399025313792862, + "grad_norm": 4.06847620010376, + "learning_rate": 1.1129218662753704e-05, + "loss": 1.229, + "step": 68425 + }, + { + "epoch": 2.3999018301661876, + "grad_norm": 7.522512912750244, + "learning_rate": 1.111298679375138e-05, + "loss": 1.3946, + "step": 68450 + }, + { + "epoch": 2.4007783465395134, + "grad_norm": 8.218101501464844, + "learning_rate": 1.1096754924749056e-05, + "loss": 1.3987, + "step": 68475 + }, + { + "epoch": 2.401654862912839, + "grad_norm": 8.022956848144531, + "learning_rate": 1.1080523055746731e-05, + "loss": 1.351, + "step": 68500 + }, + { + "epoch": 2.402531379286165, + "grad_norm": 7.761036396026611, + "learning_rate": 1.1064291186744407e-05, + "loss": 1.4522, + "step": 68525 + }, + { + "epoch": 2.403407895659491, + "grad_norm": 5.859753131866455, + "learning_rate": 1.1048059317742082e-05, + "loss": 1.3047, + "step": 68550 + }, + { + "epoch": 2.404284412032817, + "grad_norm": 8.199419021606445, + "learning_rate": 1.1031827448739758e-05, + "loss": 1.5411, + "step": 68575 + }, + { + "epoch": 2.4051609284061426, + "grad_norm": 3.6655709743499756, + "learning_rate": 1.1015595579737433e-05, + "loss": 1.244, + "step": 68600 + }, + { + "epoch": 2.4060374447794683, + "grad_norm": 5.963811874389648, + "learning_rate": 1.099936371073511e-05, + "loss": 1.5508, + "step": 68625 + }, + { + "epoch": 2.406913961152794, + "grad_norm": 9.749382972717285, + "learning_rate": 1.0983131841732786e-05, + "loss": 1.4934, + "step": 68650 + }, + { + "epoch": 2.4077904775261203, + "grad_norm": 6.059975624084473, + "learning_rate": 
1.096689997273046e-05, + "loss": 1.3325, + "step": 68675 + }, + { + "epoch": 2.408666993899446, + "grad_norm": 12.585299491882324, + "learning_rate": 1.0950668103728137e-05, + "loss": 1.3439, + "step": 68700 + }, + { + "epoch": 2.409543510272772, + "grad_norm": 5.510673522949219, + "learning_rate": 1.0934436234725811e-05, + "loss": 1.4836, + "step": 68725 + }, + { + "epoch": 2.4104200266460976, + "grad_norm": 4.810507774353027, + "learning_rate": 1.0918204365723488e-05, + "loss": 1.2463, + "step": 68750 + }, + { + "epoch": 2.4112965430194238, + "grad_norm": 6.378121376037598, + "learning_rate": 1.0901972496721164e-05, + "loss": 1.5456, + "step": 68775 + }, + { + "epoch": 2.4121730593927495, + "grad_norm": 0.1079462319612503, + "learning_rate": 1.0885740627718839e-05, + "loss": 1.1993, + "step": 68800 + }, + { + "epoch": 2.4130495757660753, + "grad_norm": 4.412527561187744, + "learning_rate": 1.0869508758716513e-05, + "loss": 1.4499, + "step": 68825 + }, + { + "epoch": 2.413926092139401, + "grad_norm": 6.493129253387451, + "learning_rate": 1.085327688971419e-05, + "loss": 1.5954, + "step": 68850 + }, + { + "epoch": 2.414802608512727, + "grad_norm": 12.93430233001709, + "learning_rate": 1.0837045020711866e-05, + "loss": 1.5468, + "step": 68875 + }, + { + "epoch": 2.415679124886053, + "grad_norm": 8.088399887084961, + "learning_rate": 1.082081315170954e-05, + "loss": 1.397, + "step": 68900 + }, + { + "epoch": 2.4165556412593787, + "grad_norm": 3.7516894340515137, + "learning_rate": 1.0804581282707217e-05, + "loss": 1.2605, + "step": 68925 + }, + { + "epoch": 2.4174321576327045, + "grad_norm": 8.067241668701172, + "learning_rate": 1.0788349413704892e-05, + "loss": 1.4247, + "step": 68950 + }, + { + "epoch": 2.4183086740060302, + "grad_norm": 10.616955757141113, + "learning_rate": 1.0772117544702568e-05, + "loss": 1.4214, + "step": 68975 + }, + { + "epoch": 2.4191851903793564, + "grad_norm": 3.6030430793762207, + "learning_rate": 1.0755885675700243e-05, + "loss": 1.3034, + "step": 69000 + }, + { + "epoch": 2.420061706752682, + "grad_norm": 4.475828170776367, + "learning_rate": 1.0739653806697919e-05, + "loss": 1.2718, + "step": 69025 + }, + { + "epoch": 2.420938223126008, + "grad_norm": 4.648390293121338, + "learning_rate": 1.0723421937695595e-05, + "loss": 1.3789, + "step": 69050 + }, + { + "epoch": 2.4218147394993337, + "grad_norm": 5.53602933883667, + "learning_rate": 1.0707190068693272e-05, + "loss": 1.2574, + "step": 69075 + }, + { + "epoch": 2.42269125587266, + "grad_norm": 10.657238960266113, + "learning_rate": 1.0690958199690945e-05, + "loss": 1.6151, + "step": 69100 + }, + { + "epoch": 2.4235677722459856, + "grad_norm": 12.784908294677734, + "learning_rate": 1.0674726330688621e-05, + "loss": 1.2609, + "step": 69125 + }, + { + "epoch": 2.4244442886193114, + "grad_norm": 5.4270853996276855, + "learning_rate": 1.0658494461686297e-05, + "loss": 1.2237, + "step": 69150 + }, + { + "epoch": 2.425320804992637, + "grad_norm": 7.120392799377441, + "learning_rate": 1.0642262592683974e-05, + "loss": 1.2568, + "step": 69175 + }, + { + "epoch": 2.4261973213659633, + "grad_norm": 5.707665920257568, + "learning_rate": 1.0626030723681648e-05, + "loss": 1.4036, + "step": 69200 + }, + { + "epoch": 2.427073837739289, + "grad_norm": 5.330784797668457, + "learning_rate": 1.0609798854679323e-05, + "loss": 1.7774, + "step": 69225 + }, + { + "epoch": 2.427950354112615, + "grad_norm": 10.050646781921387, + "learning_rate": 1.0593566985677e-05, + "loss": 1.659, + "step": 69250 + }, + { + "epoch": 
2.4288268704859406, + "grad_norm": 6.590310573577881, + "learning_rate": 1.0577335116674676e-05, + "loss": 1.5512, + "step": 69275 + }, + { + "epoch": 2.4297033868592663, + "grad_norm": 7.697371959686279, + "learning_rate": 1.056110324767235e-05, + "loss": 1.5168, + "step": 69300 + }, + { + "epoch": 2.4305799032325925, + "grad_norm": 10.983071327209473, + "learning_rate": 1.0544871378670027e-05, + "loss": 1.4443, + "step": 69325 + }, + { + "epoch": 2.4314564196059183, + "grad_norm": 5.53188943862915, + "learning_rate": 1.0528639509667703e-05, + "loss": 1.4331, + "step": 69350 + }, + { + "epoch": 2.432332935979244, + "grad_norm": 6.846372127532959, + "learning_rate": 1.0512407640665378e-05, + "loss": 1.3743, + "step": 69375 + }, + { + "epoch": 2.43320945235257, + "grad_norm": 7.041776657104492, + "learning_rate": 1.0496175771663052e-05, + "loss": 1.5299, + "step": 69400 + }, + { + "epoch": 2.4340859687258956, + "grad_norm": 6.105144023895264, + "learning_rate": 1.0479943902660729e-05, + "loss": 1.2891, + "step": 69425 + }, + { + "epoch": 2.4349624850992218, + "grad_norm": 5.625642776489258, + "learning_rate": 1.0463712033658405e-05, + "loss": 1.3624, + "step": 69450 + }, + { + "epoch": 2.4358390014725475, + "grad_norm": 8.459287643432617, + "learning_rate": 1.044748016465608e-05, + "loss": 1.524, + "step": 69475 + }, + { + "epoch": 2.4367155178458733, + "grad_norm": 8.358732223510742, + "learning_rate": 1.0431248295653754e-05, + "loss": 1.2738, + "step": 69500 + }, + { + "epoch": 2.4375920342191995, + "grad_norm": 3.5559542179107666, + "learning_rate": 1.041501642665143e-05, + "loss": 1.2854, + "step": 69525 + }, + { + "epoch": 2.438468550592525, + "grad_norm": 11.328535079956055, + "learning_rate": 1.0398784557649107e-05, + "loss": 1.2894, + "step": 69550 + }, + { + "epoch": 2.439345066965851, + "grad_norm": 8.327231407165527, + "learning_rate": 1.0382552688646781e-05, + "loss": 1.3767, + "step": 69575 + }, + { + "epoch": 2.4402215833391767, + "grad_norm": 13.856952667236328, + "learning_rate": 1.0366320819644458e-05, + "loss": 1.4239, + "step": 69600 + }, + { + "epoch": 2.4410980997125025, + "grad_norm": 0.39048582315444946, + "learning_rate": 1.0350088950642134e-05, + "loss": 1.3945, + "step": 69625 + }, + { + "epoch": 2.4419746160858287, + "grad_norm": 4.744585990905762, + "learning_rate": 1.0333857081639809e-05, + "loss": 1.1111, + "step": 69650 + }, + { + "epoch": 2.4428511324591544, + "grad_norm": 9.59121322631836, + "learning_rate": 1.0317625212637483e-05, + "loss": 1.5399, + "step": 69675 + }, + { + "epoch": 2.44372764883248, + "grad_norm": 4.546262264251709, + "learning_rate": 1.030139334363516e-05, + "loss": 1.3382, + "step": 69700 + }, + { + "epoch": 2.444604165205806, + "grad_norm": 5.815179824829102, + "learning_rate": 1.0285161474632836e-05, + "loss": 1.3412, + "step": 69725 + }, + { + "epoch": 2.4454806815791317, + "grad_norm": 5.249732971191406, + "learning_rate": 1.0268929605630512e-05, + "loss": 1.6306, + "step": 69750 + }, + { + "epoch": 2.446357197952458, + "grad_norm": 10.636207580566406, + "learning_rate": 1.0252697736628185e-05, + "loss": 1.32, + "step": 69775 + }, + { + "epoch": 2.4472337143257836, + "grad_norm": 4.425610542297363, + "learning_rate": 1.0236465867625862e-05, + "loss": 1.6169, + "step": 69800 + }, + { + "epoch": 2.4481102306991094, + "grad_norm": 5.571956157684326, + "learning_rate": 1.0220233998623538e-05, + "loss": 1.4644, + "step": 69825 + }, + { + "epoch": 2.448986747072435, + "grad_norm": 5.285646438598633, + "learning_rate": 
1.0204002129621214e-05, + "loss": 1.4868, + "step": 69850 + }, + { + "epoch": 2.4498632634457613, + "grad_norm": 10.48544979095459, + "learning_rate": 1.0187770260618889e-05, + "loss": 1.2492, + "step": 69875 + }, + { + "epoch": 2.450739779819087, + "grad_norm": 4.978031635284424, + "learning_rate": 1.0171538391616565e-05, + "loss": 1.2969, + "step": 69900 + }, + { + "epoch": 2.451616296192413, + "grad_norm": 4.813091278076172, + "learning_rate": 1.015530652261424e-05, + "loss": 1.5551, + "step": 69925 + }, + { + "epoch": 2.4524928125657386, + "grad_norm": 5.444601535797119, + "learning_rate": 1.0139074653611916e-05, + "loss": 1.2313, + "step": 69950 + }, + { + "epoch": 2.453369328939065, + "grad_norm": 5.607186317443848, + "learning_rate": 1.0122842784609591e-05, + "loss": 1.429, + "step": 69975 + }, + { + "epoch": 2.4542458453123905, + "grad_norm": 4.9828596115112305, + "learning_rate": 1.0106610915607267e-05, + "loss": 1.1876, + "step": 70000 + }, + { + "epoch": 2.4551223616857163, + "grad_norm": 5.385584354400635, + "learning_rate": 1.0090379046604944e-05, + "loss": 1.3981, + "step": 70025 + }, + { + "epoch": 2.455998878059042, + "grad_norm": 10.007575035095215, + "learning_rate": 1.0074147177602618e-05, + "loss": 1.3336, + "step": 70050 + }, + { + "epoch": 2.456875394432368, + "grad_norm": 5.743343830108643, + "learning_rate": 1.0057915308600293e-05, + "loss": 1.4479, + "step": 70075 + }, + { + "epoch": 2.457751910805694, + "grad_norm": 11.081926345825195, + "learning_rate": 1.004168343959797e-05, + "loss": 1.2387, + "step": 70100 + }, + { + "epoch": 2.4586284271790197, + "grad_norm": 6.275691986083984, + "learning_rate": 1.0025451570595646e-05, + "loss": 1.2709, + "step": 70125 + }, + { + "epoch": 2.4595049435523455, + "grad_norm": 6.006302833557129, + "learning_rate": 1.000921970159332e-05, + "loss": 1.426, + "step": 70150 + }, + { + "epoch": 2.4603814599256713, + "grad_norm": 5.228515148162842, + "learning_rate": 9.992987832590997e-06, + "loss": 1.6584, + "step": 70175 + }, + { + "epoch": 2.4612579762989975, + "grad_norm": 14.154410362243652, + "learning_rate": 9.976755963588671e-06, + "loss": 1.3544, + "step": 70200 + }, + { + "epoch": 2.462134492672323, + "grad_norm": 6.904314994812012, + "learning_rate": 9.960524094586348e-06, + "loss": 1.5085, + "step": 70225 + }, + { + "epoch": 2.463011009045649, + "grad_norm": 5.216091632843018, + "learning_rate": 9.944292225584024e-06, + "loss": 1.2704, + "step": 70250 + }, + { + "epoch": 2.4638875254189747, + "grad_norm": 9.950838088989258, + "learning_rate": 9.928060356581699e-06, + "loss": 1.4352, + "step": 70275 + }, + { + "epoch": 2.464764041792301, + "grad_norm": 3.9176886081695557, + "learning_rate": 9.911828487579375e-06, + "loss": 1.4426, + "step": 70300 + }, + { + "epoch": 2.4656405581656267, + "grad_norm": 5.807528018951416, + "learning_rate": 9.89559661857705e-06, + "loss": 1.5131, + "step": 70325 + }, + { + "epoch": 2.4665170745389524, + "grad_norm": 6.817758083343506, + "learning_rate": 9.879364749574726e-06, + "loss": 1.4219, + "step": 70350 + }, + { + "epoch": 2.467393590912278, + "grad_norm": 4.802738666534424, + "learning_rate": 9.8631328805724e-06, + "loss": 1.1961, + "step": 70375 + }, + { + "epoch": 2.468270107285604, + "grad_norm": 10.23182201385498, + "learning_rate": 9.846901011570077e-06, + "loss": 1.4329, + "step": 70400 + }, + { + "epoch": 2.46914662365893, + "grad_norm": 7.265708923339844, + "learning_rate": 9.830669142567753e-06, + "loss": 1.2623, + "step": 70425 + }, + { + "epoch": 2.470023140032256, + 
"grad_norm": 18.45274543762207, + "learning_rate": 9.814437273565428e-06, + "loss": 1.3851, + "step": 70450 + }, + { + "epoch": 2.4708996564055816, + "grad_norm": 8.482441902160645, + "learning_rate": 9.798205404563103e-06, + "loss": 1.4368, + "step": 70475 + }, + { + "epoch": 2.4717761727789074, + "grad_norm": 8.700657844543457, + "learning_rate": 9.781973535560779e-06, + "loss": 1.5407, + "step": 70500 + }, + { + "epoch": 2.472652689152233, + "grad_norm": 7.514232635498047, + "learning_rate": 9.765741666558455e-06, + "loss": 1.4234, + "step": 70525 + }, + { + "epoch": 2.4735292055255593, + "grad_norm": 9.35762882232666, + "learning_rate": 9.74950979755613e-06, + "loss": 1.8144, + "step": 70550 + }, + { + "epoch": 2.474405721898885, + "grad_norm": 9.915963172912598, + "learning_rate": 9.733277928553806e-06, + "loss": 1.5248, + "step": 70575 + }, + { + "epoch": 2.475282238272211, + "grad_norm": 5.961577892303467, + "learning_rate": 9.717046059551481e-06, + "loss": 1.713, + "step": 70600 + }, + { + "epoch": 2.4761587546455366, + "grad_norm": 5.170897960662842, + "learning_rate": 9.700814190549157e-06, + "loss": 1.587, + "step": 70625 + }, + { + "epoch": 2.477035271018863, + "grad_norm": 3.5307393074035645, + "learning_rate": 9.684582321546832e-06, + "loss": 1.6887, + "step": 70650 + }, + { + "epoch": 2.4779117873921885, + "grad_norm": 4.7386474609375, + "learning_rate": 9.668350452544508e-06, + "loss": 1.446, + "step": 70675 + }, + { + "epoch": 2.4787883037655143, + "grad_norm": 5.714728832244873, + "learning_rate": 9.652118583542185e-06, + "loss": 1.4055, + "step": 70700 + }, + { + "epoch": 2.47966482013884, + "grad_norm": 6.416653633117676, + "learning_rate": 9.63588671453986e-06, + "loss": 1.4051, + "step": 70725 + }, + { + "epoch": 2.4805413365121662, + "grad_norm": 4.722318172454834, + "learning_rate": 9.619654845537534e-06, + "loss": 1.3477, + "step": 70750 + }, + { + "epoch": 2.481417852885492, + "grad_norm": 9.822413444519043, + "learning_rate": 9.60342297653521e-06, + "loss": 1.3958, + "step": 70775 + }, + { + "epoch": 2.4822943692588177, + "grad_norm": 3.4541211128234863, + "learning_rate": 9.587191107532887e-06, + "loss": 1.41, + "step": 70800 + }, + { + "epoch": 2.4831708856321435, + "grad_norm": 11.967958450317383, + "learning_rate": 9.570959238530563e-06, + "loss": 1.0918, + "step": 70825 + }, + { + "epoch": 2.4840474020054693, + "grad_norm": 8.711935043334961, + "learning_rate": 9.554727369528238e-06, + "loss": 1.423, + "step": 70850 + }, + { + "epoch": 2.4849239183787954, + "grad_norm": 9.730420112609863, + "learning_rate": 9.538495500525912e-06, + "loss": 1.2093, + "step": 70875 + }, + { + "epoch": 2.485800434752121, + "grad_norm": 5.395915508270264, + "learning_rate": 9.522263631523589e-06, + "loss": 1.4026, + "step": 70900 + }, + { + "epoch": 2.486676951125447, + "grad_norm": 4.823686599731445, + "learning_rate": 9.506031762521265e-06, + "loss": 1.5856, + "step": 70925 + }, + { + "epoch": 2.4875534674987727, + "grad_norm": 11.376557350158691, + "learning_rate": 9.48979989351894e-06, + "loss": 1.3565, + "step": 70950 + }, + { + "epoch": 2.488429983872099, + "grad_norm": 5.467879295349121, + "learning_rate": 9.473568024516616e-06, + "loss": 1.5825, + "step": 70975 + }, + { + "epoch": 2.4893065002454247, + "grad_norm": 5.279167175292969, + "learning_rate": 9.45733615551429e-06, + "loss": 1.4391, + "step": 71000 + }, + { + "epoch": 2.4901830166187504, + "grad_norm": 12.771502494812012, + "learning_rate": 9.441104286511967e-06, + "loss": 1.3838, + "step": 71025 + }, + { + 
"epoch": 2.491059532992076, + "grad_norm": 10.200343132019043, + "learning_rate": 9.424872417509641e-06, + "loss": 1.4121, + "step": 71050 + }, + { + "epoch": 2.4919360493654024, + "grad_norm": 11.834638595581055, + "learning_rate": 9.408640548507318e-06, + "loss": 1.4912, + "step": 71075 + }, + { + "epoch": 2.492812565738728, + "grad_norm": 10.702736854553223, + "learning_rate": 9.392408679504994e-06, + "loss": 1.4444, + "step": 71100 + }, + { + "epoch": 2.493689082112054, + "grad_norm": 5.230205059051514, + "learning_rate": 9.376176810502669e-06, + "loss": 1.5649, + "step": 71125 + }, + { + "epoch": 2.4945655984853796, + "grad_norm": 3.3453924655914307, + "learning_rate": 9.359944941500343e-06, + "loss": 1.2737, + "step": 71150 + }, + { + "epoch": 2.4954421148587054, + "grad_norm": 3.2534615993499756, + "learning_rate": 9.34371307249802e-06, + "loss": 1.2116, + "step": 71175 + }, + { + "epoch": 2.4963186312320316, + "grad_norm": 6.174441337585449, + "learning_rate": 9.327481203495696e-06, + "loss": 1.2755, + "step": 71200 + }, + { + "epoch": 2.4971951476053573, + "grad_norm": 6.518664836883545, + "learning_rate": 9.31124933449337e-06, + "loss": 1.3794, + "step": 71225 + }, + { + "epoch": 2.498071663978683, + "grad_norm": 4.957963466644287, + "learning_rate": 9.295017465491047e-06, + "loss": 1.663, + "step": 71250 + }, + { + "epoch": 2.498948180352009, + "grad_norm": 4.674337863922119, + "learning_rate": 9.278785596488722e-06, + "loss": 1.2952, + "step": 71275 + }, + { + "epoch": 2.4998246967253346, + "grad_norm": 8.47387409210205, + "learning_rate": 9.262553727486398e-06, + "loss": 1.1899, + "step": 71300 + }, + { + "epoch": 2.500701213098661, + "grad_norm": 5.968294143676758, + "learning_rate": 9.246321858484073e-06, + "loss": 1.188, + "step": 71325 + }, + { + "epoch": 2.5015777294719865, + "grad_norm": 10.668359756469727, + "learning_rate": 9.230089989481749e-06, + "loss": 1.2379, + "step": 71350 + }, + { + "epoch": 2.5024542458453123, + "grad_norm": 10.559553146362305, + "learning_rate": 9.213858120479425e-06, + "loss": 1.3363, + "step": 71375 + }, + { + "epoch": 2.5033307622186385, + "grad_norm": 6.458956241607666, + "learning_rate": 9.197626251477102e-06, + "loss": 1.5016, + "step": 71400 + }, + { + "epoch": 2.5042072785919642, + "grad_norm": 15.261276245117188, + "learning_rate": 9.181394382474776e-06, + "loss": 1.3526, + "step": 71425 + }, + { + "epoch": 2.50508379496529, + "grad_norm": 6.02475643157959, + "learning_rate": 9.165162513472451e-06, + "loss": 1.3596, + "step": 71450 + }, + { + "epoch": 2.5059603113386157, + "grad_norm": 11.272513389587402, + "learning_rate": 9.148930644470127e-06, + "loss": 1.4846, + "step": 71475 + }, + { + "epoch": 2.5068368277119415, + "grad_norm": 4.351443767547607, + "learning_rate": 9.132698775467804e-06, + "loss": 1.3289, + "step": 71500 + }, + { + "epoch": 2.5077133440852677, + "grad_norm": 5.156081676483154, + "learning_rate": 9.116466906465478e-06, + "loss": 1.2947, + "step": 71525 + }, + { + "epoch": 2.5085898604585934, + "grad_norm": 6.222312927246094, + "learning_rate": 9.100235037463153e-06, + "loss": 1.4874, + "step": 71550 + }, + { + "epoch": 2.509466376831919, + "grad_norm": 8.88426399230957, + "learning_rate": 9.08400316846083e-06, + "loss": 1.2568, + "step": 71575 + }, + { + "epoch": 2.510342893205245, + "grad_norm": 4.125082969665527, + "learning_rate": 9.067771299458506e-06, + "loss": 1.2758, + "step": 71600 + }, + { + "epoch": 2.5112194095785707, + "grad_norm": 6.845038414001465, + "learning_rate": 9.05153943045618e-06, + "loss": 
1.3326, + "step": 71625 + }, + { + "epoch": 2.512095925951897, + "grad_norm": 8.04570484161377, + "learning_rate": 9.035307561453857e-06, + "loss": 1.3113, + "step": 71650 + }, + { + "epoch": 2.5129724423252227, + "grad_norm": 4.814210891723633, + "learning_rate": 9.019075692451533e-06, + "loss": 1.2989, + "step": 71675 + }, + { + "epoch": 2.5138489586985484, + "grad_norm": 11.167763710021973, + "learning_rate": 9.002843823449208e-06, + "loss": 1.3037, + "step": 71700 + }, + { + "epoch": 2.5147254750718746, + "grad_norm": 4.483511924743652, + "learning_rate": 8.986611954446882e-06, + "loss": 1.1465, + "step": 71725 + }, + { + "epoch": 2.5156019914452004, + "grad_norm": 6.508684158325195, + "learning_rate": 8.970380085444559e-06, + "loss": 1.5615, + "step": 71750 + }, + { + "epoch": 2.516478507818526, + "grad_norm": 7.059176921844482, + "learning_rate": 8.954148216442235e-06, + "loss": 1.4737, + "step": 71775 + }, + { + "epoch": 2.517355024191852, + "grad_norm": 5.442532539367676, + "learning_rate": 8.937916347439911e-06, + "loss": 1.4215, + "step": 71800 + }, + { + "epoch": 2.5182315405651776, + "grad_norm": 4.990203380584717, + "learning_rate": 8.921684478437584e-06, + "loss": 1.2816, + "step": 71825 + }, + { + "epoch": 2.519108056938504, + "grad_norm": 4.586921215057373, + "learning_rate": 8.90545260943526e-06, + "loss": 1.4027, + "step": 71850 + }, + { + "epoch": 2.5199845733118296, + "grad_norm": 6.29127836227417, + "learning_rate": 8.889220740432937e-06, + "loss": 1.1512, + "step": 71875 + }, + { + "epoch": 2.5208610896851553, + "grad_norm": 0.10335668921470642, + "learning_rate": 8.872988871430613e-06, + "loss": 1.2126, + "step": 71900 + }, + { + "epoch": 2.521737606058481, + "grad_norm": 5.757841110229492, + "learning_rate": 8.856757002428288e-06, + "loss": 1.2973, + "step": 71925 + }, + { + "epoch": 2.522614122431807, + "grad_norm": 5.248438358306885, + "learning_rate": 8.840525133425964e-06, + "loss": 1.5339, + "step": 71950 + }, + { + "epoch": 2.523490638805133, + "grad_norm": 4.748878479003906, + "learning_rate": 8.824293264423639e-06, + "loss": 1.5973, + "step": 71975 + }, + { + "epoch": 2.5243671551784588, + "grad_norm": 9.404908180236816, + "learning_rate": 8.808061395421315e-06, + "loss": 1.4316, + "step": 72000 + }, + { + "epoch": 2.5252436715517845, + "grad_norm": 9.20960807800293, + "learning_rate": 8.79182952641899e-06, + "loss": 1.3036, + "step": 72025 + }, + { + "epoch": 2.5261201879251103, + "grad_norm": 6.855129718780518, + "learning_rate": 8.775597657416666e-06, + "loss": 1.3802, + "step": 72050 + }, + { + "epoch": 2.526996704298436, + "grad_norm": 5.258065700531006, + "learning_rate": 8.759365788414343e-06, + "loss": 1.4503, + "step": 72075 + }, + { + "epoch": 2.5278732206717622, + "grad_norm": 7.741124629974365, + "learning_rate": 8.743133919412017e-06, + "loss": 1.2636, + "step": 72100 + }, + { + "epoch": 2.528749737045088, + "grad_norm": 10.79500675201416, + "learning_rate": 8.726902050409692e-06, + "loss": 1.7836, + "step": 72125 + }, + { + "epoch": 2.5296262534184137, + "grad_norm": 6.621405124664307, + "learning_rate": 8.710670181407368e-06, + "loss": 1.3416, + "step": 72150 + }, + { + "epoch": 2.53050276979174, + "grad_norm": 8.074844360351562, + "learning_rate": 8.694438312405045e-06, + "loss": 1.4049, + "step": 72175 + }, + { + "epoch": 2.5313792861650657, + "grad_norm": 5.183954238891602, + "learning_rate": 8.67820644340272e-06, + "loss": 1.2867, + "step": 72200 + }, + { + "epoch": 2.5322558025383914, + "grad_norm": 3.06833553314209, + "learning_rate": 
8.661974574400396e-06, + "loss": 1.1413, + "step": 72225 + }, + { + "epoch": 2.533132318911717, + "grad_norm": 4.712790012359619, + "learning_rate": 8.64574270539807e-06, + "loss": 1.5175, + "step": 72250 + }, + { + "epoch": 2.534008835285043, + "grad_norm": 8.348999977111816, + "learning_rate": 8.629510836395747e-06, + "loss": 1.2385, + "step": 72275 + }, + { + "epoch": 2.534885351658369, + "grad_norm": 4.872241973876953, + "learning_rate": 8.613278967393421e-06, + "loss": 1.4364, + "step": 72300 + }, + { + "epoch": 2.535761868031695, + "grad_norm": 6.138674736022949, + "learning_rate": 8.597047098391098e-06, + "loss": 1.5503, + "step": 72325 + }, + { + "epoch": 2.5366383844050207, + "grad_norm": 5.165558338165283, + "learning_rate": 8.580815229388774e-06, + "loss": 1.5029, + "step": 72350 + }, + { + "epoch": 2.5375149007783464, + "grad_norm": 5.93937873840332, + "learning_rate": 8.564583360386449e-06, + "loss": 1.2284, + "step": 72375 + }, + { + "epoch": 2.538391417151672, + "grad_norm": 5.1439528465271, + "learning_rate": 8.548351491384123e-06, + "loss": 1.3342, + "step": 72400 + }, + { + "epoch": 2.5392679335249984, + "grad_norm": 6.246133327484131, + "learning_rate": 8.5321196223818e-06, + "loss": 1.5619, + "step": 72425 + }, + { + "epoch": 2.540144449898324, + "grad_norm": 6.18265438079834, + "learning_rate": 8.515887753379476e-06, + "loss": 1.2359, + "step": 72450 + }, + { + "epoch": 2.54102096627165, + "grad_norm": 4.393498420715332, + "learning_rate": 8.499655884377152e-06, + "loss": 1.2709, + "step": 72475 + }, + { + "epoch": 2.541897482644976, + "grad_norm": 10.41742992401123, + "learning_rate": 8.483424015374827e-06, + "loss": 1.4636, + "step": 72500 + }, + { + "epoch": 2.542773999018302, + "grad_norm": 7.325887680053711, + "learning_rate": 8.467192146372501e-06, + "loss": 1.4284, + "step": 72525 + }, + { + "epoch": 2.5436505153916276, + "grad_norm": 7.314189910888672, + "learning_rate": 8.450960277370178e-06, + "loss": 1.3584, + "step": 72550 + }, + { + "epoch": 2.5445270317649533, + "grad_norm": 9.375946998596191, + "learning_rate": 8.434728408367854e-06, + "loss": 1.1669, + "step": 72575 + }, + { + "epoch": 2.545403548138279, + "grad_norm": 6.589229106903076, + "learning_rate": 8.418496539365529e-06, + "loss": 1.2954, + "step": 72600 + }, + { + "epoch": 2.5462800645116053, + "grad_norm": 6.948800086975098, + "learning_rate": 8.402264670363205e-06, + "loss": 1.324, + "step": 72625 + }, + { + "epoch": 2.547156580884931, + "grad_norm": 9.349984169006348, + "learning_rate": 8.38603280136088e-06, + "loss": 1.3339, + "step": 72650 + }, + { + "epoch": 2.5480330972582568, + "grad_norm": 7.084288120269775, + "learning_rate": 8.369800932358556e-06, + "loss": 1.437, + "step": 72675 + }, + { + "epoch": 2.5489096136315825, + "grad_norm": 4.706659317016602, + "learning_rate": 8.35356906335623e-06, + "loss": 1.4531, + "step": 72700 + }, + { + "epoch": 2.5497861300049083, + "grad_norm": 6.117202281951904, + "learning_rate": 8.337337194353907e-06, + "loss": 1.398, + "step": 72725 + }, + { + "epoch": 2.5506626463782345, + "grad_norm": 5.028677463531494, + "learning_rate": 8.321105325351583e-06, + "loss": 1.396, + "step": 72750 + }, + { + "epoch": 2.5515391627515602, + "grad_norm": 4.792998313903809, + "learning_rate": 8.304873456349258e-06, + "loss": 1.4821, + "step": 72775 + }, + { + "epoch": 2.552415679124886, + "grad_norm": 10.190898895263672, + "learning_rate": 8.288641587346933e-06, + "loss": 1.3942, + "step": 72800 + }, + { + "epoch": 2.553292195498212, + "grad_norm": 7.980866432189941, 
+ "learning_rate": 8.272409718344609e-06, + "loss": 1.2405, + "step": 72825 + }, + { + "epoch": 2.5541687118715375, + "grad_norm": 5.8557448387146, + "learning_rate": 8.256177849342285e-06, + "loss": 1.2846, + "step": 72850 + }, + { + "epoch": 2.5550452282448637, + "grad_norm": 8.189650535583496, + "learning_rate": 8.23994598033996e-06, + "loss": 1.5641, + "step": 72875 + }, + { + "epoch": 2.5559217446181894, + "grad_norm": 6.25494384765625, + "learning_rate": 8.223714111337636e-06, + "loss": 1.1902, + "step": 72900 + }, + { + "epoch": 2.556798260991515, + "grad_norm": 6.3482441902160645, + "learning_rate": 8.207482242335311e-06, + "loss": 1.3945, + "step": 72925 + }, + { + "epoch": 2.5576747773648414, + "grad_norm": 10.085095405578613, + "learning_rate": 8.191250373332987e-06, + "loss": 1.4547, + "step": 72950 + }, + { + "epoch": 2.558551293738167, + "grad_norm": 5.027144908905029, + "learning_rate": 8.175018504330664e-06, + "loss": 1.4522, + "step": 72975 + }, + { + "epoch": 2.559427810111493, + "grad_norm": 10.875364303588867, + "learning_rate": 8.158786635328338e-06, + "loss": 1.384, + "step": 73000 + }, + { + "epoch": 2.5603043264848186, + "grad_norm": 4.796974182128906, + "learning_rate": 8.142554766326015e-06, + "loss": 1.2256, + "step": 73025 + }, + { + "epoch": 2.5611808428581444, + "grad_norm": 0.09913503378629684, + "learning_rate": 8.126322897323691e-06, + "loss": 1.103, + "step": 73050 + }, + { + "epoch": 2.5620573592314706, + "grad_norm": 12.852964401245117, + "learning_rate": 8.110091028321366e-06, + "loss": 1.341, + "step": 73075 + }, + { + "epoch": 2.5629338756047964, + "grad_norm": 10.083839416503906, + "learning_rate": 8.09385915931904e-06, + "loss": 1.2724, + "step": 73100 + }, + { + "epoch": 2.563810391978122, + "grad_norm": 4.931946754455566, + "learning_rate": 8.077627290316717e-06, + "loss": 1.2509, + "step": 73125 + }, + { + "epoch": 2.564686908351448, + "grad_norm": 3.0727267265319824, + "learning_rate": 8.061395421314393e-06, + "loss": 1.2776, + "step": 73150 + }, + { + "epoch": 2.5655634247247736, + "grad_norm": 8.818220138549805, + "learning_rate": 8.045163552312068e-06, + "loss": 1.5177, + "step": 73175 + }, + { + "epoch": 2.5664399410981, + "grad_norm": 7.5944905281066895, + "learning_rate": 8.028931683309742e-06, + "loss": 1.2444, + "step": 73200 + }, + { + "epoch": 2.5673164574714256, + "grad_norm": 11.256046295166016, + "learning_rate": 8.012699814307419e-06, + "loss": 1.2327, + "step": 73225 + }, + { + "epoch": 2.5681929738447513, + "grad_norm": 6.569282054901123, + "learning_rate": 7.996467945305095e-06, + "loss": 1.2505, + "step": 73250 + }, + { + "epoch": 2.5690694902180775, + "grad_norm": 6.855592727661133, + "learning_rate": 7.98023607630277e-06, + "loss": 1.2241, + "step": 73275 + }, + { + "epoch": 2.5699460065914033, + "grad_norm": 11.464797019958496, + "learning_rate": 7.964004207300446e-06, + "loss": 1.4558, + "step": 73300 + }, + { + "epoch": 2.570822522964729, + "grad_norm": 4.544663906097412, + "learning_rate": 7.947772338298122e-06, + "loss": 1.3389, + "step": 73325 + }, + { + "epoch": 2.5716990393380548, + "grad_norm": 7.475327968597412, + "learning_rate": 7.931540469295797e-06, + "loss": 1.2653, + "step": 73350 + }, + { + "epoch": 2.5725755557113805, + "grad_norm": 6.91799259185791, + "learning_rate": 7.915308600293472e-06, + "loss": 1.1753, + "step": 73375 + }, + { + "epoch": 2.5734520720847067, + "grad_norm": 4.662882328033447, + "learning_rate": 7.899076731291148e-06, + "loss": 1.4136, + "step": 73400 + }, + { + "epoch": 
2.5743285884580325, + "grad_norm": 6.425024032592773, + "learning_rate": 7.882844862288824e-06, + "loss": 1.5092, + "step": 73425 + }, + { + "epoch": 2.5752051048313582, + "grad_norm": 0.08748676627874374, + "learning_rate": 7.8666129932865e-06, + "loss": 1.5102, + "step": 73450 + }, + { + "epoch": 2.576081621204684, + "grad_norm": 9.487264633178711, + "learning_rate": 7.850381124284174e-06, + "loss": 1.4282, + "step": 73475 + }, + { + "epoch": 2.5769581375780097, + "grad_norm": 9.896917343139648, + "learning_rate": 7.83414925528185e-06, + "loss": 1.5242, + "step": 73500 + }, + { + "epoch": 2.577834653951336, + "grad_norm": 5.128231525421143, + "learning_rate": 7.817917386279526e-06, + "loss": 1.4675, + "step": 73525 + }, + { + "epoch": 2.5787111703246617, + "grad_norm": 7.5461039543151855, + "learning_rate": 7.801685517277203e-06, + "loss": 1.4715, + "step": 73550 + }, + { + "epoch": 2.5795876866979874, + "grad_norm": 5.023724555969238, + "learning_rate": 7.785453648274877e-06, + "loss": 1.3691, + "step": 73575 + }, + { + "epoch": 2.5804642030713136, + "grad_norm": 5.552962303161621, + "learning_rate": 7.769221779272554e-06, + "loss": 1.2872, + "step": 73600 + }, + { + "epoch": 2.5813407194446394, + "grad_norm": 6.5435895919799805, + "learning_rate": 7.752989910270228e-06, + "loss": 1.2741, + "step": 73625 + }, + { + "epoch": 2.582217235817965, + "grad_norm": 8.361076354980469, + "learning_rate": 7.736758041267905e-06, + "loss": 1.4268, + "step": 73650 + }, + { + "epoch": 2.583093752191291, + "grad_norm": 6.555883884429932, + "learning_rate": 7.72052617226558e-06, + "loss": 1.4774, + "step": 73675 + }, + { + "epoch": 2.5839702685646166, + "grad_norm": 4.667803764343262, + "learning_rate": 7.704294303263256e-06, + "loss": 1.518, + "step": 73700 + }, + { + "epoch": 2.584846784937943, + "grad_norm": 6.122119903564453, + "learning_rate": 7.688062434260932e-06, + "loss": 1.4184, + "step": 73725 + }, + { + "epoch": 2.5857233013112686, + "grad_norm": 5.330521583557129, + "learning_rate": 7.671830565258607e-06, + "loss": 1.2353, + "step": 73750 + }, + { + "epoch": 2.5865998176845943, + "grad_norm": 8.081421852111816, + "learning_rate": 7.655598696256281e-06, + "loss": 1.2257, + "step": 73775 + }, + { + "epoch": 2.58747633405792, + "grad_norm": 3.388977527618408, + "learning_rate": 7.639366827253958e-06, + "loss": 1.6802, + "step": 73800 + }, + { + "epoch": 2.588352850431246, + "grad_norm": 5.212347030639648, + "learning_rate": 7.623134958251633e-06, + "loss": 1.4001, + "step": 73825 + }, + { + "epoch": 2.589229366804572, + "grad_norm": 4.9938836097717285, + "learning_rate": 7.606903089249309e-06, + "loss": 1.6174, + "step": 73850 + }, + { + "epoch": 2.590105883177898, + "grad_norm": 5.967789173126221, + "learning_rate": 7.590671220246985e-06, + "loss": 1.0639, + "step": 73875 + }, + { + "epoch": 2.5909823995512236, + "grad_norm": 5.017436981201172, + "learning_rate": 7.5744393512446595e-06, + "loss": 1.3648, + "step": 73900 + }, + { + "epoch": 2.5918589159245493, + "grad_norm": 5.0958099365234375, + "learning_rate": 7.558207482242336e-06, + "loss": 1.6977, + "step": 73925 + }, + { + "epoch": 2.592735432297875, + "grad_norm": 10.95426082611084, + "learning_rate": 7.541975613240011e-06, + "loss": 1.4085, + "step": 73950 + }, + { + "epoch": 2.5936119486712013, + "grad_norm": 9.451430320739746, + "learning_rate": 7.525743744237687e-06, + "loss": 1.4622, + "step": 73975 + }, + { + "epoch": 2.594488465044527, + "grad_norm": 3.822831392288208, + "learning_rate": 7.509511875235363e-06, + "loss": 1.3303, 
+ "step": 74000 + }, + { + "epoch": 2.5953649814178528, + "grad_norm": 8.17967414855957, + "learning_rate": 7.493280006233038e-06, + "loss": 1.1378, + "step": 74025 + }, + { + "epoch": 2.596241497791179, + "grad_norm": 3.7249975204467773, + "learning_rate": 7.477048137230713e-06, + "loss": 1.4206, + "step": 74050 + }, + { + "epoch": 2.5971180141645047, + "grad_norm": 5.054248332977295, + "learning_rate": 7.460816268228389e-06, + "loss": 1.238, + "step": 74075 + }, + { + "epoch": 2.5979945305378305, + "grad_norm": 6.2494025230407715, + "learning_rate": 7.444584399226065e-06, + "loss": 1.3038, + "step": 74100 + }, + { + "epoch": 2.5988710469111562, + "grad_norm": 5.814448356628418, + "learning_rate": 7.428352530223741e-06, + "loss": 1.1957, + "step": 74125 + }, + { + "epoch": 2.599747563284482, + "grad_norm": 9.802275657653809, + "learning_rate": 7.412120661221417e-06, + "loss": 1.411, + "step": 74150 + }, + { + "epoch": 2.600624079657808, + "grad_norm": 6.956434726715088, + "learning_rate": 7.395888792219091e-06, + "loss": 1.3029, + "step": 74175 + }, + { + "epoch": 2.601500596031134, + "grad_norm": 10.115100860595703, + "learning_rate": 7.379656923216767e-06, + "loss": 1.3268, + "step": 74200 + }, + { + "epoch": 2.6023771124044597, + "grad_norm": 3.4571001529693604, + "learning_rate": 7.363425054214443e-06, + "loss": 1.3909, + "step": 74225 + }, + { + "epoch": 2.6032536287777854, + "grad_norm": 4.039469242095947, + "learning_rate": 7.347193185212119e-06, + "loss": 1.3862, + "step": 74250 + }, + { + "epoch": 2.604130145151111, + "grad_norm": 6.4111647605896, + "learning_rate": 7.3309613162097944e-06, + "loss": 1.4502, + "step": 74275 + }, + { + "epoch": 2.6050066615244374, + "grad_norm": 5.054339408874512, + "learning_rate": 7.314729447207469e-06, + "loss": 1.2077, + "step": 74300 + }, + { + "epoch": 2.605883177897763, + "grad_norm": 5.502564907073975, + "learning_rate": 7.298497578205145e-06, + "loss": 1.3556, + "step": 74325 + }, + { + "epoch": 2.606759694271089, + "grad_norm": 7.330199241638184, + "learning_rate": 7.282265709202821e-06, + "loss": 1.2709, + "step": 74350 + }, + { + "epoch": 2.607636210644415, + "grad_norm": 7.680824279785156, + "learning_rate": 7.266033840200496e-06, + "loss": 1.2934, + "step": 74375 + }, + { + "epoch": 2.608512727017741, + "grad_norm": 8.73070240020752, + "learning_rate": 7.249801971198173e-06, + "loss": 1.6146, + "step": 74400 + }, + { + "epoch": 2.6093892433910666, + "grad_norm": 4.549934387207031, + "learning_rate": 7.233570102195848e-06, + "loss": 1.3314, + "step": 74425 + }, + { + "epoch": 2.6102657597643923, + "grad_norm": 0.06053706258535385, + "learning_rate": 7.217338233193523e-06, + "loss": 1.0648, + "step": 74450 + }, + { + "epoch": 2.611142276137718, + "grad_norm": 3.652555465698242, + "learning_rate": 7.201106364191198e-06, + "loss": 1.1319, + "step": 74475 + }, + { + "epoch": 2.6120187925110443, + "grad_norm": 4.677942276000977, + "learning_rate": 7.184874495188875e-06, + "loss": 1.2361, + "step": 74500 + }, + { + "epoch": 2.61289530888437, + "grad_norm": 7.708515167236328, + "learning_rate": 7.16864262618655e-06, + "loss": 1.259, + "step": 74525 + }, + { + "epoch": 2.613771825257696, + "grad_norm": 5.110644817352295, + "learning_rate": 7.152410757184226e-06, + "loss": 1.2775, + "step": 74550 + }, + { + "epoch": 2.6146483416310216, + "grad_norm": 5.966833114624023, + "learning_rate": 7.1361788881819e-06, + "loss": 1.4247, + "step": 74575 + }, + { + "epoch": 2.6155248580043473, + "grad_norm": 5.0785393714904785, + "learning_rate": 
7.119947019179577e-06, + "loss": 1.4657, + "step": 74600 + }, + { + "epoch": 2.6164013743776735, + "grad_norm": 7.345777988433838, + "learning_rate": 7.103715150177252e-06, + "loss": 1.0952, + "step": 74625 + }, + { + "epoch": 2.6172778907509993, + "grad_norm": 5.5367512702941895, + "learning_rate": 7.087483281174928e-06, + "loss": 1.3343, + "step": 74650 + }, + { + "epoch": 2.618154407124325, + "grad_norm": 11.372601509094238, + "learning_rate": 7.071251412172604e-06, + "loss": 1.2762, + "step": 74675 + }, + { + "epoch": 2.619030923497651, + "grad_norm": 7.671321868896484, + "learning_rate": 7.0550195431702795e-06, + "loss": 1.3482, + "step": 74700 + }, + { + "epoch": 2.6199074398709765, + "grad_norm": 5.295323371887207, + "learning_rate": 7.038787674167954e-06, + "loss": 1.3911, + "step": 74725 + }, + { + "epoch": 2.6207839562443027, + "grad_norm": 5.153957366943359, + "learning_rate": 7.0225558051656305e-06, + "loss": 1.6349, + "step": 74750 + }, + { + "epoch": 2.6216604726176285, + "grad_norm": 7.1874260902404785, + "learning_rate": 7.006323936163306e-06, + "loss": 1.1379, + "step": 74775 + }, + { + "epoch": 2.622536988990954, + "grad_norm": 4.659069538116455, + "learning_rate": 6.9900920671609815e-06, + "loss": 1.3724, + "step": 74800 + }, + { + "epoch": 2.6234135053642804, + "grad_norm": 5.004680156707764, + "learning_rate": 6.973860198158658e-06, + "loss": 1.402, + "step": 74825 + }, + { + "epoch": 2.624290021737606, + "grad_norm": 3.333934783935547, + "learning_rate": 6.9576283291563325e-06, + "loss": 1.2723, + "step": 74850 + }, + { + "epoch": 2.625166538110932, + "grad_norm": 6.849174499511719, + "learning_rate": 6.941396460154008e-06, + "loss": 1.5981, + "step": 74875 + }, + { + "epoch": 2.6260430544842577, + "grad_norm": 6.498462677001953, + "learning_rate": 6.9251645911516835e-06, + "loss": 1.368, + "step": 74900 + }, + { + "epoch": 2.6269195708575834, + "grad_norm": 3.6896936893463135, + "learning_rate": 6.90893272214936e-06, + "loss": 1.3484, + "step": 74925 + }, + { + "epoch": 2.6277960872309096, + "grad_norm": 5.294214725494385, + "learning_rate": 6.892700853147035e-06, + "loss": 1.3467, + "step": 74950 + }, + { + "epoch": 2.6286726036042354, + "grad_norm": 6.503402233123779, + "learning_rate": 6.876468984144712e-06, + "loss": 1.2135, + "step": 74975 + }, + { + "epoch": 2.629549119977561, + "grad_norm": 5.606820106506348, + "learning_rate": 6.8602371151423854e-06, + "loss": 1.4833, + "step": 75000 + }, + { + "epoch": 2.630425636350887, + "grad_norm": 4.757790565490723, + "learning_rate": 6.844005246140062e-06, + "loss": 1.4138, + "step": 75025 + }, + { + "epoch": 2.6313021527242126, + "grad_norm": 5.489360332489014, + "learning_rate": 6.827773377137737e-06, + "loss": 1.4047, + "step": 75050 + }, + { + "epoch": 2.632178669097539, + "grad_norm": 5.0659356117248535, + "learning_rate": 6.811541508135414e-06, + "loss": 1.1946, + "step": 75075 + }, + { + "epoch": 2.6330551854708646, + "grad_norm": 4.138230800628662, + "learning_rate": 6.795309639133089e-06, + "loss": 1.6412, + "step": 75100 + }, + { + "epoch": 2.6339317018441903, + "grad_norm": 4.599127292633057, + "learning_rate": 6.779077770130764e-06, + "loss": 1.4478, + "step": 75125 + }, + { + "epoch": 2.6348082182175165, + "grad_norm": 10.13494873046875, + "learning_rate": 6.762845901128439e-06, + "loss": 1.395, + "step": 75150 + }, + { + "epoch": 2.6356847345908423, + "grad_norm": 8.80366039276123, + "learning_rate": 6.746614032126116e-06, + "loss": 1.7911, + "step": 75175 + }, + { + "epoch": 2.636561250964168, + 
"grad_norm": 9.562262535095215, + "learning_rate": 6.730382163123791e-06, + "loss": 1.5228, + "step": 75200 + }, + { + "epoch": 2.637437767337494, + "grad_norm": 5.110538959503174, + "learning_rate": 6.714150294121467e-06, + "loss": 1.471, + "step": 75225 + }, + { + "epoch": 2.6383142837108196, + "grad_norm": 9.712437629699707, + "learning_rate": 6.697918425119143e-06, + "loss": 1.4648, + "step": 75250 + }, + { + "epoch": 2.6391908000841457, + "grad_norm": 3.866316795349121, + "learning_rate": 6.6816865561168175e-06, + "loss": 1.5582, + "step": 75275 + }, + { + "epoch": 2.6400673164574715, + "grad_norm": 5.292516231536865, + "learning_rate": 6.665454687114493e-06, + "loss": 1.3762, + "step": 75300 + }, + { + "epoch": 2.6409438328307973, + "grad_norm": 13.311901092529297, + "learning_rate": 6.649222818112169e-06, + "loss": 1.3665, + "step": 75325 + }, + { + "epoch": 2.641820349204123, + "grad_norm": 10.943368911743164, + "learning_rate": 6.632990949109845e-06, + "loss": 1.395, + "step": 75350 + }, + { + "epoch": 2.6426968655774488, + "grad_norm": 5.143752098083496, + "learning_rate": 6.61675908010752e-06, + "loss": 1.2962, + "step": 75375 + }, + { + "epoch": 2.643573381950775, + "grad_norm": 3.4305734634399414, + "learning_rate": 6.600527211105195e-06, + "loss": 1.2241, + "step": 75400 + }, + { + "epoch": 2.6444498983241007, + "grad_norm": 5.105605125427246, + "learning_rate": 6.584295342102871e-06, + "loss": 1.3979, + "step": 75425 + }, + { + "epoch": 2.6453264146974265, + "grad_norm": 5.093878269195557, + "learning_rate": 6.568063473100547e-06, + "loss": 1.3412, + "step": 75450 + }, + { + "epoch": 2.6462029310707527, + "grad_norm": 11.473904609680176, + "learning_rate": 6.551831604098223e-06, + "loss": 1.3723, + "step": 75475 + }, + { + "epoch": 2.6470794474440784, + "grad_norm": 5.198171615600586, + "learning_rate": 6.535599735095899e-06, + "loss": 1.3401, + "step": 75500 + }, + { + "epoch": 2.647955963817404, + "grad_norm": 6.733193874359131, + "learning_rate": 6.519367866093574e-06, + "loss": 1.709, + "step": 75525 + }, + { + "epoch": 2.64883248019073, + "grad_norm": 6.512747764587402, + "learning_rate": 6.503135997091249e-06, + "loss": 1.216, + "step": 75550 + }, + { + "epoch": 2.6497089965640557, + "grad_norm": 4.305744647979736, + "learning_rate": 6.486904128088925e-06, + "loss": 1.2698, + "step": 75575 + }, + { + "epoch": 2.650585512937382, + "grad_norm": 4.221667289733887, + "learning_rate": 6.470672259086601e-06, + "loss": 1.2331, + "step": 75600 + }, + { + "epoch": 2.6514620293107076, + "grad_norm": 4.361315727233887, + "learning_rate": 6.454440390084276e-06, + "loss": 1.2421, + "step": 75625 + }, + { + "epoch": 2.6523385456840334, + "grad_norm": 5.290825366973877, + "learning_rate": 6.4382085210819525e-06, + "loss": 1.4057, + "step": 75650 + }, + { + "epoch": 2.653215062057359, + "grad_norm": 5.744316101074219, + "learning_rate": 6.421976652079627e-06, + "loss": 1.333, + "step": 75675 + }, + { + "epoch": 2.654091578430685, + "grad_norm": 14.580558776855469, + "learning_rate": 6.405744783077303e-06, + "loss": 1.3117, + "step": 75700 + }, + { + "epoch": 2.654968094804011, + "grad_norm": 9.074707984924316, + "learning_rate": 6.389512914074978e-06, + "loss": 1.4336, + "step": 75725 + }, + { + "epoch": 2.655844611177337, + "grad_norm": 7.365807056427002, + "learning_rate": 6.3732810450726545e-06, + "loss": 1.2236, + "step": 75750 + }, + { + "epoch": 2.6567211275506626, + "grad_norm": 3.5461647510528564, + "learning_rate": 6.35704917607033e-06, + "loss": 1.479, + "step": 75775 + }, + 
{ + "epoch": 2.6575976439239883, + "grad_norm": 9.430548667907715, + "learning_rate": 6.340817307068006e-06, + "loss": 1.3693, + "step": 75800 + }, + { + "epoch": 2.658474160297314, + "grad_norm": 6.8220295906066895, + "learning_rate": 6.32458543806568e-06, + "loss": 1.4766, + "step": 75825 + }, + { + "epoch": 2.6593506766706403, + "grad_norm": 7.507476806640625, + "learning_rate": 6.3083535690633564e-06, + "loss": 1.3477, + "step": 75850 + }, + { + "epoch": 2.660227193043966, + "grad_norm": 6.184453010559082, + "learning_rate": 6.292121700061032e-06, + "loss": 1.3915, + "step": 75875 + }, + { + "epoch": 2.661103709417292, + "grad_norm": 7.317634582519531, + "learning_rate": 6.275889831058708e-06, + "loss": 1.4044, + "step": 75900 + }, + { + "epoch": 2.661980225790618, + "grad_norm": 7.0821990966796875, + "learning_rate": 6.259657962056384e-06, + "loss": 1.494, + "step": 75925 + }, + { + "epoch": 2.6628567421639437, + "grad_norm": 6.859158039093018, + "learning_rate": 6.243426093054059e-06, + "loss": 1.1895, + "step": 75950 + }, + { + "epoch": 2.6637332585372695, + "grad_norm": 7.656239986419678, + "learning_rate": 6.227194224051734e-06, + "loss": 1.3335, + "step": 75975 + }, + { + "epoch": 2.6646097749105953, + "grad_norm": 7.039669513702393, + "learning_rate": 6.21096235504941e-06, + "loss": 1.4697, + "step": 76000 + }, + { + "epoch": 2.665486291283921, + "grad_norm": 6.685445785522461, + "learning_rate": 6.194730486047086e-06, + "loss": 1.5375, + "step": 76025 + }, + { + "epoch": 2.666362807657247, + "grad_norm": 5.206323623657227, + "learning_rate": 6.178498617044761e-06, + "loss": 1.1749, + "step": 76050 + }, + { + "epoch": 2.667239324030573, + "grad_norm": 3.3454227447509766, + "learning_rate": 6.162266748042437e-06, + "loss": 1.281, + "step": 76075 + }, + { + "epoch": 2.6681158404038987, + "grad_norm": 7.117255210876465, + "learning_rate": 6.146034879040113e-06, + "loss": 1.2871, + "step": 76100 + }, + { + "epoch": 2.6689923567772245, + "grad_norm": 5.251532554626465, + "learning_rate": 6.129803010037788e-06, + "loss": 1.1861, + "step": 76125 + }, + { + "epoch": 2.66986887315055, + "grad_norm": 6.722720146179199, + "learning_rate": 6.113571141035464e-06, + "loss": 1.5876, + "step": 76150 + }, + { + "epoch": 2.6707453895238764, + "grad_norm": 6.251155853271484, + "learning_rate": 6.0973392720331395e-06, + "loss": 1.3374, + "step": 76175 + }, + { + "epoch": 2.671621905897202, + "grad_norm": 8.316293716430664, + "learning_rate": 6.081107403030815e-06, + "loss": 1.3808, + "step": 76200 + }, + { + "epoch": 2.672498422270528, + "grad_norm": 7.437630653381348, + "learning_rate": 6.0648755340284905e-06, + "loss": 1.2646, + "step": 76225 + }, + { + "epoch": 2.673374938643854, + "grad_norm": 4.873050689697266, + "learning_rate": 6.048643665026166e-06, + "loss": 1.3152, + "step": 76250 + }, + { + "epoch": 2.67425145501718, + "grad_norm": 7.313154697418213, + "learning_rate": 6.0324117960238415e-06, + "loss": 1.4322, + "step": 76275 + }, + { + "epoch": 2.6751279713905056, + "grad_norm": 5.168817043304443, + "learning_rate": 6.016179927021518e-06, + "loss": 1.3014, + "step": 76300 + }, + { + "epoch": 2.6760044877638314, + "grad_norm": 5.063718318939209, + "learning_rate": 5.9999480580191925e-06, + "loss": 1.3714, + "step": 76325 + }, + { + "epoch": 2.676881004137157, + "grad_norm": 6.373976230621338, + "learning_rate": 5.983716189016869e-06, + "loss": 1.3965, + "step": 76350 + }, + { + "epoch": 2.6777575205104833, + "grad_norm": 9.500669479370117, + "learning_rate": 5.967484320014544e-06, + 
"loss": 1.5894, + "step": 76375 + }, + { + "epoch": 2.678634036883809, + "grad_norm": 6.216893196105957, + "learning_rate": 5.95125245101222e-06, + "loss": 1.4692, + "step": 76400 + }, + { + "epoch": 2.679510553257135, + "grad_norm": 7.569559574127197, + "learning_rate": 5.935020582009895e-06, + "loss": 1.5031, + "step": 76425 + }, + { + "epoch": 2.6803870696304606, + "grad_norm": 3.224551200866699, + "learning_rate": 5.918788713007571e-06, + "loss": 1.3127, + "step": 76450 + }, + { + "epoch": 2.6812635860037863, + "grad_norm": 0.057992856949567795, + "learning_rate": 5.902556844005246e-06, + "loss": 1.4078, + "step": 76475 + }, + { + "epoch": 2.6821401023771125, + "grad_norm": 10.982565879821777, + "learning_rate": 5.886324975002922e-06, + "loss": 1.1203, + "step": 76500 + }, + { + "epoch": 2.6830166187504383, + "grad_norm": 5.57172155380249, + "learning_rate": 5.870093106000597e-06, + "loss": 1.3235, + "step": 76525 + }, + { + "epoch": 2.683893135123764, + "grad_norm": 10.920694351196289, + "learning_rate": 5.853861236998273e-06, + "loss": 1.5247, + "step": 76550 + }, + { + "epoch": 2.6847696514970902, + "grad_norm": 5.375068664550781, + "learning_rate": 5.837629367995949e-06, + "loss": 1.5153, + "step": 76575 + }, + { + "epoch": 2.6856461678704155, + "grad_norm": 4.988615989685059, + "learning_rate": 5.821397498993624e-06, + "loss": 1.1255, + "step": 76600 + }, + { + "epoch": 2.6865226842437417, + "grad_norm": 4.1690497398376465, + "learning_rate": 5.8051656299913e-06, + "loss": 1.1766, + "step": 76625 + }, + { + "epoch": 2.6873992006170675, + "grad_norm": 3.1485774517059326, + "learning_rate": 5.788933760988976e-06, + "loss": 1.2029, + "step": 76650 + }, + { + "epoch": 2.6882757169903932, + "grad_norm": 7.43988037109375, + "learning_rate": 5.772701891986651e-06, + "loss": 1.5346, + "step": 76675 + }, + { + "epoch": 2.6891522333637194, + "grad_norm": 4.807683944702148, + "learning_rate": 5.7564700229843266e-06, + "loss": 1.3551, + "step": 76700 + }, + { + "epoch": 2.690028749737045, + "grad_norm": 6.876160144805908, + "learning_rate": 5.740238153982003e-06, + "loss": 1.2595, + "step": 76725 + }, + { + "epoch": 2.690905266110371, + "grad_norm": 6.130712509155273, + "learning_rate": 5.7240062849796776e-06, + "loss": 1.2588, + "step": 76750 + }, + { + "epoch": 2.6917817824836967, + "grad_norm": 13.247418403625488, + "learning_rate": 5.707774415977354e-06, + "loss": 1.3528, + "step": 76775 + }, + { + "epoch": 2.6926582988570225, + "grad_norm": 10.209839820861816, + "learning_rate": 5.6915425469750285e-06, + "loss": 1.4386, + "step": 76800 + }, + { + "epoch": 2.6935348152303487, + "grad_norm": 8.98093032836914, + "learning_rate": 5.675310677972705e-06, + "loss": 1.4535, + "step": 76825 + }, + { + "epoch": 2.6944113316036744, + "grad_norm": 5.765896797180176, + "learning_rate": 5.65907880897038e-06, + "loss": 1.187, + "step": 76850 + }, + { + "epoch": 2.695287847977, + "grad_norm": 7.152120590209961, + "learning_rate": 5.642846939968056e-06, + "loss": 1.2143, + "step": 76875 + }, + { + "epoch": 2.696164364350326, + "grad_norm": 9.47020149230957, + "learning_rate": 5.626615070965731e-06, + "loss": 1.3886, + "step": 76900 + }, + { + "epoch": 2.6970408807236517, + "grad_norm": 6.529796123504639, + "learning_rate": 5.610383201963408e-06, + "loss": 1.2684, + "step": 76925 + }, + { + "epoch": 2.697917397096978, + "grad_norm": 7.259756088256836, + "learning_rate": 5.594151332961082e-06, + "loss": 1.4109, + "step": 76950 + }, + { + "epoch": 2.6987939134703036, + "grad_norm": 6.058772087097168, + 
"learning_rate": 5.577919463958759e-06, + "loss": 1.2843, + "step": 76975 + }, + { + "epoch": 2.6996704298436294, + "grad_norm": 6.7160234451293945, + "learning_rate": 5.561687594956434e-06, + "loss": 1.5227, + "step": 77000 + }, + { + "epoch": 2.7005469462169556, + "grad_norm": 6.782296180725098, + "learning_rate": 5.54545572595411e-06, + "loss": 1.2623, + "step": 77025 + }, + { + "epoch": 2.7014234625902813, + "grad_norm": 8.887977600097656, + "learning_rate": 5.529223856951785e-06, + "loss": 1.5291, + "step": 77050 + }, + { + "epoch": 2.702299978963607, + "grad_norm": 9.76166820526123, + "learning_rate": 5.512991987949461e-06, + "loss": 1.3218, + "step": 77075 + }, + { + "epoch": 2.703176495336933, + "grad_norm": 3.8037917613983154, + "learning_rate": 5.496760118947136e-06, + "loss": 1.2855, + "step": 77100 + }, + { + "epoch": 2.7040530117102586, + "grad_norm": 7.271895408630371, + "learning_rate": 5.4805282499448125e-06, + "loss": 1.4105, + "step": 77125 + }, + { + "epoch": 2.7049295280835848, + "grad_norm": 8.466750144958496, + "learning_rate": 5.464296380942487e-06, + "loss": 1.3299, + "step": 77150 + }, + { + "epoch": 2.7058060444569105, + "grad_norm": 8.896145820617676, + "learning_rate": 5.4480645119401635e-06, + "loss": 1.3396, + "step": 77175 + }, + { + "epoch": 2.7066825608302363, + "grad_norm": 5.753562927246094, + "learning_rate": 5.431832642937839e-06, + "loss": 1.3699, + "step": 77200 + }, + { + "epoch": 2.707559077203562, + "grad_norm": 8.894001960754395, + "learning_rate": 5.4156007739355145e-06, + "loss": 1.4429, + "step": 77225 + }, + { + "epoch": 2.708435593576888, + "grad_norm": 10.95934772491455, + "learning_rate": 5.39936890493319e-06, + "loss": 1.3457, + "step": 77250 + }, + { + "epoch": 2.709312109950214, + "grad_norm": 7.461177349090576, + "learning_rate": 5.3831370359308654e-06, + "loss": 1.7164, + "step": 77275 + }, + { + "epoch": 2.7101886263235397, + "grad_norm": 6.811636447906494, + "learning_rate": 5.366905166928541e-06, + "loss": 1.3483, + "step": 77300 + }, + { + "epoch": 2.7110651426968655, + "grad_norm": 10.19597053527832, + "learning_rate": 5.3506732979262164e-06, + "loss": 1.2233, + "step": 77325 + }, + { + "epoch": 2.7119416590701917, + "grad_norm": 6.083794116973877, + "learning_rate": 5.334441428923892e-06, + "loss": 1.4828, + "step": 77350 + }, + { + "epoch": 2.7128181754435174, + "grad_norm": 6.809365272521973, + "learning_rate": 5.318209559921567e-06, + "loss": 1.455, + "step": 77375 + }, + { + "epoch": 2.713694691816843, + "grad_norm": 6.594851016998291, + "learning_rate": 5.301977690919244e-06, + "loss": 1.2923, + "step": 77400 + }, + { + "epoch": 2.714571208190169, + "grad_norm": 7.028031826019287, + "learning_rate": 5.285745821916919e-06, + "loss": 1.2006, + "step": 77425 + }, + { + "epoch": 2.7154477245634947, + "grad_norm": 6.82227897644043, + "learning_rate": 5.269513952914595e-06, + "loss": 1.4075, + "step": 77450 + }, + { + "epoch": 2.716324240936821, + "grad_norm": 3.540800094604492, + "learning_rate": 5.25328208391227e-06, + "loss": 1.2831, + "step": 77475 + }, + { + "epoch": 2.7172007573101467, + "grad_norm": 5.188521862030029, + "learning_rate": 5.237050214909946e-06, + "loss": 1.2054, + "step": 77500 + }, + { + "epoch": 2.7180772736834724, + "grad_norm": 11.096158027648926, + "learning_rate": 5.220818345907621e-06, + "loss": 1.3434, + "step": 77525 + }, + { + "epoch": 2.718953790056798, + "grad_norm": 6.422120571136475, + "learning_rate": 5.204586476905297e-06, + "loss": 1.4237, + "step": 77550 + }, + { + "epoch": 
2.719830306430124, + "grad_norm": 4.977518558502197, + "learning_rate": 5.188354607902972e-06, + "loss": 1.1719, + "step": 77575 + }, + { + "epoch": 2.72070682280345, + "grad_norm": 4.851576328277588, + "learning_rate": 5.1721227389006485e-06, + "loss": 1.5844, + "step": 77600 + }, + { + "epoch": 2.721583339176776, + "grad_norm": 8.687397003173828, + "learning_rate": 5.155890869898323e-06, + "loss": 1.3003, + "step": 77625 + }, + { + "epoch": 2.7224598555501016, + "grad_norm": 5.919240951538086, + "learning_rate": 5.1396590008959995e-06, + "loss": 1.2914, + "step": 77650 + }, + { + "epoch": 2.7233363719234274, + "grad_norm": 6.112980842590332, + "learning_rate": 5.123427131893675e-06, + "loss": 1.4367, + "step": 77675 + }, + { + "epoch": 2.724212888296753, + "grad_norm": 7.2207350730896, + "learning_rate": 5.1071952628913505e-06, + "loss": 1.3135, + "step": 77700 + }, + { + "epoch": 2.7250894046700793, + "grad_norm": 3.5680463314056396, + "learning_rate": 5.090963393889026e-06, + "loss": 1.1692, + "step": 77725 + }, + { + "epoch": 2.725965921043405, + "grad_norm": 6.400141716003418, + "learning_rate": 5.074731524886702e-06, + "loss": 1.0558, + "step": 77750 + }, + { + "epoch": 2.726842437416731, + "grad_norm": 6.259448528289795, + "learning_rate": 5.058499655884377e-06, + "loss": 1.3765, + "step": 77775 + }, + { + "epoch": 2.727718953790057, + "grad_norm": 7.8382673263549805, + "learning_rate": 5.042267786882053e-06, + "loss": 1.468, + "step": 77800 + }, + { + "epoch": 2.7285954701633828, + "grad_norm": 3.2454276084899902, + "learning_rate": 5.026035917879728e-06, + "loss": 1.1505, + "step": 77825 + }, + { + "epoch": 2.7294719865367085, + "grad_norm": 6.973567008972168, + "learning_rate": 5.009804048877404e-06, + "loss": 1.5313, + "step": 77850 + }, + { + "epoch": 2.7303485029100343, + "grad_norm": 11.360092163085938, + "learning_rate": 4.99357217987508e-06, + "loss": 1.5066, + "step": 77875 + }, + { + "epoch": 2.73122501928336, + "grad_norm": 6.053464412689209, + "learning_rate": 4.977340310872755e-06, + "loss": 1.4727, + "step": 77900 + }, + { + "epoch": 2.7321015356566862, + "grad_norm": 5.319965839385986, + "learning_rate": 4.961108441870431e-06, + "loss": 1.3217, + "step": 77925 + }, + { + "epoch": 2.732978052030012, + "grad_norm": 6.151439189910889, + "learning_rate": 4.944876572868107e-06, + "loss": 1.518, + "step": 77950 + }, + { + "epoch": 2.7338545684033377, + "grad_norm": 8.889856338500977, + "learning_rate": 4.928644703865782e-06, + "loss": 1.3471, + "step": 77975 + }, + { + "epoch": 2.7347310847766635, + "grad_norm": 5.211845874786377, + "learning_rate": 4.912412834863458e-06, + "loss": 1.5743, + "step": 78000 + }, + { + "epoch": 2.7356076011499892, + "grad_norm": 7.758738040924072, + "learning_rate": 4.896180965861134e-06, + "loss": 1.2881, + "step": 78025 + }, + { + "epoch": 2.7364841175233154, + "grad_norm": 3.355947732925415, + "learning_rate": 4.879949096858809e-06, + "loss": 1.1524, + "step": 78050 + }, + { + "epoch": 2.737360633896641, + "grad_norm": 5.239437580108643, + "learning_rate": 4.863717227856485e-06, + "loss": 1.3324, + "step": 78075 + }, + { + "epoch": 2.738237150269967, + "grad_norm": 3.655048131942749, + "learning_rate": 4.84748535885416e-06, + "loss": 1.3776, + "step": 78100 + }, + { + "epoch": 2.739113666643293, + "grad_norm": 5.687983512878418, + "learning_rate": 4.831253489851836e-06, + "loss": 1.4796, + "step": 78125 + }, + { + "epoch": 2.739990183016619, + "grad_norm": 8.908906936645508, + "learning_rate": 4.815021620849511e-06, + "loss": 1.2928, + 
"step": 78150 + }, + { + "epoch": 2.7408666993899446, + "grad_norm": 4.669451713562012, + "learning_rate": 4.7987897518471866e-06, + "loss": 1.2931, + "step": 78175 + }, + { + "epoch": 2.7417432157632704, + "grad_norm": 5.88787317276001, + "learning_rate": 4.782557882844863e-06, + "loss": 1.4383, + "step": 78200 + }, + { + "epoch": 2.742619732136596, + "grad_norm": 7.975228309631348, + "learning_rate": 4.766326013842538e-06, + "loss": 1.541, + "step": 78225 + }, + { + "epoch": 2.7434962485099224, + "grad_norm": 6.7949604988098145, + "learning_rate": 4.750094144840214e-06, + "loss": 1.2209, + "step": 78250 + }, + { + "epoch": 2.744372764883248, + "grad_norm": 5.103420734405518, + "learning_rate": 4.733862275837889e-06, + "loss": 1.3775, + "step": 78275 + }, + { + "epoch": 2.745249281256574, + "grad_norm": 6.0341010093688965, + "learning_rate": 4.717630406835565e-06, + "loss": 1.2444, + "step": 78300 + }, + { + "epoch": 2.7461257976298996, + "grad_norm": 4.131095886230469, + "learning_rate": 4.70139853783324e-06, + "loss": 1.3814, + "step": 78325 + }, + { + "epoch": 2.7470023140032254, + "grad_norm": 4.946516513824463, + "learning_rate": 4.685166668830916e-06, + "loss": 1.2862, + "step": 78350 + }, + { + "epoch": 2.7478788303765516, + "grad_norm": 4.987763404846191, + "learning_rate": 4.668934799828591e-06, + "loss": 1.4431, + "step": 78375 + }, + { + "epoch": 2.7487553467498773, + "grad_norm": 6.478204727172852, + "learning_rate": 4.652702930826267e-06, + "loss": 1.4452, + "step": 78400 + }, + { + "epoch": 2.749631863123203, + "grad_norm": 7.415268898010254, + "learning_rate": 4.636471061823943e-06, + "loss": 1.3217, + "step": 78425 + }, + { + "epoch": 2.7505083794965293, + "grad_norm": 5.237459659576416, + "learning_rate": 4.620239192821618e-06, + "loss": 1.3312, + "step": 78450 + }, + { + "epoch": 2.7513848958698546, + "grad_norm": 10.6133451461792, + "learning_rate": 4.604007323819294e-06, + "loss": 1.3392, + "step": 78475 + }, + { + "epoch": 2.7522614122431808, + "grad_norm": 5.747011661529541, + "learning_rate": 4.58777545481697e-06, + "loss": 1.3252, + "step": 78500 + }, + { + "epoch": 2.7531379286165065, + "grad_norm": 6.134425640106201, + "learning_rate": 4.571543585814645e-06, + "loss": 1.2595, + "step": 78525 + }, + { + "epoch": 2.7540144449898323, + "grad_norm": 4.423951148986816, + "learning_rate": 4.555311716812321e-06, + "loss": 1.0926, + "step": 78550 + }, + { + "epoch": 2.7548909613631585, + "grad_norm": 5.154847621917725, + "learning_rate": 4.539079847809997e-06, + "loss": 1.2344, + "step": 78575 + }, + { + "epoch": 2.7557674777364842, + "grad_norm": 7.072163105010986, + "learning_rate": 4.522847978807672e-06, + "loss": 1.56, + "step": 78600 + }, + { + "epoch": 2.75664399410981, + "grad_norm": 7.500062465667725, + "learning_rate": 4.506616109805348e-06, + "loss": 1.3621, + "step": 78625 + }, + { + "epoch": 2.7575205104831357, + "grad_norm": 0.046525463461875916, + "learning_rate": 4.490384240803023e-06, + "loss": 1.4193, + "step": 78650 + }, + { + "epoch": 2.7583970268564615, + "grad_norm": 3.9394466876983643, + "learning_rate": 4.474152371800699e-06, + "loss": 1.3183, + "step": 78675 + }, + { + "epoch": 2.7592735432297877, + "grad_norm": 4.6378045082092285, + "learning_rate": 4.4579205027983745e-06, + "loss": 1.2722, + "step": 78700 + }, + { + "epoch": 2.7601500596031134, + "grad_norm": 7.3371124267578125, + "learning_rate": 4.44168863379605e-06, + "loss": 1.2671, + "step": 78725 + }, + { + "epoch": 2.761026575976439, + "grad_norm": 11.176252365112305, + "learning_rate": 
4.4254567647937254e-06, + "loss": 1.2345, + "step": 78750 + }, + { + "epoch": 2.761903092349765, + "grad_norm": 4.346691608428955, + "learning_rate": 4.409224895791402e-06, + "loss": 1.3068, + "step": 78775 + }, + { + "epoch": 2.7627796087230907, + "grad_norm": 8.342177391052246, + "learning_rate": 4.3929930267890764e-06, + "loss": 1.2726, + "step": 78800 + }, + { + "epoch": 2.763656125096417, + "grad_norm": 4.911406993865967, + "learning_rate": 4.376761157786753e-06, + "loss": 1.1584, + "step": 78825 + }, + { + "epoch": 2.7645326414697426, + "grad_norm": 6.745550632476807, + "learning_rate": 4.360529288784428e-06, + "loss": 1.3463, + "step": 78850 + }, + { + "epoch": 2.7654091578430684, + "grad_norm": 5.295994758605957, + "learning_rate": 4.344297419782104e-06, + "loss": 1.2883, + "step": 78875 + }, + { + "epoch": 2.7662856742163946, + "grad_norm": 8.368355751037598, + "learning_rate": 4.328065550779779e-06, + "loss": 1.4686, + "step": 78900 + }, + { + "epoch": 2.7671621905897203, + "grad_norm": 5.3495378494262695, + "learning_rate": 4.311833681777455e-06, + "loss": 1.2225, + "step": 78925 + }, + { + "epoch": 2.768038706963046, + "grad_norm": 6.356222152709961, + "learning_rate": 4.29560181277513e-06, + "loss": 1.3788, + "step": 78950 + }, + { + "epoch": 2.768915223336372, + "grad_norm": 3.22284197807312, + "learning_rate": 4.279369943772807e-06, + "loss": 1.3918, + "step": 78975 + }, + { + "epoch": 2.7697917397096976, + "grad_norm": 3.769174337387085, + "learning_rate": 4.263138074770481e-06, + "loss": 1.3454, + "step": 79000 + }, + { + "epoch": 2.770668256083024, + "grad_norm": 7.480905532836914, + "learning_rate": 4.2469062057681576e-06, + "loss": 1.2558, + "step": 79025 + }, + { + "epoch": 2.7715447724563496, + "grad_norm": 5.967206001281738, + "learning_rate": 4.230674336765833e-06, + "loss": 1.1984, + "step": 79050 + }, + { + "epoch": 2.7724212888296753, + "grad_norm": 5.045833110809326, + "learning_rate": 4.2144424677635086e-06, + "loss": 1.4149, + "step": 79075 + }, + { + "epoch": 2.773297805203001, + "grad_norm": 8.399658203125, + "learning_rate": 4.198210598761184e-06, + "loss": 1.4665, + "step": 79100 + }, + { + "epoch": 2.774174321576327, + "grad_norm": 6.81123161315918, + "learning_rate": 4.1819787297588595e-06, + "loss": 1.4052, + "step": 79125 + }, + { + "epoch": 2.775050837949653, + "grad_norm": 3.2026560306549072, + "learning_rate": 4.165746860756535e-06, + "loss": 1.5719, + "step": 79150 + }, + { + "epoch": 2.7759273543229788, + "grad_norm": 7.025137901306152, + "learning_rate": 4.1495149917542105e-06, + "loss": 1.314, + "step": 79175 + }, + { + "epoch": 2.7768038706963045, + "grad_norm": 6.430960178375244, + "learning_rate": 4.133283122751886e-06, + "loss": 1.4388, + "step": 79200 + }, + { + "epoch": 2.7776803870696307, + "grad_norm": 5.7621378898620605, + "learning_rate": 4.1170512537495615e-06, + "loss": 1.4231, + "step": 79225 + }, + { + "epoch": 2.7785569034429565, + "grad_norm": 6.6016082763671875, + "learning_rate": 4.100819384747238e-06, + "loss": 1.2568, + "step": 79250 + }, + { + "epoch": 2.7794334198162822, + "grad_norm": 0.07205575704574585, + "learning_rate": 4.0845875157449125e-06, + "loss": 1.2344, + "step": 79275 + }, + { + "epoch": 2.780309936189608, + "grad_norm": 3.642509698867798, + "learning_rate": 4.068355646742589e-06, + "loss": 1.2478, + "step": 79300 + }, + { + "epoch": 2.7811864525629337, + "grad_norm": 11.069058418273926, + "learning_rate": 4.052123777740264e-06, + "loss": 1.2364, + "step": 79325 + }, + { + "epoch": 2.78206296893626, + 
"grad_norm": 6.974544048309326, + "learning_rate": 4.03589190873794e-06, + "loss": 1.2312, + "step": 79350 + }, + { + "epoch": 2.7829394853095857, + "grad_norm": 3.718480110168457, + "learning_rate": 4.019660039735615e-06, + "loss": 1.3228, + "step": 79375 + }, + { + "epoch": 2.7838160016829114, + "grad_norm": 6.707885265350342, + "learning_rate": 4.003428170733292e-06, + "loss": 1.4473, + "step": 79400 + }, + { + "epoch": 2.784692518056237, + "grad_norm": 10.062215805053711, + "learning_rate": 3.987196301730966e-06, + "loss": 1.5288, + "step": 79425 + }, + { + "epoch": 2.785569034429563, + "grad_norm": 3.5610263347625732, + "learning_rate": 3.970964432728643e-06, + "loss": 1.2645, + "step": 79450 + }, + { + "epoch": 2.786445550802889, + "grad_norm": 3.734896183013916, + "learning_rate": 3.954732563726317e-06, + "loss": 1.3574, + "step": 79475 + }, + { + "epoch": 2.787322067176215, + "grad_norm": 6.6372270584106445, + "learning_rate": 3.938500694723994e-06, + "loss": 1.3396, + "step": 79500 + }, + { + "epoch": 2.7881985835495406, + "grad_norm": 10.294515609741211, + "learning_rate": 3.922268825721669e-06, + "loss": 1.6193, + "step": 79525 + }, + { + "epoch": 2.7890750999228664, + "grad_norm": 6.8479437828063965, + "learning_rate": 3.906036956719345e-06, + "loss": 1.3885, + "step": 79550 + }, + { + "epoch": 2.789951616296192, + "grad_norm": 5.300049781799316, + "learning_rate": 3.88980508771702e-06, + "loss": 1.3419, + "step": 79575 + }, + { + "epoch": 2.7908281326695183, + "grad_norm": 14.37138557434082, + "learning_rate": 3.8735732187146964e-06, + "loss": 1.4329, + "step": 79600 + }, + { + "epoch": 2.791704649042844, + "grad_norm": 5.152371406555176, + "learning_rate": 3.857341349712371e-06, + "loss": 1.2614, + "step": 79625 + }, + { + "epoch": 2.79258116541617, + "grad_norm": 3.491879463195801, + "learning_rate": 3.8411094807100474e-06, + "loss": 1.3874, + "step": 79650 + }, + { + "epoch": 2.793457681789496, + "grad_norm": 4.934691905975342, + "learning_rate": 3.824877611707723e-06, + "loss": 1.3822, + "step": 79675 + }, + { + "epoch": 2.794334198162822, + "grad_norm": 4.129957675933838, + "learning_rate": 3.808645742705398e-06, + "loss": 1.3387, + "step": 79700 + }, + { + "epoch": 2.7952107145361476, + "grad_norm": 3.677853584289551, + "learning_rate": 3.792413873703074e-06, + "loss": 1.1727, + "step": 79725 + }, + { + "epoch": 2.7960872309094733, + "grad_norm": 7.801118850708008, + "learning_rate": 3.776182004700749e-06, + "loss": 1.6395, + "step": 79750 + }, + { + "epoch": 2.796963747282799, + "grad_norm": 7.72705078125, + "learning_rate": 3.759950135698425e-06, + "loss": 1.3956, + "step": 79775 + }, + { + "epoch": 2.7978402636561253, + "grad_norm": 6.382078170776367, + "learning_rate": 3.743718266696101e-06, + "loss": 1.2889, + "step": 79800 + }, + { + "epoch": 2.798716780029451, + "grad_norm": 7.35825252532959, + "learning_rate": 3.727486397693776e-06, + "loss": 1.4318, + "step": 79825 + }, + { + "epoch": 2.7995932964027768, + "grad_norm": 9.731989860534668, + "learning_rate": 3.711254528691452e-06, + "loss": 1.3292, + "step": 79850 + }, + { + "epoch": 2.8004698127761025, + "grad_norm": 6.887131214141846, + "learning_rate": 3.6950226596891277e-06, + "loss": 1.3596, + "step": 79875 + }, + { + "epoch": 2.8013463291494283, + "grad_norm": 7.106298923492432, + "learning_rate": 3.6787907906868028e-06, + "loss": 1.4116, + "step": 79900 + }, + { + "epoch": 2.8022228455227545, + "grad_norm": 18.433433532714844, + "learning_rate": 3.6625589216844787e-06, + "loss": 1.3441, + "step": 79925 + }, 
+ { + "epoch": 2.80309936189608, + "grad_norm": 10.6728515625, + "learning_rate": 3.6463270526821546e-06, + "loss": 1.2858, + "step": 79950 + }, + { + "epoch": 2.803975878269406, + "grad_norm": 7.218353748321533, + "learning_rate": 3.6300951836798297e-06, + "loss": 1.3489, + "step": 79975 + }, + { + "epoch": 2.804852394642732, + "grad_norm": 7.104700088500977, + "learning_rate": 3.6138633146775056e-06, + "loss": 1.5083, + "step": 80000 + }, + { + "epoch": 2.805728911016058, + "grad_norm": 7.2009735107421875, + "learning_rate": 3.5976314456751807e-06, + "loss": 1.2507, + "step": 80025 + }, + { + "epoch": 2.8066054273893837, + "grad_norm": 7.000702381134033, + "learning_rate": 3.5813995766728566e-06, + "loss": 1.4229, + "step": 80050 + }, + { + "epoch": 2.8074819437627094, + "grad_norm": 6.31213903427124, + "learning_rate": 3.5651677076705325e-06, + "loss": 1.2419, + "step": 80075 + }, + { + "epoch": 2.808358460136035, + "grad_norm": 4.740888595581055, + "learning_rate": 3.5489358386682076e-06, + "loss": 1.1388, + "step": 80100 + }, + { + "epoch": 2.8092349765093614, + "grad_norm": 6.305190086364746, + "learning_rate": 3.5327039696658835e-06, + "loss": 1.3354, + "step": 80125 + }, + { + "epoch": 2.810111492882687, + "grad_norm": 3.9003729820251465, + "learning_rate": 3.5164721006635594e-06, + "loss": 1.2819, + "step": 80150 + }, + { + "epoch": 2.810988009256013, + "grad_norm": 3.3928885459899902, + "learning_rate": 3.5002402316612345e-06, + "loss": 1.2283, + "step": 80175 + }, + { + "epoch": 2.8118645256293386, + "grad_norm": 15.806440353393555, + "learning_rate": 3.4840083626589104e-06, + "loss": 1.2074, + "step": 80200 + }, + { + "epoch": 2.8127410420026644, + "grad_norm": 5.793200969696045, + "learning_rate": 3.4677764936565863e-06, + "loss": 1.4672, + "step": 80225 + }, + { + "epoch": 2.8136175583759906, + "grad_norm": 8.38286018371582, + "learning_rate": 3.4515446246542614e-06, + "loss": 1.4306, + "step": 80250 + }, + { + "epoch": 2.8144940747493163, + "grad_norm": 5.444870948791504, + "learning_rate": 3.4353127556519373e-06, + "loss": 1.1624, + "step": 80275 + }, + { + "epoch": 2.815370591122642, + "grad_norm": 7.395496845245361, + "learning_rate": 3.4190808866496124e-06, + "loss": 1.2619, + "step": 80300 + }, + { + "epoch": 2.8162471074959683, + "grad_norm": 8.144991874694824, + "learning_rate": 3.4028490176472883e-06, + "loss": 1.4257, + "step": 80325 + }, + { + "epoch": 2.8171236238692936, + "grad_norm": 5.304666519165039, + "learning_rate": 3.3866171486449638e-06, + "loss": 1.1621, + "step": 80350 + }, + { + "epoch": 2.81800014024262, + "grad_norm": 5.488581657409668, + "learning_rate": 3.3703852796426393e-06, + "loss": 1.2161, + "step": 80375 + }, + { + "epoch": 2.8188766566159456, + "grad_norm": 5.32285213470459, + "learning_rate": 3.354153410640315e-06, + "loss": 1.4757, + "step": 80400 + }, + { + "epoch": 2.8197531729892713, + "grad_norm": 5.963435173034668, + "learning_rate": 3.3379215416379907e-06, + "loss": 1.6708, + "step": 80425 + }, + { + "epoch": 2.8206296893625975, + "grad_norm": 3.8267877101898193, + "learning_rate": 3.321689672635666e-06, + "loss": 1.3241, + "step": 80450 + }, + { + "epoch": 2.8215062057359233, + "grad_norm": 7.623220443725586, + "learning_rate": 3.3054578036333417e-06, + "loss": 1.2062, + "step": 80475 + }, + { + "epoch": 2.822382722109249, + "grad_norm": 5.632768630981445, + "learning_rate": 3.289225934631017e-06, + "loss": 1.3744, + "step": 80500 + }, + { + "epoch": 2.8232592384825748, + "grad_norm": 3.782719135284424, + "learning_rate": 
3.2729940656286926e-06, + "loss": 1.2975, + "step": 80525 + }, + { + "epoch": 2.8241357548559005, + "grad_norm": 3.5922834873199463, + "learning_rate": 3.2567621966263686e-06, + "loss": 1.2296, + "step": 80550 + }, + { + "epoch": 2.8250122712292267, + "grad_norm": 6.114442825317383, + "learning_rate": 3.2405303276240436e-06, + "loss": 1.2197, + "step": 80575 + }, + { + "epoch": 2.8258887876025525, + "grad_norm": 12.867429733276367, + "learning_rate": 3.2242984586217195e-06, + "loss": 1.7236, + "step": 80600 + }, + { + "epoch": 2.826765303975878, + "grad_norm": 10.661908149719238, + "learning_rate": 3.2080665896193955e-06, + "loss": 1.3798, + "step": 80625 + }, + { + "epoch": 2.827641820349204, + "grad_norm": 10.40960693359375, + "learning_rate": 3.1918347206170705e-06, + "loss": 1.2775, + "step": 80650 + }, + { + "epoch": 2.8285183367225297, + "grad_norm": 11.382696151733398, + "learning_rate": 3.1756028516147464e-06, + "loss": 1.2276, + "step": 80675 + }, + { + "epoch": 2.829394853095856, + "grad_norm": 3.87174129486084, + "learning_rate": 3.1593709826124224e-06, + "loss": 1.1232, + "step": 80700 + }, + { + "epoch": 2.8302713694691817, + "grad_norm": 7.642102241516113, + "learning_rate": 3.1431391136100974e-06, + "loss": 1.5679, + "step": 80725 + }, + { + "epoch": 2.8311478858425074, + "grad_norm": 6.910764694213867, + "learning_rate": 3.1269072446077733e-06, + "loss": 1.5345, + "step": 80750 + }, + { + "epoch": 2.8320244022158336, + "grad_norm": 6.0284881591796875, + "learning_rate": 3.110675375605449e-06, + "loss": 1.2483, + "step": 80775 + }, + { + "epoch": 2.8329009185891594, + "grad_norm": 9.472990989685059, + "learning_rate": 3.0944435066031243e-06, + "loss": 1.2737, + "step": 80800 + }, + { + "epoch": 2.833777434962485, + "grad_norm": 11.245659828186035, + "learning_rate": 3.0782116376008002e-06, + "loss": 1.2855, + "step": 80825 + }, + { + "epoch": 2.834653951335811, + "grad_norm": 8.226028442382812, + "learning_rate": 3.0619797685984757e-06, + "loss": 1.4676, + "step": 80850 + }, + { + "epoch": 2.8355304677091366, + "grad_norm": 5.627901077270508, + "learning_rate": 3.0457478995961512e-06, + "loss": 1.3422, + "step": 80875 + }, + { + "epoch": 2.836406984082463, + "grad_norm": 6.487750053405762, + "learning_rate": 3.0295160305938267e-06, + "loss": 1.4095, + "step": 80900 + }, + { + "epoch": 2.8372835004557886, + "grad_norm": 0.04943365976214409, + "learning_rate": 3.0132841615915026e-06, + "loss": 1.5158, + "step": 80925 + }, + { + "epoch": 2.8381600168291143, + "grad_norm": 4.715605735778809, + "learning_rate": 2.997052292589178e-06, + "loss": 1.3482, + "step": 80950 + }, + { + "epoch": 2.83903653320244, + "grad_norm": 5.4805803298950195, + "learning_rate": 2.9808204235868536e-06, + "loss": 1.1517, + "step": 80975 + }, + { + "epoch": 2.839913049575766, + "grad_norm": 5.006535530090332, + "learning_rate": 2.964588554584529e-06, + "loss": 1.2784, + "step": 81000 + }, + { + "epoch": 2.840789565949092, + "grad_norm": 5.935400485992432, + "learning_rate": 2.948356685582205e-06, + "loss": 1.2102, + "step": 81025 + }, + { + "epoch": 2.841666082322418, + "grad_norm": 9.105225563049316, + "learning_rate": 2.9321248165798805e-06, + "loss": 1.4337, + "step": 81050 + }, + { + "epoch": 2.8425425986957435, + "grad_norm": 5.483630657196045, + "learning_rate": 2.915892947577556e-06, + "loss": 1.5199, + "step": 81075 + }, + { + "epoch": 2.8434191150690697, + "grad_norm": 14.222515106201172, + "learning_rate": 2.8996610785752315e-06, + "loss": 1.3967, + "step": 81100 + }, + { + "epoch": 
2.8442956314423955, + "grad_norm": 9.472376823425293, + "learning_rate": 2.8834292095729074e-06, + "loss": 1.4819, + "step": 81125 + }, + { + "epoch": 2.8451721478157213, + "grad_norm": 3.555896043777466, + "learning_rate": 2.867197340570583e-06, + "loss": 1.4064, + "step": 81150 + }, + { + "epoch": 2.846048664189047, + "grad_norm": 9.98731517791748, + "learning_rate": 2.8509654715682584e-06, + "loss": 1.2325, + "step": 81175 + }, + { + "epoch": 2.8469251805623728, + "grad_norm": 3.5095250606536865, + "learning_rate": 2.8347336025659343e-06, + "loss": 1.2396, + "step": 81200 + }, + { + "epoch": 2.847801696935699, + "grad_norm": 6.878811836242676, + "learning_rate": 2.81850173356361e-06, + "loss": 1.1551, + "step": 81225 + }, + { + "epoch": 2.8486782133090247, + "grad_norm": 7.287352085113525, + "learning_rate": 2.8022698645612853e-06, + "loss": 1.3212, + "step": 81250 + }, + { + "epoch": 2.8495547296823505, + "grad_norm": 7.846263885498047, + "learning_rate": 2.786037995558961e-06, + "loss": 1.3025, + "step": 81275 + }, + { + "epoch": 2.850431246055676, + "grad_norm": 6.758120059967041, + "learning_rate": 2.7698061265566363e-06, + "loss": 1.3478, + "step": 81300 + }, + { + "epoch": 2.851307762429002, + "grad_norm": 4.598637104034424, + "learning_rate": 2.753574257554312e-06, + "loss": 1.5256, + "step": 81325 + }, + { + "epoch": 2.852184278802328, + "grad_norm": 3.775156259536743, + "learning_rate": 2.7373423885519873e-06, + "loss": 1.4409, + "step": 81350 + }, + { + "epoch": 2.853060795175654, + "grad_norm": 3.5845754146575928, + "learning_rate": 2.7211105195496628e-06, + "loss": 1.3711, + "step": 81375 + }, + { + "epoch": 2.8539373115489797, + "grad_norm": 7.024445056915283, + "learning_rate": 2.7048786505473387e-06, + "loss": 1.5177, + "step": 81400 + }, + { + "epoch": 2.8548138279223054, + "grad_norm": 7.209851264953613, + "learning_rate": 2.688646781545014e-06, + "loss": 1.4123, + "step": 81425 + }, + { + "epoch": 2.855690344295631, + "grad_norm": 6.876039505004883, + "learning_rate": 2.6724149125426897e-06, + "loss": 1.4622, + "step": 81450 + }, + { + "epoch": 2.8565668606689574, + "grad_norm": 9.791444778442383, + "learning_rate": 2.6561830435403656e-06, + "loss": 1.2702, + "step": 81475 + }, + { + "epoch": 2.857443377042283, + "grad_norm": 6.677450180053711, + "learning_rate": 2.639951174538041e-06, + "loss": 1.2048, + "step": 81500 + }, + { + "epoch": 2.858319893415609, + "grad_norm": 3.6715807914733887, + "learning_rate": 2.6237193055357166e-06, + "loss": 1.3688, + "step": 81525 + }, + { + "epoch": 2.859196409788935, + "grad_norm": 7.23144006729126, + "learning_rate": 2.607487436533392e-06, + "loss": 1.2488, + "step": 81550 + }, + { + "epoch": 2.860072926162261, + "grad_norm": 3.679901599884033, + "learning_rate": 2.591255567531068e-06, + "loss": 1.3243, + "step": 81575 + }, + { + "epoch": 2.8609494425355866, + "grad_norm": 5.301158428192139, + "learning_rate": 2.5750236985287435e-06, + "loss": 1.3441, + "step": 81600 + }, + { + "epoch": 2.8618259589089123, + "grad_norm": 7.830169677734375, + "learning_rate": 2.558791829526419e-06, + "loss": 1.3023, + "step": 81625 + }, + { + "epoch": 2.862702475282238, + "grad_norm": 11.17105770111084, + "learning_rate": 2.5425599605240945e-06, + "loss": 1.7733, + "step": 81650 + }, + { + "epoch": 2.8635789916555643, + "grad_norm": 5.496183395385742, + "learning_rate": 2.5263280915217704e-06, + "loss": 1.6426, + "step": 81675 + }, + { + "epoch": 2.86445550802889, + "grad_norm": 7.51998233795166, + "learning_rate": 2.510096222519446e-06, + "loss": 
1.348, + "step": 81700 + }, + { + "epoch": 2.865332024402216, + "grad_norm": 10.542284965515137, + "learning_rate": 2.4938643535171214e-06, + "loss": 1.2864, + "step": 81725 + }, + { + "epoch": 2.8662085407755415, + "grad_norm": 6.630239963531494, + "learning_rate": 2.4776324845147973e-06, + "loss": 1.3784, + "step": 81750 + }, + { + "epoch": 2.8670850571488673, + "grad_norm": 5.702381610870361, + "learning_rate": 2.4614006155124728e-06, + "loss": 1.5418, + "step": 81775 + }, + { + "epoch": 2.8679615735221935, + "grad_norm": 10.695563316345215, + "learning_rate": 2.4451687465101483e-06, + "loss": 1.2647, + "step": 81800 + }, + { + "epoch": 2.8688380898955192, + "grad_norm": 10.111488342285156, + "learning_rate": 2.4289368775078238e-06, + "loss": 1.3915, + "step": 81825 + }, + { + "epoch": 2.869714606268845, + "grad_norm": 6.803624629974365, + "learning_rate": 2.4127050085054997e-06, + "loss": 1.5603, + "step": 81850 + }, + { + "epoch": 2.870591122642171, + "grad_norm": 7.840823650360107, + "learning_rate": 2.396473139503175e-06, + "loss": 1.5207, + "step": 81875 + }, + { + "epoch": 2.871467639015497, + "grad_norm": 3.760937452316284, + "learning_rate": 2.3802412705008507e-06, + "loss": 1.3783, + "step": 81900 + }, + { + "epoch": 2.8723441553888227, + "grad_norm": 7.564725875854492, + "learning_rate": 2.364009401498526e-06, + "loss": 1.5353, + "step": 81925 + }, + { + "epoch": 2.8732206717621485, + "grad_norm": 7.278017997741699, + "learning_rate": 2.347777532496202e-06, + "loss": 1.4097, + "step": 81950 + }, + { + "epoch": 2.874097188135474, + "grad_norm": 6.836549282073975, + "learning_rate": 2.3315456634938776e-06, + "loss": 1.2513, + "step": 81975 + }, + { + "epoch": 2.8749737045088004, + "grad_norm": 5.547390937805176, + "learning_rate": 2.315313794491553e-06, + "loss": 1.506, + "step": 82000 + }, + { + "epoch": 2.875850220882126, + "grad_norm": 7.882322788238525, + "learning_rate": 2.299081925489229e-06, + "loss": 1.44, + "step": 82025 + }, + { + "epoch": 2.876726737255452, + "grad_norm": 3.327530860900879, + "learning_rate": 2.2828500564869045e-06, + "loss": 1.2726, + "step": 82050 + }, + { + "epoch": 2.8776032536287777, + "grad_norm": 13.413652420043945, + "learning_rate": 2.26661818748458e-06, + "loss": 1.7182, + "step": 82075 + }, + { + "epoch": 2.8784797700021034, + "grad_norm": 3.8757758140563965, + "learning_rate": 2.2503863184822555e-06, + "loss": 1.2166, + "step": 82100 + }, + { + "epoch": 2.8793562863754296, + "grad_norm": 6.753551483154297, + "learning_rate": 2.234154449479931e-06, + "loss": 1.1602, + "step": 82125 + }, + { + "epoch": 2.8802328027487554, + "grad_norm": 6.49480676651001, + "learning_rate": 2.217922580477607e-06, + "loss": 1.3563, + "step": 82150 + }, + { + "epoch": 2.881109319122081, + "grad_norm": 4.793148994445801, + "learning_rate": 2.2016907114752824e-06, + "loss": 1.3259, + "step": 82175 + }, + { + "epoch": 2.8819858354954073, + "grad_norm": 0.04445815831422806, + "learning_rate": 2.185458842472958e-06, + "loss": 1.388, + "step": 82200 + }, + { + "epoch": 2.8828623518687326, + "grad_norm": 7.342043876647949, + "learning_rate": 2.1692269734706334e-06, + "loss": 1.3659, + "step": 82225 + }, + { + "epoch": 2.883738868242059, + "grad_norm": 6.30033540725708, + "learning_rate": 2.152995104468309e-06, + "loss": 1.2738, + "step": 82250 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 7.709619045257568, + "learning_rate": 2.1367632354659843e-06, + "loss": 1.2607, + "step": 82275 + }, + { + "epoch": 2.8854919009887103, + "grad_norm": 7.80535364151001, + 
"learning_rate": 2.1205313664636603e-06, + "loss": 1.3747, + "step": 82300 + }, + { + "epoch": 2.8863684173620365, + "grad_norm": 11.506998062133789, + "learning_rate": 2.1042994974613357e-06, + "loss": 1.5805, + "step": 82325 + }, + { + "epoch": 2.8872449337353623, + "grad_norm": 5.014824390411377, + "learning_rate": 2.0880676284590112e-06, + "loss": 1.5054, + "step": 82350 + }, + { + "epoch": 2.888121450108688, + "grad_norm": 6.634560585021973, + "learning_rate": 2.0718357594566867e-06, + "loss": 1.495, + "step": 82375 + }, + { + "epoch": 2.888997966482014, + "grad_norm": 10.682136535644531, + "learning_rate": 2.0556038904543626e-06, + "loss": 1.3876, + "step": 82400 + }, + { + "epoch": 2.8898744828553395, + "grad_norm": 4.950882911682129, + "learning_rate": 2.039372021452038e-06, + "loss": 1.7001, + "step": 82425 + }, + { + "epoch": 2.8907509992286657, + "grad_norm": 15.593979835510254, + "learning_rate": 2.0231401524497136e-06, + "loss": 1.536, + "step": 82450 + }, + { + "epoch": 2.8916275156019915, + "grad_norm": 4.711767196655273, + "learning_rate": 2.006908283447389e-06, + "loss": 1.3119, + "step": 82475 + }, + { + "epoch": 2.8925040319753172, + "grad_norm": 6.746140956878662, + "learning_rate": 1.990676414445065e-06, + "loss": 1.4047, + "step": 82500 + }, + { + "epoch": 2.893380548348643, + "grad_norm": 6.238670349121094, + "learning_rate": 1.9744445454427405e-06, + "loss": 1.4868, + "step": 82525 + }, + { + "epoch": 2.8942570647219688, + "grad_norm": 4.4912333488464355, + "learning_rate": 1.958212676440416e-06, + "loss": 1.2612, + "step": 82550 + }, + { + "epoch": 2.895133581095295, + "grad_norm": 6.691516876220703, + "learning_rate": 1.9419808074380915e-06, + "loss": 1.3816, + "step": 82575 + }, + { + "epoch": 2.8960100974686207, + "grad_norm": 6.834463119506836, + "learning_rate": 1.9257489384357674e-06, + "loss": 1.4328, + "step": 82600 + }, + { + "epoch": 2.8968866138419465, + "grad_norm": 6.966244697570801, + "learning_rate": 1.909517069433443e-06, + "loss": 1.4086, + "step": 82625 + }, + { + "epoch": 2.8977631302152727, + "grad_norm": 6.179404258728027, + "learning_rate": 1.8932852004311184e-06, + "loss": 1.6716, + "step": 82650 + }, + { + "epoch": 2.8986396465885984, + "grad_norm": 6.157968997955322, + "learning_rate": 1.8770533314287943e-06, + "loss": 1.2304, + "step": 82675 + }, + { + "epoch": 2.899516162961924, + "grad_norm": 7.863152503967285, + "learning_rate": 1.8608214624264698e-06, + "loss": 1.2792, + "step": 82700 + }, + { + "epoch": 2.90039267933525, + "grad_norm": 5.5484747886657715, + "learning_rate": 1.8445895934241453e-06, + "loss": 1.2132, + "step": 82725 + }, + { + "epoch": 2.9012691957085757, + "grad_norm": 3.317124843597412, + "learning_rate": 1.8283577244218208e-06, + "loss": 1.3939, + "step": 82750 + }, + { + "epoch": 2.902145712081902, + "grad_norm": 5.301036357879639, + "learning_rate": 1.8121258554194967e-06, + "loss": 1.4373, + "step": 82775 + }, + { + "epoch": 2.9030222284552276, + "grad_norm": 6.623536586761475, + "learning_rate": 1.7958939864171722e-06, + "loss": 1.2917, + "step": 82800 + }, + { + "epoch": 2.9038987448285534, + "grad_norm": 6.5122175216674805, + "learning_rate": 1.7796621174148477e-06, + "loss": 1.2983, + "step": 82825 + }, + { + "epoch": 2.904775261201879, + "grad_norm": 3.83125901222229, + "learning_rate": 1.7634302484125232e-06, + "loss": 1.3625, + "step": 82850 + }, + { + "epoch": 2.905651777575205, + "grad_norm": 5.136961936950684, + "learning_rate": 1.747198379410199e-06, + "loss": 1.339, + "step": 82875 + }, + { + "epoch": 
2.906528293948531, + "grad_norm": 5.2338032722473145, + "learning_rate": 1.7309665104078744e-06, + "loss": 1.3448, + "step": 82900 + }, + { + "epoch": 2.907404810321857, + "grad_norm": 4.6407952308654785, + "learning_rate": 1.71473464140555e-06, + "loss": 1.2479, + "step": 82925 + }, + { + "epoch": 2.9082813266951826, + "grad_norm": 5.999884128570557, + "learning_rate": 1.6985027724032258e-06, + "loss": 1.6388, + "step": 82950 + }, + { + "epoch": 2.9091578430685088, + "grad_norm": 5.32928466796875, + "learning_rate": 1.6822709034009013e-06, + "loss": 1.4097, + "step": 82975 + }, + { + "epoch": 2.9100343594418345, + "grad_norm": 4.838665962219238, + "learning_rate": 1.6660390343985768e-06, + "loss": 1.6315, + "step": 83000 + }, + { + "epoch": 2.9109108758151603, + "grad_norm": 6.9875006675720215, + "learning_rate": 1.6498071653962523e-06, + "loss": 1.2897, + "step": 83025 + }, + { + "epoch": 2.911787392188486, + "grad_norm": 0.05813458934426308, + "learning_rate": 1.6335752963939282e-06, + "loss": 1.4908, + "step": 83050 + }, + { + "epoch": 2.912663908561812, + "grad_norm": 6.930561065673828, + "learning_rate": 1.6173434273916037e-06, + "loss": 1.4492, + "step": 83075 + }, + { + "epoch": 2.913540424935138, + "grad_norm": 9.306961059570312, + "learning_rate": 1.6011115583892792e-06, + "loss": 1.3415, + "step": 83100 + }, + { + "epoch": 2.9144169413084637, + "grad_norm": 10.850668907165527, + "learning_rate": 1.5848796893869547e-06, + "loss": 1.6043, + "step": 83125 + }, + { + "epoch": 2.9152934576817895, + "grad_norm": 5.812777042388916, + "learning_rate": 1.5686478203846306e-06, + "loss": 1.193, + "step": 83150 + }, + { + "epoch": 2.9161699740551152, + "grad_norm": 7.04646635055542, + "learning_rate": 1.552415951382306e-06, + "loss": 1.5244, + "step": 83175 + }, + { + "epoch": 2.917046490428441, + "grad_norm": 5.351557731628418, + "learning_rate": 1.5361840823799818e-06, + "loss": 1.493, + "step": 83200 + }, + { + "epoch": 2.917923006801767, + "grad_norm": 3.335606813430786, + "learning_rate": 1.5199522133776573e-06, + "loss": 1.548, + "step": 83225 + }, + { + "epoch": 2.918799523175093, + "grad_norm": 12.96681022644043, + "learning_rate": 1.5037203443753328e-06, + "loss": 1.5412, + "step": 83250 + }, + { + "epoch": 2.9196760395484187, + "grad_norm": 7.92736291885376, + "learning_rate": 1.4874884753730085e-06, + "loss": 1.5259, + "step": 83275 + }, + { + "epoch": 2.9205525559217445, + "grad_norm": 8.63117790222168, + "learning_rate": 1.471256606370684e-06, + "loss": 1.4147, + "step": 83300 + }, + { + "epoch": 2.92142907229507, + "grad_norm": 6.835650444030762, + "learning_rate": 1.4550247373683595e-06, + "loss": 1.3965, + "step": 83325 + }, + { + "epoch": 2.9223055886683964, + "grad_norm": 9.403380393981934, + "learning_rate": 1.4387928683660352e-06, + "loss": 1.5433, + "step": 83350 + }, + { + "epoch": 2.923182105041722, + "grad_norm": 6.092621326446533, + "learning_rate": 1.4225609993637107e-06, + "loss": 1.4292, + "step": 83375 + }, + { + "epoch": 2.924058621415048, + "grad_norm": 5.058342456817627, + "learning_rate": 1.4063291303613864e-06, + "loss": 1.3239, + "step": 83400 + }, + { + "epoch": 2.924935137788374, + "grad_norm": 4.947695255279541, + "learning_rate": 1.3900972613590619e-06, + "loss": 1.3637, + "step": 83425 + }, + { + "epoch": 2.9258116541617, + "grad_norm": 5.050571918487549, + "learning_rate": 1.3738653923567376e-06, + "loss": 1.1296, + "step": 83450 + }, + { + "epoch": 2.9266881705350256, + "grad_norm": 5.5503315925598145, + "learning_rate": 1.357633523354413e-06, + 
"loss": 1.2361, + "step": 83475 + }, + { + "epoch": 2.9275646869083514, + "grad_norm": 5.762114524841309, + "learning_rate": 1.3414016543520888e-06, + "loss": 1.3368, + "step": 83500 + }, + { + "epoch": 2.928441203281677, + "grad_norm": 6.809871673583984, + "learning_rate": 1.3251697853497645e-06, + "loss": 1.4921, + "step": 83525 + }, + { + "epoch": 2.9293177196550033, + "grad_norm": 3.193204641342163, + "learning_rate": 1.30893791634744e-06, + "loss": 1.4083, + "step": 83550 + }, + { + "epoch": 2.930194236028329, + "grad_norm": 4.994760513305664, + "learning_rate": 1.2927060473451157e-06, + "loss": 1.8803, + "step": 83575 + }, + { + "epoch": 2.931070752401655, + "grad_norm": 4.923537254333496, + "learning_rate": 1.2764741783427912e-06, + "loss": 1.4467, + "step": 83600 + }, + { + "epoch": 2.9319472687749806, + "grad_norm": 7.576258659362793, + "learning_rate": 1.2602423093404669e-06, + "loss": 1.2592, + "step": 83625 + }, + { + "epoch": 2.9328237851483063, + "grad_norm": 6.293764114379883, + "learning_rate": 1.2440104403381424e-06, + "loss": 1.45, + "step": 83650 + }, + { + "epoch": 2.9337003015216325, + "grad_norm": 6.6587653160095215, + "learning_rate": 1.227778571335818e-06, + "loss": 1.2467, + "step": 83675 + }, + { + "epoch": 2.9345768178949583, + "grad_norm": 7.096979141235352, + "learning_rate": 1.2115467023334936e-06, + "loss": 1.3349, + "step": 83700 + }, + { + "epoch": 2.935453334268284, + "grad_norm": 5.47869348526001, + "learning_rate": 1.195314833331169e-06, + "loss": 1.3458, + "step": 83725 + }, + { + "epoch": 2.9363298506416102, + "grad_norm": 6.501884937286377, + "learning_rate": 1.1790829643288446e-06, + "loss": 1.5482, + "step": 83750 + }, + { + "epoch": 2.937206367014936, + "grad_norm": 5.655632019042969, + "learning_rate": 1.1628510953265203e-06, + "loss": 1.6172, + "step": 83775 + }, + { + "epoch": 2.9380828833882617, + "grad_norm": 6.5420122146606445, + "learning_rate": 1.146619226324196e-06, + "loss": 1.2777, + "step": 83800 + }, + { + "epoch": 2.9389593997615875, + "grad_norm": 5.139003276824951, + "learning_rate": 1.1303873573218715e-06, + "loss": 1.5141, + "step": 83825 + }, + { + "epoch": 2.9398359161349132, + "grad_norm": 11.761894226074219, + "learning_rate": 1.1141554883195472e-06, + "loss": 1.2653, + "step": 83850 + }, + { + "epoch": 2.9407124325082394, + "grad_norm": 9.426977157592773, + "learning_rate": 1.0979236193172227e-06, + "loss": 1.3008, + "step": 83875 + }, + { + "epoch": 2.941588948881565, + "grad_norm": 6.777964115142822, + "learning_rate": 1.0816917503148984e-06, + "loss": 1.3055, + "step": 83900 + }, + { + "epoch": 2.942465465254891, + "grad_norm": 6.450350761413574, + "learning_rate": 1.0654598813125738e-06, + "loss": 1.3347, + "step": 83925 + }, + { + "epoch": 2.9433419816282167, + "grad_norm": 7.376315116882324, + "learning_rate": 1.0492280123102496e-06, + "loss": 1.3295, + "step": 83950 + }, + { + "epoch": 2.9442184980015425, + "grad_norm": 5.449368476867676, + "learning_rate": 1.032996143307925e-06, + "loss": 1.3753, + "step": 83975 + }, + { + "epoch": 2.9450950143748686, + "grad_norm": 7.375312328338623, + "learning_rate": 1.0167642743056008e-06, + "loss": 1.3614, + "step": 84000 + }, + { + "epoch": 2.9459715307481944, + "grad_norm": 3.371840476989746, + "learning_rate": 1.0005324053032762e-06, + "loss": 1.3035, + "step": 84025 + }, + { + "epoch": 2.94684804712152, + "grad_norm": 10.109431266784668, + "learning_rate": 9.84300536300952e-07, + "loss": 1.3691, + "step": 84050 + }, + { + "epoch": 2.9477245634948464, + "grad_norm": 
3.580687999725342, + "learning_rate": 9.680686672986277e-07, + "loss": 1.4121, + "step": 84075 + }, + { + "epoch": 2.948601079868172, + "grad_norm": 6.638641357421875, + "learning_rate": 9.51836798296303e-07, + "loss": 1.2467, + "step": 84100 + }, + { + "epoch": 2.949477596241498, + "grad_norm": 3.4695615768432617, + "learning_rate": 9.356049292939787e-07, + "loss": 1.3755, + "step": 84125 + }, + { + "epoch": 2.9503541126148236, + "grad_norm": 6.767643928527832, + "learning_rate": 9.193730602916542e-07, + "loss": 1.233, + "step": 84150 + }, + { + "epoch": 2.9512306289881494, + "grad_norm": 5.364767551422119, + "learning_rate": 9.031411912893299e-07, + "loss": 1.4234, + "step": 84175 + }, + { + "epoch": 2.9521071453614756, + "grad_norm": 10.162002563476562, + "learning_rate": 8.869093222870054e-07, + "loss": 1.6887, + "step": 84200 + }, + { + "epoch": 2.9529836617348013, + "grad_norm": 10.419241905212402, + "learning_rate": 8.706774532846811e-07, + "loss": 1.5099, + "step": 84225 + }, + { + "epoch": 2.953860178108127, + "grad_norm": 5.041766166687012, + "learning_rate": 8.544455842823566e-07, + "loss": 1.4796, + "step": 84250 + }, + { + "epoch": 2.954736694481453, + "grad_norm": 0.05199576914310455, + "learning_rate": 8.382137152800322e-07, + "loss": 1.3224, + "step": 84275 + }, + { + "epoch": 2.9556132108547786, + "grad_norm": 10.651880264282227, + "learning_rate": 8.219818462777077e-07, + "loss": 1.4291, + "step": 84300 + }, + { + "epoch": 2.9564897272281048, + "grad_norm": 5.444519519805908, + "learning_rate": 8.057499772753834e-07, + "loss": 1.6066, + "step": 84325 + }, + { + "epoch": 2.9573662436014305, + "grad_norm": 5.551947593688965, + "learning_rate": 7.895181082730589e-07, + "loss": 1.3626, + "step": 84350 + }, + { + "epoch": 2.9582427599747563, + "grad_norm": 4.42881965637207, + "learning_rate": 7.732862392707346e-07, + "loss": 1.3537, + "step": 84375 + }, + { + "epoch": 2.959119276348082, + "grad_norm": 6.630386829376221, + "learning_rate": 7.570543702684102e-07, + "loss": 1.4471, + "step": 84400 + }, + { + "epoch": 2.959995792721408, + "grad_norm": 4.410088062286377, + "learning_rate": 7.408225012660858e-07, + "loss": 1.215, + "step": 84425 + }, + { + "epoch": 2.960872309094734, + "grad_norm": 7.3211469650268555, + "learning_rate": 7.245906322637614e-07, + "loss": 1.5231, + "step": 84450 + }, + { + "epoch": 2.9617488254680597, + "grad_norm": 5.893024921417236, + "learning_rate": 7.08358763261437e-07, + "loss": 1.2829, + "step": 84475 + }, + { + "epoch": 2.9626253418413855, + "grad_norm": 5.515042781829834, + "learning_rate": 6.921268942591125e-07, + "loss": 1.2006, + "step": 84500 + }, + { + "epoch": 2.9635018582147117, + "grad_norm": 6.88463830947876, + "learning_rate": 6.758950252567882e-07, + "loss": 1.2064, + "step": 84525 + }, + { + "epoch": 2.9643783745880374, + "grad_norm": 3.1688601970672607, + "learning_rate": 6.596631562544638e-07, + "loss": 1.3059, + "step": 84550 + }, + { + "epoch": 2.965254890961363, + "grad_norm": 6.859038352966309, + "learning_rate": 6.434312872521394e-07, + "loss": 1.3327, + "step": 84575 + }, + { + "epoch": 2.966131407334689, + "grad_norm": 6.476950168609619, + "learning_rate": 6.27199418249815e-07, + "loss": 1.2588, + "step": 84600 + }, + { + "epoch": 2.9670079237080147, + "grad_norm": 9.72080135345459, + "learning_rate": 6.109675492474906e-07, + "loss": 1.227, + "step": 84625 + }, + { + "epoch": 2.967884440081341, + "grad_norm": 6.773140907287598, + "learning_rate": 5.947356802451662e-07, + "loss": 1.4957, + "step": 84650 + }, + { + "epoch": 
2.9687609564546666, + "grad_norm": 6.282393932342529, + "learning_rate": 5.785038112428418e-07, + "loss": 1.3579, + "step": 84675 + }, + { + "epoch": 2.9696374728279924, + "grad_norm": 3.422999143600464, + "learning_rate": 5.622719422405174e-07, + "loss": 1.5381, + "step": 84700 + }, + { + "epoch": 2.970513989201318, + "grad_norm": 9.627105712890625, + "learning_rate": 5.460400732381929e-07, + "loss": 1.4498, + "step": 84725 + }, + { + "epoch": 2.971390505574644, + "grad_norm": 7.481487274169922, + "learning_rate": 5.298082042358685e-07, + "loss": 1.7276, + "step": 84750 + }, + { + "epoch": 2.97226702194797, + "grad_norm": 3.839251756668091, + "learning_rate": 5.135763352335441e-07, + "loss": 1.3805, + "step": 84775 + }, + { + "epoch": 2.973143538321296, + "grad_norm": 3.056807279586792, + "learning_rate": 4.973444662312197e-07, + "loss": 1.2578, + "step": 84800 + }, + { + "epoch": 2.9740200546946216, + "grad_norm": 3.617598295211792, + "learning_rate": 4.811125972288954e-07, + "loss": 1.3127, + "step": 84825 + }, + { + "epoch": 2.974896571067948, + "grad_norm": 4.980025291442871, + "learning_rate": 4.64880728226571e-07, + "loss": 1.2466, + "step": 84850 + }, + { + "epoch": 2.9757730874412736, + "grad_norm": 5.3992180824279785, + "learning_rate": 4.4864885922424655e-07, + "loss": 1.5291, + "step": 84875 + }, + { + "epoch": 2.9766496038145993, + "grad_norm": 5.324915409088135, + "learning_rate": 4.3241699022192215e-07, + "loss": 1.2586, + "step": 84900 + }, + { + "epoch": 2.977526120187925, + "grad_norm": 5.639527797698975, + "learning_rate": 4.1618512121959774e-07, + "loss": 1.3663, + "step": 84925 + }, + { + "epoch": 2.978402636561251, + "grad_norm": 5.36099910736084, + "learning_rate": 3.9995325221727334e-07, + "loss": 1.2608, + "step": 84950 + }, + { + "epoch": 2.979279152934577, + "grad_norm": 13.200787544250488, + "learning_rate": 3.8372138321494894e-07, + "loss": 1.3804, + "step": 84975 + }, + { + "epoch": 2.9801556693079028, + "grad_norm": 3.1735517978668213, + "learning_rate": 3.674895142126245e-07, + "loss": 1.4364, + "step": 85000 + }, + { + "epoch": 2.9810321856812285, + "grad_norm": 3.4145123958587646, + "learning_rate": 3.512576452103001e-07, + "loss": 1.2349, + "step": 85025 + }, + { + "epoch": 2.9819087020545543, + "grad_norm": 7.469667434692383, + "learning_rate": 3.3502577620797574e-07, + "loss": 1.3728, + "step": 85050 + }, + { + "epoch": 2.98278521842788, + "grad_norm": 7.8860673904418945, + "learning_rate": 3.1879390720565134e-07, + "loss": 1.2518, + "step": 85075 + }, + { + "epoch": 2.9836617348012062, + "grad_norm": 4.802858352661133, + "learning_rate": 3.025620382033269e-07, + "loss": 1.4206, + "step": 85100 + }, + { + "epoch": 2.984538251174532, + "grad_norm": 7.24426794052124, + "learning_rate": 2.863301692010025e-07, + "loss": 1.213, + "step": 85125 + }, + { + "epoch": 2.9854147675478577, + "grad_norm": 7.422196388244629, + "learning_rate": 2.700983001986781e-07, + "loss": 1.3573, + "step": 85150 + }, + { + "epoch": 2.9862912839211835, + "grad_norm": 4.947884559631348, + "learning_rate": 2.538664311963537e-07, + "loss": 1.4392, + "step": 85175 + }, + { + "epoch": 2.9871678002945092, + "grad_norm": 12.918807029724121, + "learning_rate": 2.376345621940293e-07, + "loss": 1.4336, + "step": 85200 + }, + { + "epoch": 2.9880443166678354, + "grad_norm": 7.2936015129089355, + "learning_rate": 2.2140269319170488e-07, + "loss": 1.2962, + "step": 85225 + }, + { + "epoch": 2.988920833041161, + "grad_norm": 4.885268688201904, + "learning_rate": 2.0517082418938047e-07, + "loss": 
1.0996, + "step": 85250 + }, + { + "epoch": 2.989797349414487, + "grad_norm": 3.8588790893554688, + "learning_rate": 1.8893895518705607e-07, + "loss": 1.356, + "step": 85275 + }, + { + "epoch": 2.990673865787813, + "grad_norm": 5.2050275802612305, + "learning_rate": 1.7270708618473167e-07, + "loss": 1.3532, + "step": 85300 + }, + { + "epoch": 2.991550382161139, + "grad_norm": 8.816625595092773, + "learning_rate": 1.5647521718240727e-07, + "loss": 1.2285, + "step": 85325 + }, + { + "epoch": 2.9924268985344646, + "grad_norm": 5.454915523529053, + "learning_rate": 1.4024334818008284e-07, + "loss": 1.1729, + "step": 85350 + }, + { + "epoch": 2.9933034149077904, + "grad_norm": 3.153261423110962, + "learning_rate": 1.2401147917775847e-07, + "loss": 1.2619, + "step": 85375 + }, + { + "epoch": 2.994179931281116, + "grad_norm": 9.11295223236084, + "learning_rate": 1.0777961017543404e-07, + "loss": 1.6805, + "step": 85400 + }, + { + "epoch": 2.9950564476544423, + "grad_norm": 9.054948806762695, + "learning_rate": 9.154774117310965e-08, + "loss": 1.2365, + "step": 85425 + }, + { + "epoch": 2.995932964027768, + "grad_norm": 8.385968208312988, + "learning_rate": 7.531587217078524e-08, + "loss": 1.2888, + "step": 85450 + }, + { + "epoch": 2.996809480401094, + "grad_norm": 9.218907356262207, + "learning_rate": 5.9084003168460837e-08, + "loss": 1.3345, + "step": 85475 + }, + { + "epoch": 2.9976859967744196, + "grad_norm": 3.659796953201294, + "learning_rate": 4.285213416613643e-08, + "loss": 1.6295, + "step": 85500 + }, + { + "epoch": 2.9985625131477454, + "grad_norm": 5.246033668518066, + "learning_rate": 2.662026516381202e-08, + "loss": 1.362, + "step": 85525 + }, + { + "epoch": 2.9994390295210716, + "grad_norm": 4.123128414154053, + "learning_rate": 1.038839616148762e-08, + "loss": 1.4338, + "step": 85550 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.4551574223406493, + "eval_f1_macro": 0.21842933805832918, + "eval_f1_micro": 0.4551574223406493, + "eval_f1_weighted": 0.306703002026862, + "eval_loss": 1.3794080018997192, + "eval_precision_macro": 0.19546905037281545, + "eval_precision_micro": 0.4551574223406493, + "eval_precision_weighted": 0.2510467302490216, + "eval_recall_macro": 0.2811753463927377, + "eval_recall_micro": 0.4551574223406493, + "eval_recall_weighted": 0.4551574223406493, + "eval_runtime": 3145.4166, + "eval_samples_per_second": 4.534, + "eval_steps_per_second": 1.134, + "step": 85566 + } + ], + "logging_steps": 25, + "max_steps": 85566, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.12569882852672e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}
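
Note (editor): the block above records the run's configuration (3 epochs, per-device train batch size 2, logging every 25 steps, checkpoints every 500 steps, early stopping with patience 5 and threshold 0.01, best checkpoint selected by eval_loss). What follows is a minimal, hypothetical sketch of a transformers Trainer setup that could emit a trainer_state.json of this shape -- it is not the author's actual training script. The base model name, label count, and the train/eval dataset objects are placeholders; only the numeric settings are taken from the state file itself.

# sketch.py -- hedged reconstruction, assuming the run used the standard
# Hugging Face Trainer API; names marked "placeholder" are not recorded
# in the state file and are invented here for illustration only.
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

model_name = "bert-large-uncased"  # placeholder; the actual base model is not recorded in trainer_state.json
num_labels = 7                      # placeholder; the label count is not recorded either

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

args = TrainingArguments(
    output_dir="address-large-text-classifier",  # matches best_model_checkpoint's parent directory
    num_train_epochs=3,                          # num_train_epochs in the state file
    per_device_train_batch_size=2,               # train_batch_size in the state file
    logging_steps=25,                            # logging_steps in the state file
    save_steps=500,                              # save_steps in the state file
    eval_strategy="epoch",                       # eval/best-checkpoint entries appear at epoch boundaries;
                                                 # older transformers versions call this evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",           # best_metric equals the final eval_loss
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,  # assumed: tokenized datasets prepared elsewhere
    eval_dataset=eval_dataset,
    callbacks=[
        # patience and threshold match the EarlyStoppingCallback args recorded above
        EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.01)
    ],
)

trainer.train()  # writes checkpoint-*/trainer_state.json files like the one shown in this diff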